summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
commit57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree5e910f0e82173f4ef4f51111366a3f1299037a7b /fs/btrfs
Initial import
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig91
-rw-r--r--fs/btrfs/Makefile19
-rw-r--r--fs/btrfs/acl.c166
-rw-r--r--fs/btrfs/async-thread.c365
-rw-r--r--fs/btrfs/async-thread.h81
-rw-r--r--fs/btrfs/backref.c1978
-rw-r--r--fs/btrfs/backref.h77
-rw-r--r--fs/btrfs/btrfs_inode.h328
-rw-r--r--fs/btrfs/check-integrity.c3245
-rw-r--r--fs/btrfs/check-integrity.h38
-rw-r--r--fs/btrfs/compression.c1091
-rw-r--r--fs/btrfs/compression.h83
-rw-r--r--fs/btrfs/ctree.c5910
-rw-r--r--fs/btrfs/ctree.h4247
-rw-r--r--fs/btrfs/delayed-inode.c1997
-rw-r--r--fs/btrfs/delayed-inode.h156
-rw-r--r--fs/btrfs/delayed-ref.c962
-rw-r--r--fs/btrfs/delayed-ref.h276
-rw-r--r--fs/btrfs/dev-replace.c937
-rw-r--r--fs/btrfs/dev-replace.h44
-rw-r--r--fs/btrfs/dir-item.c482
-rw-r--r--fs/btrfs/disk-io.c4338
-rw-r--r--fs/btrfs/disk-io.h159
-rw-r--r--fs/btrfs/export.c304
-rw-r--r--fs/btrfs/export.h19
-rw-r--r--fs/btrfs/extent-tree.c10174
-rw-r--r--fs/btrfs/extent_io.c5632
-rw-r--r--fs/btrfs/extent_io.h382
-rw-r--r--fs/btrfs/extent_map.c455
-rw-r--r--fs/btrfs/extent_map.h85
-rw-r--r--fs/btrfs/file-item.c953
-rw-r--r--fs/btrfs/file.c2860
-rw-r--r--fs/btrfs/free-space-cache.c3653
-rw-r--r--fs/btrfs/free-space-cache.h133
-rw-r--r--fs/btrfs/hash.c46
-rw-r--r--fs/btrfs/hash.h42
-rw-r--r--fs/btrfs/inode-item.c440
-rw-r--r--fs/btrfs/inode-map.c575
-rw-r--r--fs/btrfs/inode-map.h13
-rw-r--r--fs/btrfs/inode.c9907
-rw-r--r--fs/btrfs/ioctl.c5373
-rw-r--r--fs/btrfs/locking.c300
-rw-r--r--fs/btrfs/locking.h62
-rw-r--r--fs/btrfs/lzo.c443
-rw-r--r--fs/btrfs/math.h42
-rw-r--r--fs/btrfs/ordered-data.c1051
-rw-r--r--fs/btrfs/ordered-data.h212
-rw-r--r--fs/btrfs/orphan.c71
-rw-r--r--fs/btrfs/print-tree.c349
-rw-r--r--fs/btrfs/print-tree.h23
-rw-r--r--fs/btrfs/props.c429
-rw-r--r--fs/btrfs/props.h42
-rw-r--r--fs/btrfs/qgroup.c2966
-rw-r--r--fs/btrfs/qgroup.h107
-rw-r--r--fs/btrfs/raid56.c2670
-rw-r--r--fs/btrfs/raid56.h62
-rw-r--r--fs/btrfs/rcu-string.h56
-rw-r--r--fs/btrfs/reada.c992
-rw-r--r--fs/btrfs/relocation.c4658
-rw-r--r--fs/btrfs/root-tree.c497
-rw-r--r--fs/btrfs/scrub.c4228
-rw-r--r--fs/btrfs/send.c5978
-rw-r--r--fs/btrfs/send.h134
-rw-r--r--fs/btrfs/struct-funcs.c142
-rw-r--r--fs/btrfs/super.c2220
-rw-r--r--fs/btrfs/sysfs.c758
-rw-r--r--fs/btrfs/sysfs.h89
-rw-r--r--fs/btrfs/tests/btrfs-tests.c171
-rw-r--r--fs/btrfs/tests/btrfs-tests.h68
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c229
-rw-r--r--fs/btrfs/tests/extent-io-tests.c275
-rw-r--r--fs/btrfs/tests/free-space-tests.c909
-rw-r--r--fs/btrfs/tests/inode-tests.c1123
-rw-r--r--fs/btrfs/tests/qgroup-tests.c469
-rw-r--r--fs/btrfs/transaction.c2204
-rw-r--r--fs/btrfs/transaction.h194
-rw-r--r--fs/btrfs/tree-defrag.c139
-rw-r--r--fs/btrfs/tree-log.c5093
-rw-r--r--fs/btrfs/tree-log.h85
-rw-r--r--fs/btrfs/ulist.c251
-rw-r--r--fs/btrfs/ulist.h80
-rw-r--r--fs/btrfs/uuid-tree.c354
-rw-r--r--fs/btrfs/volumes.c6730
-rw-r--r--fs/btrfs/volumes.h541
-rw-r--r--fs/btrfs/xattr.c517
-rw-r--r--fs/btrfs/xattr.h41
-rw-r--r--fs/btrfs/zlib.c412
87 files changed, 116582 insertions, 0 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
new file mode 100644
index 000000000..80e9c18ea
--- /dev/null
+++ b/fs/btrfs/Kconfig
@@ -0,0 +1,91 @@
+config BTRFS_FS
+ tristate "Btrfs filesystem support"
+ select CRYPTO
+ select CRYPTO_CRC32C
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ select RAID6_PQ
+ select XOR_BLOCKS
+ select SRCU
+
+ help
+ Btrfs is a general purpose copy-on-write filesystem with extents,
+ writable snapshotting, support for multiple devices and many more
+ features focused on fault tolerance, repair and easy administration.
+
+ The filesystem disk format is no longer unstable, and it's not
+ expected to change unless there are strong reasons to do so. If there
+ is a format change, file systems with a unchanged format will
+ continue to be mountable and usable by newer kernels.
+
+ For more information, please see the web pages at
+ http://btrfs.wiki.kernel.org.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called btrfs.
+
+ If unsure, say N.
+
+config BTRFS_FS_POSIX_ACL
+ bool "Btrfs POSIX Access Control Lists"
+ depends on BTRFS_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+ bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+ depends on BTRFS_FS
+ help
+ Adds code that examines all block write requests (including
+ writes of the super block). The goal is to verify that the
+ state of the filesystem on disk is always consistent, i.e.,
+ after a power-loss or kernel panic event the filesystem is
+ in a consistent state.
+
+ If the integrity check tool is included and activated in
+ the mount options, plenty of kernel memory is used, and
+ plenty of additional CPU cycles are spent. Enabling this
+ functionality is not intended for normal use.
+
+ In most cases, unless you are a btrfs developer who needs
+ to verify the integrity of (super)-block write requests
+ during the run of a regression test, say N
+
+config BTRFS_FS_RUN_SANITY_TESTS
+ bool "Btrfs will run sanity tests upon loading"
+ depends on BTRFS_FS
+ help
+ This will run some basic sanity tests on the free space cache
+ code to make sure it is acting as it should. These are mostly
+ regression tests and are only really interesting to btrfs
+ developers.
+
+ If unsure, say N.
+
+config BTRFS_DEBUG
+ bool "Btrfs debugging support"
+ depends on BTRFS_FS
+ help
+ Enable run-time debugging support for the btrfs filesystem. This may
+ enable additional and expensive checks with negative impact on
+ performance, or export extra information via sysfs.
+
+ If unsure, say N.
+
+config BTRFS_ASSERT
+ bool "Btrfs assert support"
+ depends on BTRFS_FS
+ help
+ Enable run-time assertion checking. This will result in panics if
+ any of the assertions trip. This is meant for btrfs developers only.
+
+ If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000..6d1d0b93b
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,19 @@
+
+obj-$(CONFIG_BTRFS_FS) := btrfs.o
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+ file-item.o inode-item.o inode-map.o disk-io.o \
+ transaction.o inode.o file.o tree-defrag.o \
+ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+ export.o tree-log.o free-space-cache.o zlib.o lzo.o \
+ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+ uuid-tree.o props.o hash.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
+ tests/extent-buffer-tests.o tests/btrfs-tests.o \
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000..9a0124a95
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "xattr.h"
+
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+{
+ int size;
+ const char *name;
+ char *value = NULL;
+ struct posix_acl *acl;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+
+ size = __btrfs_getxattr(inode, name, "", 0);
+ if (size > 0) {
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = __btrfs_getxattr(inode, name, value, size);
+ }
+ if (size > 0) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ } else if (size == -ENOENT || size == -ENODATA || size == 0) {
+ /* FIXME, who returns -ENOENT? I think nobody */
+ acl = NULL;
+ } else {
+ acl = ERR_PTR(-EIO);
+ }
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+/*
+ * Needs to be called with fs_mutex held
+ */
+static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl, int type)
+{
+ int ret, size = 0;
+ const char *name;
+ char *value = NULL;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ acl = NULL;
+ }
+ ret = 0;
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EINVAL : 0;
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_NOFS);
+ if (!value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
+out:
+ kfree(value);
+
+ if (!ret)
+ set_cached_acl(inode, type, acl);
+
+ return ret;
+}
+
+int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ return __btrfs_set_acl(NULL, inode, acl, type);
+}
+
+/*
+ * btrfs_init_acl is already generally called under fs_mutex, so the locking
+ * stuff has been fixed to work with that. If the locking stuff changes, we
+ * need to re-evaluate the acl locking stuff.
+ */
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *default_acl, *acl;
+ int ret = 0;
+
+ /* this happens with subvols */
+ if (!dir)
+ return 0;
+
+ ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (ret)
+ return ret;
+
+ if (default_acl) {
+ ret = __btrfs_set_acl(trans, inode, default_acl,
+ ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
+ }
+
+ if (acl) {
+ if (!ret)
+ ret = __btrfs_set_acl(trans, inode, acl,
+ ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
+
+ if (!default_acl && !acl)
+ cache_no_acl(inode);
+ return ret;
+}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000..df9932b00
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/freezer.h>
+#include "async-thread.h"
+#include "ctree.h"
+
+#define WORK_DONE_BIT 0
+#define WORK_ORDER_DONE_BIT 1
+#define WORK_HIGH_PRIO_BIT 2
+
+#define NO_THRESHOLD (-1)
+#define DFT_THRESHOLD (32)
+
+struct __btrfs_workqueue {
+ struct workqueue_struct *normal_wq;
+ /* List head pointing to ordered work list */
+ struct list_head ordered_list;
+
+ /* Spinlock for ordered_list */
+ spinlock_t list_lock;
+
+ /* Thresholding related variants */
+ atomic_t pending;
+ int max_active;
+ int current_max;
+ int thresh;
+ unsigned int count;
+ spinlock_t thres_lock;
+};
+
+struct btrfs_workqueue {
+ struct __btrfs_workqueue *normal;
+ struct __btrfs_workqueue *high;
+};
+
+static void normal_work_helper(struct btrfs_work *work);
+
+#define BTRFS_WORK_HELPER(name) \
+void btrfs_##name(struct work_struct *arg) \
+{ \
+ struct btrfs_work *work = container_of(arg, struct btrfs_work, \
+ normal_work); \
+ normal_work_helper(work); \
+}
+
+BTRFS_WORK_HELPER(worker_helper);
+BTRFS_WORK_HELPER(delalloc_helper);
+BTRFS_WORK_HELPER(flush_delalloc_helper);
+BTRFS_WORK_HELPER(cache_helper);
+BTRFS_WORK_HELPER(submit_helper);
+BTRFS_WORK_HELPER(fixup_helper);
+BTRFS_WORK_HELPER(endio_helper);
+BTRFS_WORK_HELPER(endio_meta_helper);
+BTRFS_WORK_HELPER(endio_meta_write_helper);
+BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
+BTRFS_WORK_HELPER(rmw_helper);
+BTRFS_WORK_HELPER(endio_write_helper);
+BTRFS_WORK_HELPER(freespace_write_helper);
+BTRFS_WORK_HELPER(delayed_meta_helper);
+BTRFS_WORK_HELPER(readahead_helper);
+BTRFS_WORK_HELPER(qgroup_rescan_helper);
+BTRFS_WORK_HELPER(extent_refs_helper);
+BTRFS_WORK_HELPER(scrub_helper);
+BTRFS_WORK_HELPER(scrubwrc_helper);
+BTRFS_WORK_HELPER(scrubnc_helper);
+
+static struct __btrfs_workqueue *
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
+ int thresh)
+{
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+
+ if (!ret)
+ return NULL;
+
+ ret->max_active = max_active;
+ atomic_set(&ret->pending, 0);
+ if (thresh == 0)
+ thresh = DFT_THRESHOLD;
+ /* For low threshold, disabling threshold is a better choice */
+ if (thresh < DFT_THRESHOLD) {
+ ret->current_max = max_active;
+ ret->thresh = NO_THRESHOLD;
+ } else {
+ ret->current_max = 1;
+ ret->thresh = thresh;
+ }
+
+ if (flags & WQ_HIGHPRI)
+ ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
+ ret->max_active,
+ "btrfs", name);
+ else
+ ret->normal_wq = alloc_workqueue("%s-%s", flags,
+ ret->max_active, "btrfs",
+ name);
+ if (!ret->normal_wq) {
+ kfree(ret);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&ret->ordered_list);
+ spin_lock_init(&ret->list_lock);
+ spin_lock_init(&ret->thres_lock);
+ trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
+ return ret;
+}
+
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
+
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ unsigned int flags,
+ int max_active,
+ int thresh)
+{
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+
+ if (!ret)
+ return NULL;
+
+ ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+ max_active, thresh);
+ if (!ret->normal) {
+ kfree(ret);
+ return NULL;
+ }
+
+ if (flags & WQ_HIGHPRI) {
+ ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+ thresh);
+ if (!ret->high) {
+ __btrfs_destroy_workqueue(ret->normal);
+ kfree(ret);
+ return NULL;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Hook for threshold which will be called in btrfs_queue_work.
+ * This hook WILL be called in IRQ handler context,
+ * so workqueue_set_max_active MUST NOT be called in this hook
+ */
+static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
+{
+ if (wq->thresh == NO_THRESHOLD)
+ return;
+ atomic_inc(&wq->pending);
+}
+
+/*
+ * Hook for threshold which will be called before executing the work,
+ * This hook is called in kthread content.
+ * So workqueue_set_max_active is called here.
+ */
+static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
+{
+ int new_max_active;
+ long pending;
+ int need_change = 0;
+
+ if (wq->thresh == NO_THRESHOLD)
+ return;
+
+ atomic_dec(&wq->pending);
+ spin_lock(&wq->thres_lock);
+ /*
+ * Use wq->count to limit the calling frequency of
+ * workqueue_set_max_active.
+ */
+ wq->count++;
+ wq->count %= (wq->thresh / 4);
+ if (!wq->count)
+ goto out;
+ new_max_active = wq->current_max;
+
+ /*
+ * pending may be changed later, but it's OK since we really
+ * don't need it so accurate to calculate new_max_active.
+ */
+ pending = atomic_read(&wq->pending);
+ if (pending > wq->thresh)
+ new_max_active++;
+ if (pending < wq->thresh / 2)
+ new_max_active--;
+ new_max_active = clamp_val(new_max_active, 1, wq->max_active);
+ if (new_max_active != wq->current_max) {
+ need_change = 1;
+ wq->current_max = new_max_active;
+ }
+out:
+ spin_unlock(&wq->thres_lock);
+
+ if (need_change) {
+ workqueue_set_max_active(wq->normal_wq, wq->current_max);
+ }
+}
+
+static void run_ordered_work(struct __btrfs_workqueue *wq)
+{
+ struct list_head *list = &wq->ordered_list;
+ struct btrfs_work *work;
+ spinlock_t *lock = &wq->list_lock;
+ unsigned long flags;
+
+ while (1) {
+ spin_lock_irqsave(lock, flags);
+ if (list_empty(list))
+ break;
+ work = list_entry(list->next, struct btrfs_work,
+ ordered_list);
+ if (!test_bit(WORK_DONE_BIT, &work->flags))
+ break;
+
+ /*
+ * we are going to call the ordered done function, but
+ * we leave the work item on the list as a barrier so
+ * that later work items that are done don't have their
+ * functions called before this one returns
+ */
+ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
+ break;
+ trace_btrfs_ordered_sched(work);
+ spin_unlock_irqrestore(lock, flags);
+ work->ordered_func(work);
+
+ /* now take the lock again and drop our item from the list */
+ spin_lock_irqsave(lock, flags);
+ list_del(&work->ordered_list);
+ spin_unlock_irqrestore(lock, flags);
+
+ /*
+ * we don't want to call the ordered free functions
+ * with the lock held though
+ */
+ work->ordered_free(work);
+ trace_btrfs_all_work_done(work);
+ }
+ spin_unlock_irqrestore(lock, flags);
+}
+
+static void normal_work_helper(struct btrfs_work *work)
+{
+ struct __btrfs_workqueue *wq;
+ int need_order = 0;
+
+ /*
+ * We should not touch things inside work in the following cases:
+ * 1) after work->func() if it has no ordered_free
+ * Since the struct is freed in work->func().
+ * 2) after setting WORK_DONE_BIT
+ * The work may be freed in other threads almost instantly.
+ * So we save the needed things here.
+ */
+ if (work->ordered_func)
+ need_order = 1;
+ wq = work->wq;
+
+ trace_btrfs_work_sched(work);
+ thresh_exec_hook(wq);
+ work->func(work);
+ if (need_order) {
+ set_bit(WORK_DONE_BIT, &work->flags);
+ run_ordered_work(wq);
+ }
+ if (!need_order)
+ trace_btrfs_all_work_done(work);
+}
+
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free)
+{
+ work->func = func;
+ work->ordered_func = ordered_func;
+ work->ordered_free = ordered_free;
+ INIT_WORK(&work->normal_work, uniq_func);
+ INIT_LIST_HEAD(&work->ordered_list);
+ work->flags = 0;
+}
+
+static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *work)
+{
+ unsigned long flags;
+
+ work->wq = wq;
+ thresh_queue_hook(wq);
+ if (work->ordered_func) {
+ spin_lock_irqsave(&wq->list_lock, flags);
+ list_add_tail(&work->ordered_list, &wq->ordered_list);
+ spin_unlock_irqrestore(&wq->list_lock, flags);
+ }
+ queue_work(wq->normal_wq, &work->normal_work);
+ trace_btrfs_work_queued(work);
+}
+
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work)
+{
+ struct __btrfs_workqueue *dest_wq;
+
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
+ dest_wq = wq->high;
+ else
+ dest_wq = wq->normal;
+ __btrfs_queue_work(dest_wq, work);
+}
+
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
+{
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
+ kfree(wq);
+}
+
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
+{
+ if (!wq)
+ return;
+ if (wq->high)
+ __btrfs_destroy_workqueue(wq->high);
+ __btrfs_destroy_workqueue(wq->normal);
+ kfree(wq);
+}
+
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
+{
+ if (!wq)
+ return;
+ wq->normal->max_active = max;
+ if (wq->high)
+ wq->high->max_active = max;
+}
+
+void btrfs_set_work_high_priority(struct btrfs_work *work)
+{
+ set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000..ec2ee477f
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ASYNC_THREAD_
+#define __BTRFS_ASYNC_THREAD_
+#include <linux/workqueue.h>
+
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_work_func_t)(struct work_struct *arg);
+
+struct btrfs_work {
+ btrfs_func_t func;
+ btrfs_func_t ordered_func;
+ btrfs_func_t ordered_free;
+
+ /* Don't touch things below */
+ struct work_struct normal_work;
+ struct list_head ordered_list;
+ struct __btrfs_workqueue *wq;
+ unsigned long flags;
+};
+
+#define BTRFS_WORK_HELPER_PROTO(name) \
+void btrfs_##name(struct work_struct *arg)
+
+BTRFS_WORK_HELPER_PROTO(worker_helper);
+BTRFS_WORK_HELPER_PROTO(delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(cache_helper);
+BTRFS_WORK_HELPER_PROTO(submit_helper);
+BTRFS_WORK_HELPER_PROTO(fixup_helper);
+BTRFS_WORK_HELPER_PROTO(endio_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
+BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
+BTRFS_WORK_HELPER_PROTO(rmw_helper);
+BTRFS_WORK_HELPER_PROTO(endio_write_helper);
+BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
+BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
+BTRFS_WORK_HELPER_PROTO(readahead_helper);
+BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
+BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
+BTRFS_WORK_HELPER_PROTO(scrub_helper);
+BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ unsigned int flags,
+ int max_active,
+ int thresh);
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
+#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000..614aaa196
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,1978 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
+#include "locking.h"
+
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
+struct extent_inode_elem {
+ u64 inum;
+ u64 offset;
+ struct extent_inode_elem *next;
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+ struct btrfs_file_extent_item *fi,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
+ u64 offset = 0;
+ struct extent_inode_elem *e;
+
+ if (!btrfs_file_extent_compression(eb, fi) &&
+ !btrfs_file_extent_encryption(eb, fi) &&
+ !btrfs_file_extent_other_encoding(eb, fi)) {
+ u64 data_offset;
+ u64 data_len;
+
+ data_offset = btrfs_file_extent_offset(eb, fi);
+ data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+ if (extent_item_pos < data_offset ||
+ extent_item_pos >= data_offset + data_len)
+ return 1;
+ offset = extent_item_pos - data_offset;
+ }
+
+ e = kmalloc(sizeof(*e), GFP_NOFS);
+ if (!e)
+ return -ENOMEM;
+
+ e->next = *eie;
+ e->inum = key->objectid;
+ e->offset = key->offset + offset;
+ *eie = e;
+
+ return 0;
+}
+
+static void free_inode_elem_list(struct extent_inode_elem *eie)
+{
+ struct extent_inode_elem *eie_next;
+
+ for (; eie; eie = eie_next) {
+ eie_next = eie->next;
+ kfree(eie);
+ }
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
+ u64 disk_byte;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int slot;
+ int nritems;
+ int extent_type;
+ int ret;
+
+ /*
+ * from the shared data ref, we only have the leaf but we need
+ * the key. thus, we must look into all items and see that we
+ * find one (some) with a reference to our extent item.
+ */
+ nritems = btrfs_header_nritems(eb);
+ for (slot = 0; slot < nritems; ++slot) {
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte != wanted_disk_byte)
+ continue;
+
+ ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
+ struct list_head list;
+ u64 root_id;
+ struct btrfs_key key_for_search;
+ int level;
+ int count;
+ struct extent_inode_elem *inode_list;
+ u64 parent;
+ u64 wanted_disk_byte;
+};
+
+static struct kmem_cache *btrfs_prelim_ref_cache;
+
+int __init btrfs_prelim_ref_init(void)
+{
+ btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
+ sizeof(struct __prelim_ref),
+ 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_prelim_ref_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_prelim_ref_exit(void)
+{
+ if (btrfs_prelim_ref_cache)
+ kmem_cache_destroy(btrfs_prelim_ref_cache);
+}
+
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ * block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | - | -
+ * key to resolve | - | y | y | y
+ * tree block logical | - | - | - | -
+ * root for resolving | y | y | y | y
+ *
+ * - column 1: we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | y | -
+ * key to resolve | - | - | - | y
+ * tree block logical | y | y | y | y
+ * root for resolving | - | y | y | y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2: we take the first key from the block to find the parent
+ * (see __add_missing_keys)
+ * - column 4: we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+ struct btrfs_key *key, int level,
+ u64 parent, u64 wanted_disk_byte, int count,
+ gfp_t gfp_mask)
+{
+ struct __prelim_ref *ref;
+
+ if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ return 0;
+
+ ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
+ if (!ref)
+ return -ENOMEM;
+
+ ref->root_id = root_id;
+ if (key)
+ ref->key_for_search = *key;
+ else
+ memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+
+ ref->inode_list = NULL;
+ ref->level = level;
+ ref->count = count;
+ ref->parent = parent;
+ ref->wanted_disk_byte = wanted_disk_byte;
+ list_add_tail(&ref->list, head);
+
+ return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+ struct ulist *parents, struct __prelim_ref *ref,
+ int level, u64 time_seq, const u64 *extent_item_pos,
+ u64 total_refs)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ struct btrfs_key *key_for_search = &ref->key_for_search;
+ struct btrfs_file_extent_item *fi;
+ struct extent_inode_elem *eie = NULL, *old = NULL;
+ u64 disk_byte;
+ u64 wanted_disk_byte = ref->wanted_disk_byte;
+ u64 count = 0;
+
+ if (level != 0) {
+ eb = path->nodes[level];
+ ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+ if (ret < 0)
+ return ret;
+ return 0;
+ }
+
+ /*
+ * We normally enter this function with the path already pointing to
+ * the first item to check. But sometimes, we may enter it with
+ * slot==nritems. In that case, go to the next leaf before we continue.
+ */
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+
+ while (!ret && count < total_refs) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+
+ if (key.objectid != key_for_search->objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+
+ if (disk_byte == wanted_disk_byte) {
+ eie = NULL;
+ old = NULL;
+ count++;
+ if (extent_item_pos) {
+ ret = check_extent_in_eb(&key, eb, fi,
+ *extent_item_pos,
+ &eie);
+ if (ret < 0)
+ break;
+ }
+ if (ret > 0)
+ goto next;
+ ret = ulist_add_merge_ptr(parents, eb->start,
+ eie, (void **)&old, GFP_NOFS);
+ if (ret < 0)
+ break;
+ if (!ret && extent_item_pos) {
+ while (old->next)
+ old = old->next;
+ old->next = eie;
+ }
+ eie = NULL;
+ }
+next:
+ ret = btrfs_next_old_item(root, path, time_seq);
+ }
+
+ if (ret > 0)
+ ret = 0;
+ else if (ret < 0)
+ free_inode_elem_list(eie);
+ return ret;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 time_seq,
+ struct __prelim_ref *ref,
+ struct ulist *parents,
+ const u64 *extent_item_pos, u64 total_refs)
+{
+ struct btrfs_root *root;
+ struct btrfs_key root_key;
+ struct extent_buffer *eb;
+ int ret = 0;
+ int root_level;
+ int level = ref->level;
+ int index;
+
+ root_key.objectid = ref->root_id;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ if (IS_ERR(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ if (path->search_commit_root)
+ root_level = btrfs_header_level(root->commit_root);
+ else
+ root_level = btrfs_old_root_level(root, time_seq);
+
+ if (root_level + 1 == level) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ goto out;
+ }
+
+ path->lowest_level = level;
+ ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+
+ /* root node has been locked, we can release @subvol_srcu safely here */
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+ "%d for key (%llu %u %llu)\n",
+ ref->root_id, level, ref->count, ret,
+ ref->key_for_search.objectid, ref->key_for_search.type,
+ ref->key_for_search.offset);
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[level];
+ while (!eb) {
+ if (WARN_ON(!level)) {
+ ret = 1;
+ goto out;
+ }
+ level--;
+ eb = path->nodes[level];
+ }
+
+ ret = add_all_parents(root, path, parents, ref, level, time_seq,
+ extent_item_pos, total_refs);
+out:
+ path->lowest_level = 0;
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 time_seq,
+ struct list_head *head,
+ const u64 *extent_item_pos, u64 total_refs,
+ u64 root_objectid)
+{
+ int err;
+ int ret = 0;
+ struct __prelim_ref *ref;
+ struct __prelim_ref *ref_safe;
+ struct __prelim_ref *new_ref;
+ struct ulist *parents;
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ parents = ulist_alloc(GFP_NOFS);
+ if (!parents)
+ return -ENOMEM;
+
+ /*
+ * _safe allows us to insert directly after the current item without
+ * iterating over the newly inserted items.
+ * we're also allowed to re-assign ref during iteration.
+ */
+ list_for_each_entry_safe(ref, ref_safe, head, list) {
+ if (ref->parent) /* already direct */
+ continue;
+ if (ref->count == 0)
+ continue;
+ if (root_objectid && ref->root_id != root_objectid) {
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
+ err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+ parents, extent_item_pos,
+ total_refs);
+ /*
+ * we can only tolerate ENOENT,otherwise,we should catch error
+ * and return directly.
+ */
+ if (err == -ENOENT) {
+ continue;
+ } else if (err) {
+ ret = err;
+ goto out;
+ }
+
+ /* we put the first parent into the ref at hand */
+ ULIST_ITER_INIT(&uiter);
+ node = ulist_next(parents, &uiter);
+ ref->parent = node ? node->val : 0;
+ ref->inode_list = node ?
+ (struct extent_inode_elem *)(uintptr_t)node->aux : NULL;
+
+ /* additional parents require new refs being added here */
+ while ((node = ulist_next(parents, &uiter))) {
+ new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
+ GFP_NOFS);
+ if (!new_ref) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(new_ref, ref, sizeof(*ref));
+ new_ref->parent = node->val;
+ new_ref->inode_list = (struct extent_inode_elem *)
+ (uintptr_t)node->aux;
+ list_add(&new_ref->list, &ref->list);
+ }
+ ulist_reinit(parents);
+ }
+out:
+ ulist_free(parents);
+ return ret;
+}
+
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+ struct __prelim_ref *ref2)
+{
+ if (ref1->level != ref2->level)
+ return 0;
+ if (ref1->root_id != ref2->root_id)
+ return 0;
+ if (ref1->key_for_search.type != ref2->key_for_search.type)
+ return 0;
+ if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+ return 0;
+ if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+ return 0;
+ if (ref1->parent != ref2->parent)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+ struct list_head *head)
+{
+ struct list_head *pos;
+ struct extent_buffer *eb;
+
+ list_for_each(pos, head) {
+ struct __prelim_ref *ref;
+ ref = list_entry(pos, struct __prelim_ref, list);
+
+ if (ref->parent)
+ continue;
+ if (ref->key_for_search.type)
+ continue;
+ BUG_ON(!ref->wanted_disk_byte);
+ eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+ 0);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
+ btrfs_tree_read_lock(eb);
+ if (btrfs_header_level(eb) == 0)
+ btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return 0;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ * additionally, we could even add a key range for the blocks we
+ * looked into to merge even more (-> replace unresolved refs by those
+ * having a parent).
+ * mode = 2: merge identical parents
+ */
+static void __merge_refs(struct list_head *head, int mode)
+{
+ struct list_head *pos1;
+
+ list_for_each(pos1, head) {
+ struct list_head *n2;
+ struct list_head *pos2;
+ struct __prelim_ref *ref1;
+
+ ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+ for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+ pos2 = n2, n2 = pos2->next) {
+ struct __prelim_ref *ref2;
+ struct __prelim_ref *xchg;
+ struct extent_inode_elem *eie;
+
+ ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+ if (mode == 1) {
+ if (!ref_for_same_block(ref1, ref2))
+ continue;
+ if (!ref1->parent && ref2->parent) {
+ xchg = ref1;
+ ref1 = ref2;
+ ref2 = xchg;
+ }
+ } else {
+ if (ref1->parent != ref2->parent)
+ continue;
+ }
+
+ eie = ref1->inode_list;
+ while (eie && eie->next)
+ eie = eie->next;
+ if (eie)
+ eie->next = ref2->inode_list;
+ else
+ ref1->inode_list = ref2->inode_list;
+ ref1->count += ref2->count;
+
+ list_del(&ref2->list);
+ kmem_cache_free(btrfs_prelim_ref_cache, ref2);
+ }
+
+ }
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+ struct list_head *prefs, u64 *total_refs,
+ u64 inum)
+{
+ struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+ struct rb_node *n = &head->node.rb_node;
+ struct btrfs_key key;
+ struct btrfs_key op_key = {0};
+ int sgn;
+ int ret = 0;
+
+ if (extent_op && extent_op->update_key)
+ btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
+
+ spin_lock(&head->lock);
+ n = rb_first(&head->ref_root);
+ while (n) {
+ struct btrfs_delayed_ref_node *node;
+ node = rb_entry(n, struct btrfs_delayed_ref_node,
+ rb_node);
+ n = rb_next(n);
+ if (node->seq > seq)
+ continue;
+
+ switch (node->action) {
+ case BTRFS_ADD_DELAYED_EXTENT:
+ case BTRFS_UPDATE_DELAYED_HEAD:
+ WARN_ON(1);
+ continue;
+ case BTRFS_ADD_DELAYED_REF:
+ sgn = 1;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ sgn = -1;
+ break;
+ default:
+ BUG_ON(1);
+ }
+ *total_refs += (node->ref_mod * sgn);
+ switch (node->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, &op_key,
+ ref->level + 1, 0, node->bytenr,
+ node->ref_mod * sgn, GFP_ATOMIC);
+ break;
+ }
+ case BTRFS_SHARED_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, NULL,
+ ref->level + 1, ref->parent,
+ node->bytenr,
+ node->ref_mod * sgn, GFP_ATOMIC);
+ break;
+ }
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+
+ /*
+ * Found a inum that doesn't match our known inum, we
+ * know it's shared.
+ */
+ if (inum && ref->objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+ node->bytenr,
+ node->ref_mod * sgn, GFP_ATOMIC);
+ break;
+ }
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+ ref->parent, node->bytenr,
+ node->ref_mod * sgn, GFP_ATOMIC);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ if (ret)
+ break;
+ }
+ spin_unlock(&head->lock);
+ return ret;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ int *info_level, struct list_head *prefs,
+ u64 *total_refs, u64 inum)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ unsigned long ptr;
+ unsigned long end;
+ struct btrfs_extent_item *ei;
+ u64 flags;
+ u64 item_size;
+
+ /*
+ * enumerate all inline refs
+ */
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+ *total_refs += btrfs_extent_refs(leaf, ei);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + item_size;
+
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)ptr;
+ *info_level = btrfs_tree_block_level(leaf, info);
+ ptr += sizeof(struct btrfs_tree_block_info);
+ BUG_ON(ptr > end);
+ } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+ *info_level = found_key.offset;
+ } else {
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+ }
+
+ while (ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ u64 offset;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+ offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+ switch (type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, NULL,
+ *info_level + 1, offset,
+ bytenr, 1, GFP_NOFS);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+ bytenr, count, GFP_NOFS);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, offset, NULL,
+ *info_level + 1, 0,
+ bytenr, 1, GFP_NOFS);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (inum && key.objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count, GFP_NOFS);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ if (ret)
+ return ret;
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+
+ return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ int info_level, struct list_head *prefs, u64 inum)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ while (1) {
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = 0;
+ break;
+ }
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.objectid != bytenr)
+ break;
+ if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+ continue;
+ if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+ break;
+
+ switch (key.type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, NULL,
+ info_level + 1, key.offset,
+ bytenr, 1, GFP_NOFS);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_shared_data_ref);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+ bytenr, count, GFP_NOFS);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, key.offset, NULL,
+ info_level + 1, 0,
+ bytenr, 1, GFP_NOFS);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_extent_data_ref);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+ if (inum && key.objectid != inum) {
+ ret = BACKREF_FOUND_SHARED;
+ break;
+ }
+
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count, GFP_NOFS);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ if (ret)
+ return ret;
+
+ }
+
+ return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * NOTE: This can return values > 0
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist *refs,
+ struct ulist *roots, const u64 *extent_item_pos,
+ u64 root_objectid, u64 inum)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_delayed_ref_root *delayed_refs = NULL;
+ struct btrfs_delayed_ref_head *head;
+ int info_level = 0;
+ int ret;
+ struct list_head prefs_delayed;
+ struct list_head prefs;
+ struct __prelim_ref *ref;
+ struct extent_inode_elem *eie = NULL;
+ u64 total_refs = 0;
+
+ INIT_LIST_HEAD(&prefs);
+ INIT_LIST_HEAD(&prefs_delayed);
+
+ key.objectid = bytenr;
+ key.offset = (u64)-1;
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ if (!trans) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
+
+ /*
+ * grab both a lock on the path and a lock on the delayed ref head.
+ * We need both to get a consistent picture of how the refs look
+ * at a specified point in time
+ */
+again:
+ head = NULL;
+
+ ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (trans && likely(trans->type != __TRANS_DUMMY)) {
+#else
+ if (trans) {
+#endif
+ /*
+ * look if there are updates for this ref queued and lock the
+ * head
+ */
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's
+ * released and try again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ goto again;
+ }
+ spin_unlock(&delayed_refs->lock);
+ ret = __add_delayed_refs(head, time_seq,
+ &prefs_delayed, &total_refs,
+ inum);
+ mutex_unlock(&head->mutex);
+ if (ret)
+ goto out;
+ } else {
+ spin_unlock(&delayed_refs->lock);
+ }
+ }
+
+ if (path->slots[0]) {
+ struct extent_buffer *leaf;
+ int slot;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid == bytenr &&
+ (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY)) {
+ ret = __add_inline_refs(fs_info, path, bytenr,
+ &info_level, &prefs,
+ &total_refs, inum);
+ if (ret)
+ goto out;
+ ret = __add_keyed_refs(fs_info, path, bytenr,
+ info_level, &prefs, inum);
+ if (ret)
+ goto out;
+ }
+ }
+ btrfs_release_path(path);
+
+ list_splice_init(&prefs_delayed, &prefs);
+
+ ret = __add_missing_keys(fs_info, &prefs);
+ if (ret)
+ goto out;
+
+ __merge_refs(&prefs, 1);
+
+ ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+ extent_item_pos, total_refs,
+ root_objectid);
+ if (ret)
+ goto out;
+
+ __merge_refs(&prefs, 2);
+
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ WARN_ON(ref->count < 0);
+ if (roots && ref->count && ref->root_id && ref->parent == 0) {
+ if (root_objectid && ref->root_id != root_objectid) {
+ ret = BACKREF_FOUND_SHARED;
+ goto out;
+ }
+
+ /* no parent == root of tree */
+ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ }
+ if (ref->count && ref->parent) {
+ if (extent_item_pos && !ref->inode_list &&
+ ref->level == 0) {
+ struct extent_buffer *eb;
+
+ eb = read_tree_block(fs_info->extent_root,
+ ref->parent, 0);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ ret = find_extent_in_eb(eb, bytenr,
+ *extent_item_pos, &eie);
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ goto out;
+ ref->inode_list = eie;
+ }
+ ret = ulist_add_merge_ptr(refs, ref->parent,
+ ref->inode_list,
+ (void **)&eie, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ if (!ret && extent_item_pos) {
+ /*
+ * we've recorded that parent, so we must extend
+ * its inode list here
+ */
+ BUG_ON(!eie);
+ while (eie->next)
+ eie = eie->next;
+ eie->next = ref->inode_list;
+ }
+ eie = NULL;
+ }
+ list_del(&ref->list);
+ kmem_cache_free(btrfs_prelim_ref_cache, ref);
+ }
+
+out:
+ btrfs_free_path(path);
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ list_del(&ref->list);
+ kmem_cache_free(btrfs_prelim_ref_cache, ref);
+ }
+ while (!list_empty(&prefs_delayed)) {
+ ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+ list);
+ list_del(&ref->list);
+ kmem_cache_free(btrfs_prelim_ref_cache, ref);
+ }
+ if (ret < 0)
+ free_inode_elem_list(eie);
+ return ret;
+}
+
+static void free_leaf_list(struct ulist *blocks)
+{
+ struct ulist_node *node = NULL;
+ struct extent_inode_elem *eie;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(blocks, &uiter))) {
+ if (!node->aux)
+ continue;
+ eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
+ free_inode_elem_list(eie);
+ node->aux = 0;
+ }
+
+ ulist_free(blocks);
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos)
+{
+ int ret;
+
+ *leafs = ulist_alloc(GFP_NOFS);
+ if (!*leafs)
+ return -ENOMEM;
+
+ ret = find_parent_nodes(trans, fs_info, bytenr,
+ time_seq, *leafs, NULL, extent_item_pos, 0, 0);
+ if (ret < 0 && ret != -ENOENT) {
+ free_leaf_list(*leafs);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
+{
+ struct ulist *tmp;
+ struct ulist_node *node = NULL;
+ struct ulist_iterator uiter;
+ int ret;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ *roots = ulist_alloc(GFP_NOFS);
+ if (!*roots) {
+ ulist_free(tmp);
+ return -ENOMEM;
+ }
+
+ ULIST_ITER_INIT(&uiter);
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr,
+ time_seq, tmp, *roots, NULL, 0, 0);
+ if (ret < 0 && ret != -ENOENT) {
+ ulist_free(tmp);
+ ulist_free(*roots);
+ return ret;
+ }
+ node = ulist_next(tmp, &uiter);
+ if (!node)
+ break;
+ bytenr = node->val;
+ cond_resched();
+ }
+
+ ulist_free(tmp);
+ return 0;
+}
+
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
+{
+ int ret;
+
+ if (!trans)
+ down_read(&fs_info->commit_root_sem);
+ ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+ return ret;
+}
+
+/**
+ * btrfs_check_shared - tell us whether an extent is shared
+ *
+ * @trans: optional trans handle
+ *
+ * btrfs_check_shared uses the backref walking code but will short
+ * circuit as soon as it finds a root or inode that doesn't match the
+ * one passed in. This provides a significant performance benefit for
+ * callers (such as fiemap) which want to know whether the extent is
+ * shared but do not need a ref count.
+ *
+ * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
+ */
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 root_objectid,
+ u64 inum, u64 bytenr)
+{
+ struct ulist *tmp = NULL;
+ struct ulist *roots = NULL;
+ struct ulist_iterator uiter;
+ struct ulist_node *node;
+ struct seq_list elem = SEQ_LIST_INIT(elem);
+ int ret = 0;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ roots = ulist_alloc(GFP_NOFS);
+ if (!tmp || !roots) {
+ ulist_free(tmp);
+ ulist_free(roots);
+ return -ENOMEM;
+ }
+
+ if (trans)
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ else
+ down_read(&fs_info->commit_root_sem);
+ ULIST_ITER_INIT(&uiter);
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+ roots, NULL, root_objectid, inum);
+ if (ret == BACKREF_FOUND_SHARED) {
+ /* this is the only condition under which we return 1 */
+ ret = 1;
+ break;
+ }
+ if (ret < 0 && ret != -ENOENT)
+ break;
+ ret = 0;
+ node = ulist_next(tmp, &uiter);
+ if (!node)
+ break;
+ bytenr = node->val;
+ cond_resched();
+ }
+ if (trans)
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ else
+ up_read(&fs_info->commit_root_sem);
+ ulist_free(tmp);
+ ulist_free(roots);
+ return ret;
+}
+
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off)
+{
+ int ret, slot;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = start_off;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If the item at offset is not found,
+ * btrfs_search_slot will point us to the slot
+ * where it should be inserted. In our case
+ * that will be the slot directly before the
+ * next INODE_REF_KEY_V2 item. In the case
+ * that we're pointing to the last slot in a
+ * leaf, we must move one leaf over.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret) {
+ if (ret >= 1)
+ ret = -ENOENT;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /*
+ * Check that we're still looking at an extended ref key for
+ * this particular objectid. If we have different
+ * objectid or type then there are no more to be found
+ * in the tree and we can exit.
+ */
+ ret = -ENOENT;
+ if (found_key.objectid != inode_objectid)
+ break;
+ if (found_key.type != BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ ret = 0;
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ extref = (struct btrfs_inode_extref *)ptr;
+ *ret_extref = extref;
+ if (found_off)
+ *found_off = found_key.offset;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * this iterates to turn a name (from iref/extref) into a full filesystem path.
+ * Elements of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size)
+{
+ int slot;
+ u64 next_inum;
+ int ret;
+ s64 bytes_left = ((s64)size) - 1;
+ struct extent_buffer *eb = eb_in;
+ struct btrfs_key found_key;
+ int leave_spinning = path->leave_spinning;
+ struct btrfs_inode_ref *iref;
+
+ if (bytes_left >= 0)
+ dest[bytes_left] = '\0';
+
+ path->leave_spinning = 1;
+ while (1) {
+ bytes_left -= name_len;
+ if (bytes_left >= 0)
+ read_extent_buffer(eb, dest + bytes_left,
+ name_off, name_len);
+ if (eb != eb_in) {
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+ }
+ ret = btrfs_find_item(fs_root, path, parent, 0,
+ BTRFS_INODE_REF_KEY, &found_key);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ break;
+
+ next_inum = found_key.offset;
+
+ /* regular exit ahead */
+ if (parent == next_inum)
+ break;
+
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ /* make sure we can use eb after releasing the path */
+ if (eb != eb_in) {
+ atomic_inc(&eb->refs);
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ }
+ btrfs_release_path(path);
+ iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_off = (unsigned long)(iref + 1);
+
+ parent = next_inum;
+ --bytes_left;
+ if (bytes_left >= 0)
+ dest[bytes_left] = '/';
+ }
+
+ btrfs_release_path(path);
+ path->leave_spinning = leave_spinning;
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ return dest + bytes_left;
+}
+
+/*
+ * this makes the path point to (logical EXTENT_ITEM *)
+ * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
+ * tree blocks and <0 on error.
+ */
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags_ret)
+{
+ int ret;
+ u64 flags;
+ u64 size = 0;
+ u32 item_size;
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = logical;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
+ if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+ size = fs_info->extent_root->nodesize;
+ else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+ size = found_key->offset;
+
+ if (found_key->objectid > logical ||
+ found_key->objectid + size <= logical) {
+ pr_debug("logical %llu is not within any extent\n", logical);
+ return -ENOENT;
+ }
+
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+ flags = btrfs_extent_flags(eb, ei);
+
+ pr_debug("logical %llu is at position %llu within the extent (%llu "
+ "EXTENT_ITEM %llu) flags %#llx size %u\n",
+ logical, logical - found_key->objectid, found_key->objectid,
+ found_key->offset, flags, item_size);
+
+ WARN_ON(!flags_ret);
+ if (flags_ret) {
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ *flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+ else if (flags & BTRFS_EXTENT_FLAG_DATA)
+ *flags_ret = BTRFS_EXTENT_FLAG_DATA;
+ else
+ BUG_ON(1);
+ return 0;
+ }
+
+ return -EIO;
+}
+
+/*
+ * helper function to iterate extent inline refs. ptr must point to a 0 value
+ * for the first call and may be modified. it is used to track state.
+ * if more refs exist, 0 is returned and the next call to
+ * __get_extent_inline_ref must pass the modified ptr parameter to get the
+ * next ref. after the last ref was processed, 1 is returned.
+ * returns <0 on error
+ */
+static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
+ struct btrfs_key *key,
+ struct btrfs_extent_item *ei, u32 item_size,
+ struct btrfs_extent_inline_ref **out_eiref,
+ int *out_type)
+{
+ unsigned long end;
+ u64 flags;
+ struct btrfs_tree_block_info *info;
+
+ if (!*ptr) {
+ /* first call */
+ flags = btrfs_extent_flags(eb, ei);
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if (key->type == BTRFS_METADATA_ITEM_KEY) {
+ /* a skinny metadata extent */
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(ei + 1);
+ } else {
+ WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(info + 1);
+ }
+ } else {
+ *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ }
+ *ptr = (unsigned long)*out_eiref;
+ if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
+ return -ENOENT;
+ }
+
+ end = (unsigned long)ei + item_size;
+ *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
+ *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
+
+ *ptr += btrfs_extent_inline_ref_size(*out_type);
+ WARN_ON(*ptr > end);
+ if (*ptr == end)
+ return 1; /* last */
+
+ return 0;
+}
+
+/*
+ * reads the tree block backref for an extent. tree level and root are returned
+ * through out_level and out_root. ptr must point to a 0 value for the first
+ * call and may be modified (see __get_extent_inline_ref comment).
+ * returns 0 if data was provided, 1 if there was no more data to provide or
+ * <0 on error.
+ */
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level)
+{
+ int ret;
+ int type;
+ struct btrfs_extent_inline_ref *eiref;
+
+ if (*ptr == (unsigned long)-1)
+ return 1;
+
+ while (1) {
+ ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
+ &eiref, &type);
+ if (ret < 0)
+ return ret;
+
+ if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_BLOCK_REF_KEY)
+ break;
+
+ if (ret == 1)
+ return 1;
+ }
+
+ /* we can treat both ref types equally here */
+ *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
+
+ if (key->type == BTRFS_EXTENT_ITEM_KEY) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_level = btrfs_tree_block_level(eb, info);
+ } else {
+ ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
+ *out_level = (u8)key->offset;
+ }
+
+ if (ret == 1)
+ *ptr = (unsigned long)-1;
+
+ return 0;
+}
+
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+ u64 root, u64 extent_item_objectid,
+ iterate_extent_inodes_t *iterate, void *ctx)
+{
+ struct extent_inode_elem *eie;
+ int ret = 0;
+
+ for (eie = inode_list; eie; eie = eie->next) {
+ pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
+ "root %llu\n", extent_item_objectid,
+ eie->inum, eie->offset, root);
+ ret = iterate(eie->inum, eie->offset, root, ctx);
+ if (ret) {
+ pr_debug("stopping iteration for %llu due to ret=%d\n",
+ extent_item_objectid, ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * calls iterate() for every inode that references the extent identified by
+ * the given parameters.
+ * when the iterator function returns a non-zero value, iteration stops.
+ */
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+ u64 extent_item_objectid, u64 extent_item_pos,
+ int search_commit_root,
+ iterate_extent_inodes_t *iterate, void *ctx)
+{
+ int ret;
+ struct btrfs_trans_handle *trans = NULL;
+ struct ulist *refs = NULL;
+ struct ulist *roots = NULL;
+ struct ulist_node *ref_node = NULL;
+ struct ulist_node *root_node = NULL;
+ struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+ struct ulist_iterator ref_uiter;
+ struct ulist_iterator root_uiter;
+
+ pr_debug("resolving all inodes for extent %llu\n",
+ extent_item_objectid);
+
+ if (!search_commit_root) {
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
+
+ ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+ tree_mod_seq_elem.seq, &refs,
+ &extent_item_pos);
+ if (ret)
+ goto out;
+
+ ULIST_ITER_INIT(&ref_uiter);
+ while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+ ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
+ tree_mod_seq_elem.seq, &roots);
+ if (ret)
+ break;
+ ULIST_ITER_INIT(&root_uiter);
+ while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+ pr_debug("root %llu references leaf %llu, data list "
+ "%#llx\n", root_node->val, ref_node->val,
+ ref_node->aux);
+ ret = iterate_leaf_refs((struct extent_inode_elem *)
+ (uintptr_t)ref_node->aux,
+ root_node->val,
+ extent_item_objectid,
+ iterate, ctx);
+ }
+ ulist_free(roots);
+ }
+
+ free_leaf_list(refs);
+out:
+ if (!search_commit_root) {
+ btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_end_transaction(trans, fs_info->extent_root);
+ } else {
+ up_read(&fs_info->commit_root_sem);
+ }
+
+ return ret;
+}
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ iterate_extent_inodes_t *iterate, void *ctx)
+{
+ int ret;
+ u64 extent_item_pos;
+ u64 flags = 0;
+ struct btrfs_key found_key;
+ int search_commit_root = path->search_commit_root;
+
+ ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
+ btrfs_release_path(path);
+ if (ret < 0)
+ return ret;
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ return -EINVAL;
+
+ extent_item_pos = logical - found_key.objectid;
+ ret = iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, search_commit_root,
+ iterate, ctx);
+
+ return ret;
+}
+
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ iterate_irefs_t *iterate, void *ctx)
+{
+ int ret = 0;
+ int slot;
+ u32 cur;
+ u32 len;
+ u32 name_len;
+ u64 parent = 0;
+ int found = 0;
+ struct extent_buffer *eb;
+ struct btrfs_item *item;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_key found_key;
+
+ while (!ret) {
+ ret = btrfs_find_item(fs_root, path, inum,
+ parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
+ &found_key);
+
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = found ? 0 : -ENOENT;
+ break;
+ }
+ ++found;
+
+ parent = found_key.offset;
+ slot = path->slots[0];
+ eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!eb) {
+ ret = -ENOMEM;
+ break;
+ }
+ extent_buffer_get(eb);
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_release_path(path);
+
+ item = btrfs_item_nr(slot);
+ iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+ for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ /* path must be released before calling iterate()! */
+ pr_debug("following ref at offset %u for inode %llu in "
+ "tree %llu\n", cur, found_key.objectid,
+ fs_root->objectid);
+ ret = iterate(parent, name_len,
+ (unsigned long)(iref + 1), eb, ctx);
+ if (ret)
+ break;
+ len = sizeof(*iref) + name_len;
+ iref = (struct btrfs_inode_ref *)((char *)iref + len);
+ }
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+ }
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path,
+ iterate_irefs_t *iterate, void *ctx)
+{
+ int ret;
+ int slot;
+ u64 offset = 0;
+ u64 parent;
+ int found = 0;
+ struct extent_buffer *eb;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ u32 item_size;
+ u32 cur_offset;
+ unsigned long ptr;
+
+ while (1) {
+ ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+ &offset);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = found ? 0 : -ENOENT;
+ break;
+ }
+ ++found;
+
+ slot = path->slots[0];
+ eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!eb) {
+ ret = -ENOMEM;
+ break;
+ }
+ extent_buffer_get(eb);
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_release_path(path);
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ u32 name_len;
+
+ extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ name_len = btrfs_inode_extref_name_len(eb, extref);
+ ret = iterate(parent, name_len,
+ (unsigned long)&extref->name, eb, ctx);
+ if (ret)
+ break;
+
+ cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+ cur_offset += sizeof(*extref);
+ }
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+
+ offset++;
+ }
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+ struct btrfs_path *path, iterate_irefs_t *iterate,
+ void *ctx)
+{
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
+}
+
+/*
+ * returns 0 if the path could be dumped (probably truncated)
+ * returns <0 in case of an error
+ */
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, void *ctx)
+{
+ struct inode_fs_paths *ipath = ctx;
+ char *fspath;
+ char *fspath_min;
+ int i = ipath->fspath->elem_cnt;
+ const int s_ptr = sizeof(char *);
+ u32 bytes_left;
+
+ bytes_left = ipath->fspath->bytes_left > s_ptr ?
+ ipath->fspath->bytes_left - s_ptr : 0;
+
+ fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
+ fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+ name_off, eb, inum, fspath_min, bytes_left);
+ if (IS_ERR(fspath))
+ return PTR_ERR(fspath);
+
+ if (fspath > fspath_min) {
+ ipath->fspath->val[i] = (u64)(unsigned long)fspath;
+ ++ipath->fspath->elem_cnt;
+ ipath->fspath->bytes_left = fspath - fspath_min;
+ } else {
+ ++ipath->fspath->elem_missed;
+ ipath->fspath->bytes_missing += fspath_min - fspath;
+ ipath->fspath->bytes_left = 0;
+ }
+
+ return 0;
+}
+
+/*
+ * this dumps all file system paths to the inode into the ipath struct, provided
+ * is has been created large enough. each path is zero-terminated and accessed
+ * from ipath->fspath->val[i].
+ * when it returns, there are ipath->fspath->elem_cnt number of paths available
+ * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
+ * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
+ * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
+ * have been needed to return all paths.
+ */
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
+{
+ return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
+ inode_to_path, ipath);
+}
+
+struct btrfs_data_container *init_data_container(u32 total_bytes)
+{
+ struct btrfs_data_container *data;
+ size_t alloc_bytes;
+
+ alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
+ data = vmalloc(alloc_bytes);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ if (total_bytes >= sizeof(*data)) {
+ data->bytes_left = total_bytes - sizeof(*data);
+ data->bytes_missing = 0;
+ } else {
+ data->bytes_missing = sizeof(*data) - total_bytes;
+ data->bytes_left = 0;
+ }
+
+ data->elem_cnt = 0;
+ data->elem_missed = 0;
+
+ return data;
+}
+
+/*
+ * allocates space to return multiple file system paths for an inode.
+ * total_bytes to allocate are passed, note that space usable for actual path
+ * information will be total_bytes - sizeof(struct inode_fs_paths).
+ * the returned pointer must be freed with free_ipath() in the end.
+ */
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+ struct btrfs_path *path)
+{
+ struct inode_fs_paths *ifp;
+ struct btrfs_data_container *fspath;
+
+ fspath = init_data_container(total_bytes);
+ if (IS_ERR(fspath))
+ return (void *)fspath;
+
+ ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
+ if (!ifp) {
+ kfree(fspath);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ifp->btrfs_path = path;
+ ifp->fspath = fspath;
+ ifp->fs_root = fs_root;
+
+ return ifp;
+}
+
+void free_ipath(struct inode_fs_paths *ipath)
+{
+ if (!ipath)
+ return;
+ vfree(ipath->fspath);
+ kfree(ipath);
+}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000..9c41fbac3
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_BACKREF__
+#define __BTRFS_BACKREF__
+
+#include <linux/btrfs.h>
+#include "ulist.h"
+#include "extent_io.h"
+
+struct inode_fs_paths {
+ struct btrfs_path *btrfs_path;
+ struct btrfs_root *fs_root;
+ struct btrfs_data_container *fspath;
+};
+
+typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
+ void *ctx);
+
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+ struct btrfs_path *path, struct btrfs_key *found_key,
+ u64 *flags);
+
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level);
+
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+ u64 extent_item_objectid,
+ u64 extent_offset, int search_commit_root,
+ iterate_extent_inodes_t *iterate, void *ctx);
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ iterate_extent_inodes_t *iterate, void *ctx);
+
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots);
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+ char *dest, u32 size);
+
+struct btrfs_data_container *init_data_container(u32 total_bytes);
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+ struct btrfs_path *path);
+void free_ipath(struct inode_fs_paths *ipath);
+
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+ u64 start_off, struct btrfs_path *path,
+ struct btrfs_inode_extref **ret_extref,
+ u64 *found_off);
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 root_objectid,
+ u64 inum, u64 bytenr);
+
+int __init btrfs_prelim_ref_init(void);
+void btrfs_prelim_ref_exit(void);
+#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000..0ef5cc13f
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,328 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_I__
+#define __BTRFS_I__
+
+#include <linux/hash.h>
+#include "extent_map.h"
+#include "extent_io.h"
+#include "ordered-data.h"
+#include "delayed-inode.h"
+
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
+#define BTRFS_INODE_ORPHAN_META_RESERVED 1
+#define BTRFS_INODE_DUMMY 2
+#define BTRFS_INODE_IN_DEFRAG 3
+#define BTRFS_INODE_DELALLOC_META_RESERVED 4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
+#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
+#define BTRFS_INODE_NEEDS_FULL_SYNC 7
+#define BTRFS_INODE_COPY_EVERYTHING 8
+#define BTRFS_INODE_IN_DELALLOC_LIST 9
+#define BTRFS_INODE_READDIO_NEED_LOCK 10
+#define BTRFS_INODE_HAS_PROPS 11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR 12
+#define BTRFS_INODE_BTREE_LOG1_ERR 13
+#define BTRFS_INODE_BTREE_LOG2_ERR 14
+
+/* in memory btrfs inode */
+struct btrfs_inode {
+ /* which subvolume this inode belongs to */
+ struct btrfs_root *root;
+
+ /* key used to find this inode on disk. This is used by the code
+ * to read in roots of subvolumes
+ */
+ struct btrfs_key location;
+
+ /*
+ * Lock for counters and all fields used to determine if the inode is in
+ * the log or not (last_trans, last_sub_trans, last_log_commit,
+ * logged_trans).
+ */
+ spinlock_t lock;
+
+ /* the extent_tree has caches of all the extent mappings to disk */
+ struct extent_map_tree extent_tree;
+
+ /* the io_tree does range state (DIRTY, LOCKED etc) */
+ struct extent_io_tree io_tree;
+
+ /* special utility tree used to record which mirrors have already been
+ * tried when checksums fail for a given block
+ */
+ struct extent_io_tree io_failure_tree;
+
+ /* held while logging the inode in tree-log.c */
+ struct mutex log_mutex;
+
+ /* held while doing delalloc reservations */
+ struct mutex delalloc_mutex;
+
+ /* used to order data wrt metadata */
+ struct btrfs_ordered_inode_tree ordered_tree;
+
+ /* list of all the delalloc inodes in the FS. There are times we need
+ * to write all the delalloc pages to disk, and this list is used
+ * to walk them all.
+ */
+ struct list_head delalloc_inodes;
+
+ /* node for the red-black tree that links inodes in subvolume root */
+ struct rb_node rb_node;
+
+ unsigned long runtime_flags;
+
+ /* Keep track of who's O_SYNC/fsyncing currently */
+ atomic_t sync_writers;
+
+ /* full 64 bit generation number, struct vfs_inode doesn't have a big
+ * enough field for this.
+ */
+ u64 generation;
+
+ /*
+ * transid of the trans_handle that last modified this inode
+ */
+ u64 last_trans;
+
+ /*
+ * transid that last logged this inode
+ */
+ u64 logged_trans;
+
+ /*
+ * log transid when this inode was last modified
+ */
+ int last_sub_trans;
+
+ /* a local copy of root's last_log_commit */
+ int last_log_commit;
+
+ /* total number of bytes pending delalloc, used by stat to calc the
+ * real block usage of the file
+ */
+ u64 delalloc_bytes;
+
+ /*
+ * total number of bytes pending defrag, used by stat to check whether
+ * it needs COW.
+ */
+ u64 defrag_bytes;
+
+ /*
+ * the size of the file stored in the metadata on disk. data=ordered
+ * means the in-memory i_size might be larger than the size on disk
+ * because not all the blocks are written yet.
+ */
+ u64 disk_i_size;
+
+ /*
+ * if this is a directory then index_cnt is the counter for the index
+ * number for new files that are created
+ */
+ u64 index_cnt;
+
+ /* Cache the directory index number to speed the dir/file remove */
+ u64 dir_index;
+
+ /* the fsync log has some corner cases that mean we have to check
+ * directories to see if any unlinks have been done before
+ * the directory was logged. See tree-log.c for all the
+ * details
+ */
+ u64 last_unlink_trans;
+
+ /*
+ * Number of bytes outstanding that are going to need csums. This is
+ * used in ENOSPC accounting.
+ */
+ u64 csum_bytes;
+
+ /* flags field from the on disk inode */
+ u32 flags;
+
+ /*
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for.
+ */
+ unsigned outstanding_extents;
+ unsigned reserved_extents;
+
+ /*
+ * always compress this one file
+ */
+ unsigned force_compress;
+
+ struct btrfs_delayed_node *delayed_node;
+
+ /* File creation time. */
+ struct timespec i_otime;
+
+ struct inode vfs_inode;
+};
+
+extern unsigned char btrfs_filetype_table[];
+
+static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
+{
+ return container_of(inode, struct btrfs_inode, vfs_inode);
+}
+
+static inline unsigned long btrfs_inode_hash(u64 objectid,
+ const struct btrfs_root *root)
+{
+ u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+
+#if BITS_PER_LONG == 32
+ h = (h >> 32) ^ (h & 0xffffffff);
+#endif
+
+ return (unsigned long)h;
+}
+
+static inline void btrfs_insert_inode_hash(struct inode *inode)
+{
+ unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
+
+ __insert_inode_hash(inode, h);
+}
+
+static inline u64 btrfs_ino(struct inode *inode)
+{
+ u64 ino = BTRFS_I(inode)->location.objectid;
+
+ /*
+ * !ino: btree_inode
+ * type == BTRFS_ROOT_ITEM_KEY: subvol dir
+ */
+ if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
+ ino = inode->i_ino;
+ return ino;
+}
+
+static inline void btrfs_i_size_write(struct inode *inode, u64 size)
+{
+ i_size_write(inode, size);
+ BTRFS_I(inode)->disk_i_size = size;
+}
+
+static inline bool btrfs_is_free_space_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (root == root->fs_info->tree_root &&
+ btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
+ return true;
+ if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+ return true;
+ return false;
+}
+
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+ int ret = 0;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (BTRFS_I(inode)->logged_trans == generation &&
+ BTRFS_I(inode)->last_sub_trans <=
+ BTRFS_I(inode)->last_log_commit &&
+ BTRFS_I(inode)->last_sub_trans <=
+ BTRFS_I(inode)->root->last_log_commit) {
+ /*
+ * After a ranged fsync we might have left some extent maps
+ * (that fall outside the fsync's range). So return false
+ * here if the list isn't empty, to make sure btrfs_log_inode()
+ * will be called and process those extent maps.
+ */
+ smp_mb();
+ if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+ ret = 1;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return ret;
+}
+
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
+
+struct btrfs_dio_private {
+ struct inode *inode;
+ unsigned long flags;
+ u64 logical_offset;
+ u64 disk_bytenr;
+ u64 bytes;
+ void *private;
+
+ /* number of bios pending for this dio */
+ atomic_t pending_bios;
+
+ /* IO errors */
+ int errors;
+
+ /* orig_bio is our btrfs_io_bio */
+ struct bio *orig_bio;
+
+ /* dio_bio came from fs/direct-io.c */
+ struct bio *dio_bio;
+
+ /*
+ * The original bio may be splited to several sub-bios, this is
+ * done during endio of sub-bios
+ */
+ int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
+};
+
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+ set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+ smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb__before_atomic();
+ clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags);
+}
+
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
+
+#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000..ce7dec88f
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3245 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system in an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause that the data on disk is
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ * currently referenced by the super block (either directly
+ * or indirectly).
+ * 2. When a super block is written, it is verified that all
+ * referenced (directly or indirectly) blocks fulfill the
+ * following requirements:
+ * 2a. All referenced blocks have either been present when
+ * the file system was mounted, (i.e., they have been
+ * referenced by the super block) or they have been
+ * written since then and the write completion callback
+ * was called and no write error was indicated and a
+ * FLUSH request to the device where these blocks are
+ * located was received and completed.
+ * 2b. All referenced blocks need to have a generation
+ * number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ *
+ * Expect millions of lines of information in the kernel log with an
+ * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
+ * kernel config to at least 26 (which is 64MB). Usually the value is
+ * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
+ * changed like this before LOG_BUF_SHIFT can be set to a high value:
+ * config LOG_BUF_SHIFT
+ * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
+ * range 12 30
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
+ * excluding " [...]" */
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+ u32 magic_num; /* only used for debug purposes */
+ unsigned int is_metadata:1; /* if it is meta-data, not data-data */
+ unsigned int is_superblock:1; /* if it is one of the superblocks */
+ unsigned int is_iodone:1; /* if is done by lower subsystem */
+ unsigned int iodone_w_error:1; /* error was indicated to endio */
+ unsigned int never_written:1; /* block was added because it was
+ * referenced, not because it was
+ * written */
+ unsigned int mirror_num; /* large enough to hold
+ * BTRFS_SUPER_MIRROR_MAX */
+ struct btrfsic_dev_state *dev_state;
+ u64 dev_bytenr; /* key, physical byte num on disk */
+ u64 logical_bytenr; /* logical byte num on disk */
+ u64 generation;
+ struct btrfs_disk_key disk_key; /* extra info to print in case of
+ * issues, will not always be correct */
+ struct list_head collision_resolving_node; /* list node */
+ struct list_head all_blocks_node; /* list node */
+
+ /* the following two lists contain block_link items */
+ struct list_head ref_to_list; /* list */
+ struct list_head ref_from_list; /* list */
+ struct btrfsic_block *next_in_same_bio;
+ void *orig_bio_bh_private;
+ union {
+ bio_end_io_t *bio;
+ bh_end_io_t *bh;
+ } orig_bio_bh_end_io;
+ int submit_bio_bh_rw;
+ u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be ref from multiple blocks.
+ * The key to lookup them in the hashtable is the dev_bytenr of
+ * the block ref to plus the one from the block refered from.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case that an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+ u32 magic_num; /* only used for debug purposes */
+ u32 ref_cnt;
+ struct list_head node_ref_to; /* list node */
+ struct list_head node_ref_from; /* list node */
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block *block_ref_to;
+ struct btrfsic_block *block_ref_from;
+ u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+ u32 magic_num; /* only used for debug purposes */
+ struct block_device *bdev;
+ struct btrfsic_state *state;
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block dummy_block_for_bio_bh_flush;
+ u64 last_flush_gen;
+ char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+ struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+ u64 start; /* virtual bytenr */
+ u64 dev_bytenr; /* physical bytenr on device */
+ u32 len;
+ struct btrfsic_dev_state *dev;
+ char **datav;
+ struct page **pagev;
+ void *mem_to_free;
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+ u32 magic;
+ u32 nr;
+ int error;
+ int i;
+ int limit_nesting;
+ int num_copies;
+ int mirror_num;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx *block_ctx;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfs_header *hdr;
+ struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+ u32 print_mask;
+ int include_extent_data;
+ int csum_size;
+ struct list_head all_blocks_list;
+ struct btrfsic_block_hashtable block_hashtable;
+ struct btrfsic_block_link_hashtable block_link_hashtable;
+ struct btrfs_root *root;
+ u64 max_superblock_generation;
+ struct btrfsic_block *latest_superblock;
+ u32 metablock_size;
+ u32 datablock_size;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dst, u32 offset, size_t len);
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx
+ *block_ctx, u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ char **datav, unsigned int num_pages);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+ b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+ b->dev_state = NULL;
+ b->dev_bytenr = 0;
+ b->logical_bytenr = 0;
+ b->generation = BTRFSIC_GENERATION_UNKNOWN;
+ b->disk_key.objectid = 0;
+ b->disk_key.type = 0;
+ b->disk_key.offset = 0;
+ b->is_metadata = 0;
+ b->is_superblock = 0;
+ b->is_iodone = 0;
+ b->iodone_w_error = 0;
+ b->never_written = 0;
+ b->mirror_num = 0;
+ b->next_in_same_bio = NULL;
+ b->orig_bio_bh_private = NULL;
+ b->orig_bio_bh_end_io.bio = NULL;
+ INIT_LIST_HEAD(&b->collision_resolving_node);
+ INIT_LIST_HEAD(&b->all_blocks_node);
+ INIT_LIST_HEAD(&b->ref_to_list);
+ INIT_LIST_HEAD(&b->ref_from_list);
+ b->submit_bio_bh_rw = 0;
+ b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+ struct btrfsic_block *b;
+
+ b = kzalloc(sizeof(*b), GFP_NOFS);
+ if (NULL != b)
+ btrfsic_block_init(b);
+
+ return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+ BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+ kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+ l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+ l->ref_cnt = 1;
+ INIT_LIST_HEAD(&l->node_ref_to);
+ INIT_LIST_HEAD(&l->node_ref_from);
+ INIT_LIST_HEAD(&l->collision_resolving_node);
+ l->block_ref_to = NULL;
+ l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+ struct btrfsic_block_link *l;
+
+ l = kzalloc(sizeof(*l), GFP_NOFS);
+ if (NULL != l)
+ btrfsic_block_link_init(l);
+
+ return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+ BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+ kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+ ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+ ds->bdev = NULL;
+ ds->state = NULL;
+ ds->name[0] = '\0';
+ INIT_LIST_HEAD(&ds->collision_resolving_node);
+ ds->last_flush_gen = 0;
+ btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+ ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+ ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = kzalloc(sizeof(*ds), GFP_NOFS);
+ if (NULL != ds)
+ btrfsic_dev_state_init(ds);
+
+ return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+ BUG_ON(!(NULL == ds ||
+ BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+ kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(b->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+ list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+ list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block *const b =
+ list_entry(elem, struct btrfsic_block,
+ collision_resolving_node);
+
+ if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+ return b;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+ ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+ & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+ list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+ ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_from))) &
+ (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem, struct btrfsic_block_link,
+ collision_resolving_node);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+ l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+ l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+ l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+ return l;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)ds->bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+ list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+ list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_dev_state *const ds =
+ list_entry(elem, struct btrfsic_dev_state,
+ collision_resolving_node);
+
+ if (ds->bdev == bdev)
+ return ds;
+ }
+
+ return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int ret = 0;
+ struct btrfs_super_block *selected_super;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+ struct btrfsic_dev_state *selected_dev_state = NULL;
+ int pass;
+
+ BUG_ON(NULL == state);
+ selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
+ if (NULL == selected_super) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return -1;
+ }
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ int i;
+ struct btrfsic_dev_state *dev_state;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ dev_state = btrfsic_dev_state_lookup(device->bdev);
+ BUG_ON(NULL == dev_state);
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ ret = btrfsic_process_superblock_dev_mirror(
+ state, dev_state, device, i,
+ &selected_dev_state, selected_super);
+ if (0 != ret && 0 == i) {
+ kfree(selected_super);
+ return ret;
+ }
+ }
+ }
+
+ if (NULL == state->latest_superblock) {
+ printk(KERN_INFO "btrfsic: no superblock found!\n");
+ kfree(selected_super);
+ return -1;
+ }
+
+ state->csum_size = btrfs_super_csum_size(selected_super);
+
+ for (pass = 0; pass < 3; pass++) {
+ int num_copies;
+ int mirror_num;
+ u64 next_bytenr;
+
+ switch (pass) {
+ case 0:
+ next_bytenr = btrfs_super_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n", next_bytenr);
+ break;
+ case 1:
+ next_bytenr = btrfs_super_chunk_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n", next_bytenr);
+ break;
+ case 2:
+ next_bytenr = btrfs_super_log_root(selected_super);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n", next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(root @%llu,"
+ " mirror %d) failed!\n",
+ next_bytenr, mirror_num);
+ kfree(selected_super);
+ return -1;
+ }
+
+ next_block = btrfsic_block_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ &state->block_hashtable);
+ BUG_ON(NULL == next_block);
+
+ l = btrfsic_block_link_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ state->latest_superblock->dev_state->
+ bdev,
+ state->latest_superblock->dev_bytenr,
+ &state->block_link_hashtable);
+ BUG_ON(NULL == l);
+
+ ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+ if (ret < (int)PAGE_CACHE_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: read @logical %llu failed!\n",
+ tmp_next_block_ctx.start);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ kfree(selected_super);
+ return -1;
+ }
+
+ ret = btrfsic_process_metablock(state,
+ next_block,
+ &tmp_next_block_ctx,
+ BTRFS_MAX_LEVEL + 3, 1);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ }
+ }
+
+ kfree(selected_super);
+ return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super)
+{
+ struct btrfs_super_block *super_tmp;
+ u64 dev_bytenr;
+ struct buffer_head *bh;
+ struct btrfsic_block *superblock_tmp;
+ int pass;
+ struct block_device *const superblock_bdev = device->bdev;
+
+ /* super block bytenr is always the unmapped device bytenr */
+ dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
+ return -1;
+ bh = __bread(superblock_bdev, dev_bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (NULL == bh)
+ return -1;
+ super_tmp = (struct btrfs_super_block *)
+ (bh->b_data + (dev_bytenr & 4095));
+
+ if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+ btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
+ memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+ btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+ btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
+ brelse(bh);
+ return 0;
+ }
+
+ superblock_tmp =
+ btrfsic_block_hashtable_lookup(superblock_bdev,
+ dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == superblock_tmp) {
+ superblock_tmp = btrfsic_block_alloc();
+ if (NULL == superblock_tmp) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ brelse(bh);
+ return -1;
+ }
+ /* for superblock, only the dev_bytenr makes sense */
+ superblock_tmp->dev_bytenr = dev_bytenr;
+ superblock_tmp->dev_state = dev_state;
+ superblock_tmp->logical_bytenr = dev_bytenr;
+ superblock_tmp->generation = btrfs_super_generation(super_tmp);
+ superblock_tmp->is_metadata = 1;
+ superblock_tmp->is_superblock = 1;
+ superblock_tmp->is_iodone = 1;
+ superblock_tmp->never_written = 0;
+ superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
+ " @%llu (%s/%llu/%d)\n",
+ superblock_bdev,
+ rcu_str_deref(device->name), dev_bytenr,
+ dev_state->name, dev_bytenr,
+ superblock_mirror_num);
+ list_add(&superblock_tmp->all_blocks_node,
+ &state->all_blocks_list);
+ btrfsic_block_hashtable_add(superblock_tmp,
+ &state->block_hashtable);
+ }
+
+ /* select the one with the highest generation field */
+ if (btrfs_super_generation(super_tmp) >
+ state->max_superblock_generation ||
+ 0 == state->max_superblock_generation) {
+ memcpy(selected_super, super_tmp, sizeof(*selected_super));
+ *selected_dev_state = dev_state;
+ state->max_superblock_generation =
+ btrfs_super_generation(super_tmp);
+ state->latest_superblock = superblock_tmp;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ u64 next_bytenr;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key;
+
+ tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_disk_key.offset = 0;
+ switch (pass) {
+ case 0:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "initial root ";
+ next_bytenr = btrfs_super_root(super_tmp);
+ break;
+ case 1:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "initial chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_tmp);
+ break;
+ case 2:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "initial log ";
+ next_bytenr = btrfs_super_log_root(super_tmp);
+ if (0 == next_bytenr)
+ continue;
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ if (btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ &tmp_next_block_ctx,
+ mirror_num)) {
+ printk(KERN_INFO "btrfsic: btrfsic_map_block("
+ "bytenr @%llu, mirror %d) failed!\n",
+ next_bytenr, mirror_num);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ additional_string, 1, 1, 0,
+ mirror_num, NULL);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ next_block, superblock_tmp,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l) {
+ brelse(bh);
+ return -1;
+ }
+ }
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+ btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+ brelse(bh);
+ return 0;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+ struct btrfsic_stack_frame *sf;
+
+ sf = kzalloc(sizeof(*sf), GFP_NOFS);
+ if (NULL == sf)
+ printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+ else
+ sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+ return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+ BUG_ON(!(NULL == sf ||
+ BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+ kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const first_block,
+ struct btrfsic_block_data_ctx *const first_block_ctx,
+ int first_limit_nesting, int force_iodone_flag)
+{
+ struct btrfsic_stack_frame initial_stack_frame = { 0 };
+ struct btrfsic_stack_frame *sf;
+ struct btrfsic_stack_frame *next_stack;
+ struct btrfs_header *const first_hdr =
+ (struct btrfs_header *)first_block_ctx->datav[0];
+
+ BUG_ON(!first_hdr);
+ sf = &initial_stack_frame;
+ sf->error = 0;
+ sf->i = -1;
+ sf->limit_nesting = first_limit_nesting;
+ sf->block = first_block;
+ sf->block_ctx = first_block_ctx;
+ sf->next_block = NULL;
+ sf->hdr = first_hdr;
+ sf->prev = NULL;
+
+continue_with_new_stack_frame:
+ sf->block->generation = le64_to_cpu(sf->hdr->generation);
+ if (0 == sf->hdr->level) {
+ struct btrfs_leaf *const leafhdr =
+ (struct btrfs_leaf *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "leaf %llu items %d generation %llu"
+ " owner %llu\n",
+ sf->block_ctx->start, sf->nr,
+ btrfs_stack_header_generation(
+ &leafhdr->header),
+ btrfs_stack_header_owner(
+ &leafhdr->header));
+ }
+
+continue_with_current_leaf_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_item disk_item;
+ u32 disk_item_offset =
+ (uintptr_t)(leafhdr->items + sf->i) -
+ (uintptr_t)leafhdr;
+ struct btrfs_disk_key *disk_key;
+ u8 type;
+ u32 item_offset;
+ u32 item_size;
+
+ if (disk_item_offset + sizeof(struct btrfs_item) >
+ sf->block_ctx->len) {
+leaf_item_out_of_bounce_error:
+ printk(KERN_INFO
+ "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(sf->block_ctx,
+ &disk_item,
+ disk_item_offset,
+ sizeof(struct btrfs_item));
+ item_offset = btrfs_stack_item_offset(&disk_item);
+ item_size = btrfs_stack_item_size(&disk_item);
+ disk_key = &disk_item.key;
+ type = btrfs_disk_key_type(disk_key);
+
+ if (BTRFS_ROOT_ITEM_KEY == type) {
+ struct btrfs_root_item root_item;
+ u32 root_item_offset;
+ u64 next_bytenr;
+
+ root_item_offset = item_offset +
+ offsetof(struct btrfs_leaf, items);
+ if (root_item_offset + item_size >
+ sf->block_ctx->len)
+ goto leaf_item_out_of_bounce_error;
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &root_item,
+ root_item_offset,
+ item_size);
+ next_bytenr = btrfs_root_bytenr(&root_item);
+
+ sf->error =
+ btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ disk_key,
+ btrfs_root_generation(
+ &root_item));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.datav[0];
+
+ next_stack =
+ btrfsic_stack_frame_alloc();
+ if (NULL == next_stack) {
+ sf->error = -1;
+ btrfsic_release_block_ctx(
+ &sf->
+ next_block_ctx);
+ goto one_stack_frame_backwards;
+ }
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx =
+ &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+ } else if (BTRFS_EXTENT_DATA_KEY == type &&
+ state->include_extent_data) {
+ sf->error = btrfsic_handle_extent_data(
+ state,
+ sf->block,
+ sf->block_ctx,
+ item_offset,
+ force_iodone_flag);
+ if (sf->error)
+ goto one_stack_frame_backwards;
+ }
+
+ goto continue_with_current_leaf_stack_frame;
+ }
+ } else {
+ struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "node %llu level %d items %d"
+ " generation %llu owner %llu\n",
+ sf->block_ctx->start,
+ nodehdr->header.level, sf->nr,
+ btrfs_stack_header_generation(
+ &nodehdr->header),
+ btrfs_stack_header_owner(
+ &nodehdr->header));
+ }
+
+continue_with_current_node_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_key_ptr key_ptr;
+ u32 key_ptr_offset;
+ u64 next_bytenr;
+
+ key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+ (uintptr_t)nodehdr;
+ if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+ sf->block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &key_ptr, key_ptr_offset,
+ sizeof(struct btrfs_key_ptr));
+ next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
+
+ sf->error = btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ &key_ptr.key,
+ btrfs_stack_key_generation(&key_ptr));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.datav[0];
+
+ next_stack = btrfsic_stack_frame_alloc();
+ if (NULL == next_stack) {
+ sf->error = -1;
+ goto one_stack_frame_backwards;
+ }
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx = &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+
+ goto continue_with_current_node_stack_frame;
+ }
+ }
+
+one_stack_frame_backwards:
+ if (NULL != sf->prev) {
+ struct btrfsic_stack_frame *const prev = sf->prev;
+
+ /* the one for the initial block is freed in the caller */
+ btrfsic_release_block_ctx(sf->block_ctx);
+
+ if (sf->error) {
+ prev->error = sf->error;
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto one_stack_frame_backwards;
+ }
+
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto continue_with_new_stack_frame;
+ } else {
+ BUG_ON(&initial_stack_frame != sf);
+ }
+
+ return sf->error;
+}
+
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dstv, u32 offset, size_t len)
+{
+ size_t cur;
+ size_t offset_in_page;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(offset + len > block_ctx->len);
+ offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
+ BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
+ kaddr = block_ctx->datav[i];
+ memcpy(dst, kaddr + offset_in_page, cur);
+
+ dst += cur;
+ len -= cur;
+ offset_in_page = 0;
+ i++;
+ }
+}
+
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation)
+{
+ struct btrfsic_block *next_block = NULL;
+ int ret;
+ struct btrfsic_block_link *l;
+ int did_alloc_block_link;
+ int block_was_created;
+
+ *next_blockp = NULL;
+ if (0 == *num_copiesp) {
+ *num_copiesp =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, state->metablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, *num_copiesp);
+ *mirror_nump = 1;
+ }
+
+ if (*mirror_nump > *num_copiesp)
+ return 0;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+ *mirror_nump);
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
+ next_block_ctx, *mirror_nump);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+ next_bytenr, *mirror_nump);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(state,
+ next_block_ctx, "referenced ",
+ 1, force_iodone_flag,
+ !force_iodone_flag,
+ *mirror_nump,
+ &block_was_created);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+ if (block_was_created) {
+ l = NULL;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ } else {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr))
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block),
+ next_block->logical_bytenr);
+ else
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block));
+ }
+ next_block->logical_bytenr = next_bytenr;
+
+ next_block->mirror_num = *mirror_nump;
+ l = btrfsic_block_link_hashtable_lookup(
+ next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_link_hashtable);
+ }
+
+ next_block->disk_key = *disk_key;
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ did_alloc_block_link = 1;
+ l->block_ref_to = next_block;
+ l->block_ref_from = block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ did_alloc_block_link = 0;
+ if (0 == limit_nesting) {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+ }
+
+ if (limit_nesting > 0 && did_alloc_block_link) {
+ ret = btrfsic_read_block(state, next_block_ctx);
+ if (ret < (int)next_block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: read block @logical %llu failed!\n",
+ next_bytenr);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ *next_blockp = next_block;
+ } else {
+ *next_blockp = NULL;
+ }
+ (*mirror_nump)++;
+
+ return 0;
+}
+
+static int btrfsic_handle_extent_data(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag)
+{
+ int ret;
+ struct btrfs_file_extent_item file_extent_item;
+ u64 file_extent_item_offset;
+ u64 next_bytenr;
+ u64 num_bytes;
+ u64 generation;
+ struct btrfsic_block_link *l;
+
+ file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+ item_offset;
+ if (file_extent_item_offset +
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+ if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+ btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
+ file_extent_item.type,
+ btrfs_stack_file_extent_disk_bytenr(
+ &file_extent_item));
+ return 0;
+ }
+
+ if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ sizeof(struct btrfs_file_extent_item));
+ next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
+ if (btrfs_stack_file_extent_compression(&file_extent_item) ==
+ BTRFS_COMPRESS_NONE) {
+ next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
+ num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
+ } else {
+ num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
+ }
+ generation = btrfs_stack_file_extent_generation(&file_extent_item);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+ " offset = %llu, num_bytes = %llu\n",
+ file_extent_item.type,
+ btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
+ btrfs_stack_file_extent_offset(&file_extent_item),
+ num_bytes);
+ while (num_bytes > 0) {
+ u32 chunk_len;
+ int num_copies;
+ int mirror_num;
+
+ if (num_bytes > state->datablock_size)
+ chunk_len = state->datablock_size;
+ else
+ chunk_len = num_bytes;
+
+ num_copies =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, state->datablock_size);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfsic_block *next_block;
+ int block_was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "btrfsic_handle_extent_data("
+ "mirror_num=%d)\n", mirror_num);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO
+ "\tdisk_bytenr = %llu, num_bytes %u\n",
+ next_bytenr, chunk_len);
+ ret = btrfsic_map_block(state, next_bytenr,
+ chunk_len, &next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ next_bytenr, mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &next_block_ctx,
+ "referenced ",
+ 0,
+ force_iodone_flag,
+ !force_iodone_flag,
+ mirror_num,
+ &block_was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&next_block_ctx);
+ return -1;
+ }
+ if (!block_was_created) {
+ if ((state->print_mask &
+ BTRFSIC_PRINT_MASK_VERBOSE) &&
+ next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ printk(KERN_INFO
+ "Referenced block"
+ " @%llu (%s/%llu/%d)"
+ " found in hash table, D,"
+ " bytenr mismatch"
+ " (!= stored %llu).\n",
+ next_bytenr,
+ next_block_ctx.dev->name,
+ next_block_ctx.dev_bytenr,
+ mirror_num,
+ next_block->logical_bytenr);
+ }
+ next_block->logical_bytenr = next_bytenr;
+ next_block->mirror_num = mirror_num;
+ }
+
+ l = btrfsic_block_link_lookup_or_add(state,
+ &next_block_ctx,
+ next_block, block,
+ generation);
+ btrfsic_release_block_ctx(&next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+
+ next_bytenr += chunk_len;
+ num_bytes -= chunk_len;
+ }
+
+ return 0;
+}
+
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num)
+{
+ int ret;
+ u64 length;
+ struct btrfs_bio *multi = NULL;
+ struct btrfs_device *device;
+
+ length = len;
+ ret = btrfs_map_block(state->root->fs_info, READ,
+ bytenr, &length, &multi, mirror_num);
+
+ if (ret) {
+ block_ctx_out->start = 0;
+ block_ctx_out->dev_bytenr = 0;
+ block_ctx_out->len = 0;
+ block_ctx_out->dev = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ return ret;
+ }
+
+ device = multi->stripes[0].dev;
+ block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+ block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
+
+ kfree(multi);
+ if (NULL == block_ctx_out->dev) {
+ ret = -ENXIO;
+ printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+ }
+
+ return ret;
+}
+
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+ if (block_ctx->mem_to_free) {
+ unsigned int num_pages;
+
+ BUG_ON(!block_ctx->datav);
+ BUG_ON(!block_ctx->pagev);
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ while (num_pages > 0) {
+ num_pages--;
+ if (block_ctx->datav[num_pages]) {
+ kunmap(block_ctx->pagev[num_pages]);
+ block_ctx->datav[num_pages] = NULL;
+ }
+ if (block_ctx->pagev[num_pages]) {
+ __free_page(block_ctx->pagev[num_pages]);
+ block_ctx->pagev[num_pages] = NULL;
+ }
+ }
+
+ kfree(block_ctx->mem_to_free);
+ block_ctx->mem_to_free = NULL;
+ block_ctx->pagev = NULL;
+ block_ctx->datav = NULL;
+ }
+}
+
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx)
+{
+ unsigned int num_pages;
+ unsigned int i;
+ u64 dev_bytenr;
+ int ret;
+
+ BUG_ON(block_ctx->datav);
+ BUG_ON(block_ctx->pagev);
+ BUG_ON(block_ctx->mem_to_free);
+ if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: read_block() with unaligned bytenr %llu\n",
+ block_ctx->dev_bytenr);
+ return -1;
+ }
+
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
+ sizeof(*block_ctx->pagev)) *
+ num_pages, GFP_NOFS);
+ if (!block_ctx->mem_to_free)
+ return -1;
+ block_ctx->datav = block_ctx->mem_to_free;
+ block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->pagev[i] = alloc_page(GFP_NOFS);
+ if (!block_ctx->pagev[i])
+ return -1;
+ }
+
+ dev_bytenr = block_ctx->dev_bytenr;
+ for (i = 0; i < num_pages;) {
+ struct bio *bio;
+ unsigned int j;
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
+ if (!bio) {
+ printk(KERN_INFO
+ "btrfsic: bio_alloc() for %u pages failed!\n",
+ num_pages - i);
+ return -1;
+ }
+ bio->bi_bdev = block_ctx->dev->bdev;
+ bio->bi_iter.bi_sector = dev_bytenr >> 9;
+
+ for (j = i; j < num_pages; j++) {
+ ret = bio_add_page(bio, block_ctx->pagev[j],
+ PAGE_CACHE_SIZE, 0);
+ if (PAGE_CACHE_SIZE != ret)
+ break;
+ }
+ if (j == i) {
+ printk(KERN_INFO
+ "btrfsic: error, failed to add a single page!\n");
+ return -1;
+ }
+ if (submit_bio_wait(READ, bio)) {
+ printk(KERN_INFO
+ "btrfsic: read error at logical %llu dev %s!\n",
+ block_ctx->start, block_ctx->dev->name);
+ bio_put(bio);
+ return -1;
+ }
+ bio_put(bio);
+ dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+ i = j;
+ }
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+ if (!block_ctx->datav[i]) {
+ printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
+ block_ctx->dev->name);
+ return -1;
+ }
+ }
+
+ return block_ctx->len;
+}
+
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+ struct list_head *elem_all;
+
+ BUG_ON(NULL == state);
+
+ printk(KERN_INFO "all_blocks_list:\n");
+ list_for_each(elem_all, &state->all_blocks_list) {
+ const struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *elem_ref_from;
+
+ printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->dev_bytenr, b_all->mirror_num);
+
+ list_for_each(elem_ref_to, &b_all->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " refers %u* to"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->dev_bytenr, b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ }
+
+ list_for_each(elem_ref_from, &b_all->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from,
+ struct btrfsic_block_link,
+ node_ref_from);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " is ref %u* from"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->dev_bytenr, b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ }
+
+ printk(KERN_INFO "\n");
+ }
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ char **datav, unsigned int num_pages)
+{
+ struct btrfs_header *h;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ unsigned int i;
+
+ if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+ return 1; /* not metadata */
+ num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+ h = (struct btrfs_header *)datav[0];
+
+ if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+ return 1;
+
+ for (i = 0; i < num_pages; i++) {
+ u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+ size_t sublen = i ? PAGE_CACHE_SIZE :
+ (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
+
+ crc = btrfs_crc32c(crc, data, sublen);
+ }
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, h->csum, state->csum_size))
+ return 1;
+
+ return 0; /* is metadata */
+}
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw)
+{
+ int is_metadata;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx block_ctx;
+ int ret;
+ struct btrfsic_state *state = dev_state->state;
+ struct block_device *bdev = dev_state->bdev;
+ unsigned int processed_len;
+
+ if (NULL != bio_is_patched)
+ *bio_is_patched = 0;
+
+again:
+ if (num_pages == 0)
+ return;
+
+ processed_len = 0;
+ is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+ num_pages));
+
+ block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+ &state->block_hashtable);
+ if (NULL != block) {
+ u64 bytenr = 0;
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ if (block->is_superblock) {
+ bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
+ mapped_datav[0]);
+ if (num_pages * PAGE_CACHE_SIZE <
+ BTRFS_SUPER_INFO_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ is_metadata = 1;
+ BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+ processed_len = BTRFS_SUPER_INFO_SIZE;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+ printk(KERN_INFO
+ "[before new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ }
+ if (is_metadata) {
+ if (!block->is_superblock) {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->metablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->metablock_size;
+ bytenr = btrfs_stack_header_bytenr(
+ (struct btrfs_header *)
+ mapped_datav[0]);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+ dev_state,
+ dev_bytenr);
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (block->logical_bytenr != bytenr &&
+ !(!block->is_metadata &&
+ block->logical_bytenr == 0))
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+ bytenr, dev_state->name,
+ dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state,
+ block),
+ block->logical_bytenr);
+ else
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+ bytenr, dev_state->name,
+ dev_bytenr, block->mirror_num,
+ btrfsic_get_block_type(state,
+ block));
+ }
+ block->logical_bytenr = bytenr;
+ } else {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->datablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->datablock_size;
+ bytenr = block->logical_bytenr;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ bytenr, dev_state->name, dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "ref_to_list: %cE, ref_from_list: %cE\n",
+ list_empty(&block->ref_to_list) ? ' ' : '!',
+ list_empty(&block->ref_from_list) ? ' ' : '!');
+ if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), old(gen=%llu,"
+ " objectid=%llu, type=%d, offset=%llu),"
+ " new(gen=%llu),"
+ " which is referenced by most recent superblock"
+ " (superblockgen=%llu)!\n",
+ btrfsic_get_block_type(state, block), bytenr,
+ dev_state->name, dev_bytenr, block->mirror_num,
+ block->generation,
+ btrfs_disk_key_objectid(&block->disk_key),
+ block->disk_key.type,
+ btrfs_disk_key_offset(&block->disk_key),
+ btrfs_stack_header_generation(
+ (struct btrfs_header *) mapped_datav[0]),
+ state->max_superblock_generation);
+ btrfsic_dump_tree(state);
+ }
+
+ if (!block->is_iodone && !block->never_written) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, block), bytenr,
+ dev_state->name, dev_bytenr, block->mirror_num,
+ block->generation,
+ btrfs_stack_header_generation(
+ (struct btrfs_header *)
+ mapped_datav[0]));
+ /* it would not be safe to go on */
+ btrfsic_dump_tree(state);
+ goto continue_loop;
+ }
+
+ /*
+ * Clear all references of this block. Do not free
+ * the block itself even if is not referenced anymore
+ * because it still carries valueable information
+ * like whether it was ever written and IO completed.
+ */
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &block->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+ l->ref_cnt--;
+ if (0 == l->ref_cnt) {
+ list_del(&l->node_ref_to);
+ list_del(&l->node_ref_from);
+ btrfsic_block_link_hashtable_remove(l);
+ btrfsic_block_link_free(l);
+ }
+ }
+
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
+
+ if (is_metadata || state->include_extent_data) {
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private =
+ bio->bi_private;
+ block->orig_bio_bh_end_io.bio =
+ bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.
+ bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ }
+
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (is_metadata) {
+ block->logical_bytenr = bytenr;
+ block->is_metadata = 1;
+ if (block->is_superblock) {
+ BUG_ON(PAGE_CACHE_SIZE !=
+ BTRFS_SUPER_INFO_SIZE);
+ ret = btrfsic_process_written_superblock(
+ state,
+ block,
+ (struct btrfs_super_block *)
+ mapped_datav[0]);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+ printk(KERN_INFO
+ "[after new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ } else {
+ block->mirror_num = 0; /* unknown */
+ ret = btrfsic_process_metablock(
+ state,
+ block,
+ &block_ctx,
+ 0, 0);
+ }
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: btrfsic_process_metablock"
+ "(root @%llu) failed!\n",
+ dev_bytenr);
+ } else {
+ block->is_metadata = 0;
+ block->mirror_num = 0; /* unknown */
+ block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ if (!state->include_extent_data
+ && list_empty(&block->ref_from_list)) {
+ /*
+ * disk block is overwritten with extent
+ * data (not meta data) and we are configured
+ * to not include extent data: take the
+ * chance and free the block's memory
+ */
+ btrfsic_block_hashtable_remove(block);
+ list_del(&block->all_blocks_node);
+ btrfsic_block_free(block);
+ }
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ } else {
+ /* block has not been found in hash table */
+ u64 bytenr;
+
+ if (!is_metadata) {
+ processed_len = state->datablock_size;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "Written block (%s/%llu/?)"
+ " !found in hash table, D.\n",
+ dev_state->name, dev_bytenr);
+ if (!state->include_extent_data) {
+ /* ignore that written D block */
+ goto continue_loop;
+ }
+
+ /* this is getting ugly for the
+ * include_extent_data case... */
+ bytenr = 0; /* unknown */
+ } else {
+ processed_len = state->metablock_size;
+ bytenr = btrfs_stack_header_bytenr(
+ (struct btrfs_header *)
+ mapped_datav[0]);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+ dev_bytenr);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/?)"
+ " !found in hash table, M.\n",
+ bytenr, dev_state->name, dev_bytenr);
+ }
+
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&block_ctx);
+ goto continue_loop;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = dev_bytenr;
+ block->logical_bytenr = bytenr;
+ block->is_metadata = is_metadata;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->mirror_num = 0; /* unknown */
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New written %c-block @%llu (%s/%llu/%d)\n",
+ is_metadata ? 'M' : 'D',
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+ if (is_metadata) {
+ ret = btrfsic_process_metablock(state, block,
+ &block_ctx, 0, 0);
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: process_metablock(root @%llu)"
+ " failed!\n",
+ dev_bytenr);
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+continue_loop:
+ BUG_ON(!processed_len);
+ dev_bytenr += processed_len;
+ mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
+ num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+ goto again;
+}
+
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ int iodone_w_error;
+
+ /* mutex is not held! This is not save if IO is not yet completed
+ * on umount */
+ iodone_w_error = 0;
+ if (bio_error_status)
+ iodone_w_error = 1;
+
+ BUG_ON(NULL == block);
+ bp->bi_private = block->orig_bio_bh_private;
+ bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+
+ do {
+ struct btrfsic_block *next_block;
+ struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ bio_error_status,
+ btrfsic_get_block_type(dev_state->state, block),
+ block->logical_bytenr, dev_state->name,
+ block->dev_bytenr, block->mirror_num);
+ next_block = block->next_in_same_bio;
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io() new %s flush_gen=%llu\n",
+ dev_state->name,
+ dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is
+ * on disk */
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ block = next_block;
+ } while (NULL != block);
+
+ bp->bi_end_io(bp, bio_error_status);
+}
+
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
+ int iodone_w_error = !uptodate;
+ struct btrfsic_dev_state *dev_state;
+
+ BUG_ON(NULL == block);
+ dev_state = block->dev_state;
+ if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+ iodone_w_error,
+ btrfsic_get_block_type(dev_state->state, block),
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num);
+
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io() new %s flush_gen=%llu\n",
+ dev_state->name, dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is on disk */
+
+ bh->b_private = block->orig_bio_bh_private;
+ bh->b_end_io = block->orig_bio_bh_end_io.bh;
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ bh->b_end_io(bh, uptodate);
+}
+
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const superblock,
+ struct btrfs_super_block *const super_hdr)
+{
+ int pass;
+
+ superblock->generation = btrfs_super_generation(super_hdr);
+ if (!(superblock->generation > state->max_superblock_generation ||
+ 0 == state->max_superblock_generation)) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: superblock @%llu (%s/%llu/%d)"
+ " with old gen %llu <= %llu\n",
+ superblock->logical_bytenr,
+ superblock->dev_state->name,
+ superblock->dev_bytenr, superblock->mirror_num,
+ btrfs_super_generation(super_hdr),
+ state->max_superblock_generation);
+ } else {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+ " with new gen %llu > %llu\n",
+ superblock->logical_bytenr,
+ superblock->dev_state->name,
+ superblock->dev_bytenr, superblock->mirror_num,
+ btrfs_super_generation(super_hdr),
+ state->max_superblock_generation);
+
+ state->max_superblock_generation =
+ btrfs_super_generation(super_hdr);
+ state->latest_superblock = superblock;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ int ret;
+ u64 next_bytenr;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key = {0};
+
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_ROOT_ITEM_KEY);
+ btrfs_set_disk_key_objectid(&tmp_disk_key, 0);
+
+ switch (pass) {
+ case 0:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "root ";
+ next_bytenr = btrfs_super_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n", next_bytenr);
+ break;
+ case 1:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n", next_bytenr);
+ break;
+ case 2:
+ btrfs_set_disk_key_objectid(&tmp_disk_key,
+ BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "log ";
+ next_bytenr = btrfs_super_log_root(super_hdr);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n", next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(state->root->fs_info,
+ next_bytenr, BTRFS_SUPER_INFO_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ int was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_process_written_superblock("
+ "mirror_num=%d)\n", mirror_num);
+ ret = btrfsic_map_block(state, next_bytenr,
+ BTRFS_SUPER_INFO_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ next_bytenr, mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ additional_string,
+ 1, 0, 1,
+ mirror_num,
+ &was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ if (was_created)
+ next_block->generation =
+ BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ next_block,
+ superblock,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+ }
+
+ if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
+ btrfsic_dump_tree(state);
+
+ return 0;
+}
+
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_to;
+ int ret = 0;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /*
+ * Note that this situation can happen and does not
+ * indicate an error in regular cases. It happens
+ * when disk blocks are freed and later reused.
+ * The check-integrity module is not aware of any
+ * block free operations, it just recognizes block
+ * write operations. Therefore it keeps the linkage
+ * information for a block until a block is
+ * rewritten. This can temporarily cause incorrect
+ * and even circular linkage informations. This
+ * causes no harm unless such blocks are referenced
+ * by the most recent super block.
+ */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 1).\n");
+
+ return ret;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack
+ * space is very small and the max recursion depth is limited.
+ */
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " %u* refers to %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ if (l->block_ref_to->never_written) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is never written!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (!l->block_ref_to->is_iodone) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (l->block_ref_to->iodone_w_error) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which has write error!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (l->parent_generation !=
+ l->block_ref_to->generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->parent_generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->block_ref_to->generation) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " with generation %llu !="
+ " parent generation %llu!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num,
+ l->block_ref_to->generation,
+ l->parent_generation);
+ ret = -1;
+ } else if (l->block_ref_to->flush_gen >
+ l->block_ref_to->dev_state->last_flush_gen) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not flushed out of disk's write cache"
+ " (block flush_gen=%llu,"
+ " dev->flush_gen=%llu)!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num, block->flush_gen,
+ l->block_ref_to->dev_state->last_flush_gen);
+ ret = -1;
+ } else if (-1 == btrfsic_check_all_ref_blocks(state,
+ l->block_ref_to,
+ recursion_level +
+ 1)) {
+ ret = -1;
+ }
+ }
+
+ return ret;
+}
+
+static int btrfsic_is_block_ref_by_superblock(
+ const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_from;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /* refer to comment at "abort cyclic linkage (case 1)" */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 2).\n");
+
+ return 0;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ list_for_each(elem_ref_from, &block->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from, struct btrfsic_block_link,
+ node_ref_from);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ if (l->block_ref_from->is_superblock &&
+ state->latest_superblock->dev_bytenr ==
+ l->block_ref_from->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev ==
+ l->block_ref_from->dev_state->bdev)
+ return 1;
+ else if (btrfsic_is_block_ref_by_superblock(state,
+ l->block_ref_from,
+ recursion_level +
+ 1))
+ return 1;
+ }
+
+ return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Add %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Rem %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block)
+{
+ if (block->is_superblock &&
+ state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+ return 'S';
+ else if (block->is_superblock)
+ return 's';
+ else if (block->is_metadata)
+ return 'M';
+ else
+ return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+ btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level)
+{
+ struct list_head *elem_ref_to;
+ int indent_add;
+ static char buf[80];
+ int cursor_position;
+
+ /*
+ * Should better fill an on-stack buffer with a complete line and
+ * dump it at once when it is time to print a newline character.
+ */
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, block->dev_state->name,
+ block->dev_bytenr, block->mirror_num);
+ if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ return;
+ }
+ printk(buf);
+ indent_level += indent_add;
+ if (list_empty(&block->ref_to_list)) {
+ printk("\n");
+ return;
+ }
+ if (block->mirror_num > 1 &&
+ !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+ printk(" [...]\n");
+ return;
+ }
+
+ cursor_position = indent_level;
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ while (cursor_position < indent_level) {
+ printk(" ");
+ cursor_position++;
+ }
+ if (l->ref_cnt > 1)
+ indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+ else
+ indent_add = sprintf(buf, " --> ");
+ if (indent_level + indent_add >
+ BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ cursor_position = 0;
+ continue;
+ }
+
+ printk(buf);
+
+ btrfsic_dump_tree_sub(state, l->block_ref_to,
+ indent_level + indent_add);
+ cursor_position = 0;
+ }
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation)
+{
+ struct btrfsic_block_link *l;
+
+ l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ from_block->dev_state->bdev,
+ from_block->dev_bytenr,
+ &state->block_link_hashtable);
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc" " failed!\n");
+ return NULL;
+ }
+
+ l->block_ref_to = next_block;
+ l->block_ref_from = from_block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &from_block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+
+ return l;
+}
+
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created)
+{
+ struct btrfsic_block *block;
+
+ block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == block) {
+ struct btrfsic_dev_state *dev_state;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return NULL;
+ }
+ dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+ if (NULL == dev_state) {
+ printk(KERN_INFO
+ "btrfsic: error, lookup dev_state failed!\n");
+ btrfsic_block_free(block);
+ return NULL;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = block_ctx->dev_bytenr;
+ block->logical_bytenr = block_ctx->start;
+ block->is_metadata = is_metadata;
+ block->is_iodone = is_iodone;
+ block->never_written = never_written;
+ block->mirror_num = mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New %s%c-block @%llu (%s/%llu/%d)\n",
+ additional_string,
+ btrfsic_get_block_type(state, block),
+ block->logical_bytenr, dev_state->name,
+ block->dev_bytenr, mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+ if (NULL != was_created)
+ *was_created = 1;
+ } else {
+ if (NULL != was_created)
+ *was_created = 0;
+ }
+
+ return block;
+}
+
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr)
+{
+ int num_copies;
+ int mirror_num;
+ int ret;
+ struct btrfsic_block_data_ctx block_ctx;
+ int match = 0;
+
+ num_copies = btrfs_num_copies(state->root->fs_info,
+ bytenr, state->metablock_size);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr, state->metablock_size,
+ &block_ctx, mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(logical @%llu,"
+ " mirror %d) failed!\n",
+ bytenr, mirror_num);
+ continue;
+ }
+
+ if (dev_state->bdev == block_ctx.dev->bdev &&
+ dev_bytenr == block_ctx.dev_bytenr) {
+ match++;
+ btrfsic_release_block_ctx(&block_ctx);
+ break;
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+ if (WARN_ON(!match)) {
+ printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
+ " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+ " phys_bytenr=%llu)!\n",
+ bytenr, dev_state->name, dev_bytenr);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr,
+ state->metablock_size,
+ &block_ctx, mirror_num);
+ if (ret)
+ continue;
+
+ printk(KERN_INFO "Read logical bytenr @%llu maps to"
+ " (%s/%llu/%d)\n",
+ bytenr, block_ctx.dev->name,
+ block_ctx.dev_bytenr, mirror_num);
+ }
+ }
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = btrfsic_dev_state_hashtable_lookup(bdev,
+ &btrfsic_dev_state_hashtable);
+ return ds;
+}
+
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return submit_bh(rw, bh);
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bh() might also be called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+ /* Only called to write the superblock (incl. FLUSH/FUA) */
+ if (NULL != dev_state &&
+ (rw & WRITE) && bh->b_size > 0) {
+ u64 dev_bytenr;
+
+ dev_bytenr = 4096 * bh->b_blocknr;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu),"
+ " size=%zu, data=%p, bdev=%p)\n",
+ rw, (unsigned long long)bh->b_blocknr,
+ dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ &bh->b_data, 1, NULL,
+ NULL, bh, rw);
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
+ rw, bh->b_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bh(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ }
+ }
+ mutex_unlock(&btrfsic_mutex);
+ return submit_bh(rw, bh);
+}
+
+static void __btrfsic_submit_bio(int rw, struct bio *bio)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bio() is also called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+ if (NULL != dev_state &&
+ (rw & WRITE) && NULL != bio->bi_io_vec) {
+ unsigned int i;
+ u64 dev_bytenr;
+ u64 cur_bytenr;
+ int bio_is_patched;
+ char **mapped_datav;
+
+ dev_bytenr = 512 * bio->bi_iter.bi_sector;
+ bio_is_patched = 0;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x, bi_vcnt=%u,"
+ " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+ rw, bio->bi_vcnt,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ dev_bytenr, bio->bi_bdev);
+
+ mapped_datav = kmalloc_array(bio->bi_vcnt,
+ sizeof(*mapped_datav), GFP_NOFS);
+ if (!mapped_datav)
+ goto leave;
+ cur_bytenr = dev_bytenr;
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+ mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
+ if (!mapped_datav[i]) {
+ while (i > 0) {
+ i--;
+ kunmap(bio->bi_io_vec[i].bv_page);
+ }
+ kfree(mapped_datav);
+ goto leave;
+ }
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+ printk(KERN_INFO
+ "#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bio->bi_io_vec[i].bv_len,
+ bio->bi_io_vec[i].bv_offset);
+ cur_bytenr += bio->bi_io_vec[i].bv_len;
+ }
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ mapped_datav, bio->bi_vcnt,
+ bio, &bio_is_patched,
+ NULL, rw);
+ while (i > 0) {
+ i--;
+ kunmap(bio->bi_io_vec[i].bv_page);
+ }
+ kfree(mapped_datav);
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
+ rw, bio->bi_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bio(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ }
+ }
+leave:
+ mutex_unlock(&btrfsic_mutex);
+}
+
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+ __btrfsic_submit_bio(rw, bio);
+ submit_bio(rw, bio);
+}
+
+int btrfsic_submit_bio_wait(int rw, struct bio *bio)
+{
+ __btrfsic_submit_bio(rw, bio);
+ return submit_bio_wait(rw, bio);
+}
+
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask)
+{
+ int ret;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->nodesize, PAGE_CACHE_SIZE);
+ return -1;
+ }
+ if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->sectorsize, PAGE_CACHE_SIZE);
+ return -1;
+ }
+ state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!state) {
+ state = vzalloc(sizeof(*state));
+ if (!state) {
+ printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
+ return -1;
+ }
+ }
+
+ if (!btrfsic_is_initialized) {
+ mutex_init(&btrfsic_mutex);
+ btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+ btrfsic_is_initialized = 1;
+ }
+ mutex_lock(&btrfsic_mutex);
+ state->root = root;
+ state->print_mask = print_mask;
+ state->include_extent_data = including_extent_data;
+ state->csum_size = 0;
+ state->metablock_size = root->nodesize;
+ state->datablock_size = root->sectorsize;
+ INIT_LIST_HEAD(&state->all_blocks_list);
+ btrfsic_block_hashtable_init(&state->block_hashtable);
+ btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+ state->max_superblock_generation = 0;
+ state->latest_superblock = NULL;
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+ char *p;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_alloc();
+ if (NULL == ds) {
+ printk(KERN_INFO
+ "btrfs check-integrity: kmalloc() failed!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return -1;
+ }
+ ds->bdev = device->bdev;
+ ds->state = state;
+ bdevname(ds->bdev, ds->name);
+ ds->name[BDEVNAME_SIZE - 1] = '\0';
+ for (p = ds->name; *p != '\0'; p++);
+ while (p > ds->name && *p != '/')
+ p--;
+ if (*p == '/')
+ p++;
+ strlcpy(ds->name, p, sizeof(ds->name));
+ btrfsic_dev_state_hashtable_add(ds,
+ &btrfsic_dev_state_hashtable);
+ }
+
+ ret = btrfsic_process_superblock(state, fs_devices);
+ if (0 != ret) {
+ mutex_unlock(&btrfsic_mutex);
+ btrfsic_unmount(root, fs_devices);
+ return ret;
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+ btrfsic_dump_database(state);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+ btrfsic_dump_tree(state);
+
+ mutex_unlock(&btrfsic_mutex);
+ return 0;
+}
+
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices)
+{
+ struct list_head *elem_all;
+ struct list_head *tmp_all;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ mutex_lock(&btrfsic_mutex);
+
+ state = NULL;
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_hashtable_lookup(
+ device->bdev,
+ &btrfsic_dev_state_hashtable);
+ if (NULL != ds) {
+ state = ds->state;
+ btrfsic_dev_state_hashtable_remove(ds);
+ btrfsic_dev_state_free(ds);
+ }
+ }
+
+ if (NULL == state) {
+ printk(KERN_INFO
+ "btrfsic: error, cannot find state information"
+ " on umount!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return;
+ }
+
+ /*
+ * Don't care about keeping the lists' state up to date,
+ * just free all memory that was allocated dynamically.
+ * Free the blocks and the block_links.
+ */
+ list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+ struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &b_all->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+
+ l->ref_cnt--;
+ if (0 == l->ref_cnt)
+ btrfsic_block_link_free(l);
+ }
+
+ if (b_all->is_iodone || b_all->never_written)
+ btrfsic_block_free(b_all);
+ else
+ printk(KERN_INFO "btrfs: attempt to free %c-block"
+ " @%llu (%s/%llu/%d) on umount which is"
+ " not yet iodone!\n",
+ btrfsic_get_block_type(state, b_all),
+ b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->dev_bytenr, b_all->mirror_num);
+ }
+
+ mutex_unlock(&btrfsic_mutex);
+
+ kvfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000..13b8566c9
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+int btrfsic_submit_bio_wait(int rw, struct bio *bio);
+#else
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#define btrfsic_submit_bio_wait submit_bio_wait
+#endif
+
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices);
+
+#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000..ce62324c7
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,1091 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/slab.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+ /* number of bios pending for this compressed extent */
+ atomic_t pending_bios;
+
+ /* the pages with the compressed data on them */
+ struct page **compressed_pages;
+
+ /* inode that owns this data */
+ struct inode *inode;
+
+ /* starting offset in the inode for our pages */
+ u64 start;
+
+ /* number of bytes in the inode we're working on */
+ unsigned long len;
+
+ /* number of bytes on disk */
+ unsigned long compressed_len;
+
+ /* the compression algorithm for this bio */
+ int compress_type;
+
+ /* number of compressed pages in the array */
+ unsigned long nr_pages;
+
+ /* IO errors */
+ int errors;
+ int mirror_num;
+
+ /* for reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+
+ /*
+ * the start of a variable length array of checksums only
+ * used by reads
+ */
+ u32 sums;
+};
+
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+ u64 disk_start, struct bio_vec *bvec,
+ int vcnt, size_t srclen);
+
+static inline int compressed_bio_size(struct btrfs_root *root,
+ unsigned long disk_size)
+{
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+ return sizeof(struct compressed_bio) +
+ (DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
+}
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+ u64 first_byte, gfp_t gfp_flags)
+{
+ int nr_vecs;
+
+ nr_vecs = bio_get_nr_vecs(bdev);
+ return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
+}
+
+static int check_compressed_csum(struct inode *inode,
+ struct compressed_bio *cb,
+ u64 disk_start)
+{
+ int ret;
+ struct page *page;
+ unsigned long i;
+ char *kaddr;
+ u32 csum;
+ u32 *cb_sum = &cb->sums;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ return 0;
+
+ for (i = 0; i < cb->nr_pages; i++) {
+ page = cb->compressed_pages[i];
+ csum = ~(u32)0;
+
+ kaddr = kmap_atomic(page);
+ csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE);
+ btrfs_csum_final(csum, (char *)&csum);
+ kunmap_atomic(kaddr);
+
+ if (csum != *cb_sum) {
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu extent %llu csum %u wanted %u mirror %d",
+ btrfs_ino(inode), disk_start, csum, *cb_sum,
+ cb->mirror_num);
+ ret = -EIO;
+ goto fail;
+ }
+ cb_sum++;
+
+ }
+ ret = 0;
+fail:
+ return ret;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+ int ret;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ inode = cb->inode;
+ ret = check_compressed_csum(inode, cb,
+ (u64)bio->bi_iter.bi_sector << 9);
+ if (ret)
+ goto csum_failed;
+
+ /* ok, we're the last bio for this extent, lets start
+ * the decompression.
+ */
+ ret = btrfs_decompress_biovec(cb->compress_type,
+ cb->compressed_pages,
+ cb->start,
+ cb->orig_bio->bi_io_vec,
+ cb->orig_bio->bi_vcnt,
+ cb->compressed_len);
+csum_failed:
+ if (ret)
+ cb->errors = 1;
+
+ /* release the compressed pages */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* do io completion on the original bio */
+ if (cb->errors) {
+ bio_io_error(cb->orig_bio);
+ } else {
+ int i;
+ struct bio_vec *bvec;
+
+ /*
+ * we have verified the checksum already, set page
+ * checked so the end_io handlers know about it
+ */
+ bio_for_each_segment_all(bvec, cb->orig_bio, i)
+ SetPageChecked(bvec->bv_page);
+
+ bio_endio(cb->orig_bio, 0);
+ }
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline void end_compressed_writeback(struct inode *inode,
+ const struct compressed_bio *cb)
+{
+ unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
+ struct page *pages[16];
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int ret;
+
+ if (cb->errors)
+ mapping_set_error(inode->i_mapping, -EIO);
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long,
+ nr_pages, ARRAY_SIZE(pages)), pages);
+ if (ret == 0) {
+ nr_pages -= 1;
+ index += 1;
+ continue;
+ }
+ for (i = 0; i < ret; i++) {
+ if (cb->errors)
+ SetPageError(pages[i]);
+ end_page_writeback(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ }
+ /* the inode may be gone now */
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+ struct extent_io_tree *tree;
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ /* ok, we're the last bio for this extent, step one is to
+ * call back into the FS and do all the end_io operations
+ */
+ inode = cb->inode;
+ tree = &BTRFS_I(inode)->io_tree;
+ cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
+ tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+ cb->start,
+ cb->start + cb->len - 1,
+ NULL,
+ err ? 0 : 1);
+ cb->compressed_pages[0]->mapping = NULL;
+
+ end_compressed_writeback(inode, cb);
+ /* note, our inode could be gone now */
+
+ /*
+ * release the compressed pages, these came from alloc_page and
+ * are not attached to the inode at all
+ */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages)
+{
+ struct bio *bio = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct compressed_bio *cb;
+ unsigned long bytes_left;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int pg_index = 0;
+ struct page *page;
+ u64 first_byte = disk_start;
+ struct block_device *bdev;
+ int ret;
+ int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+ cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ if (!cb)
+ return -ENOMEM;
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+ cb->start = start;
+ cb->len = len;
+ cb->mirror_num = 0;
+ cb->compressed_pages = compressed_pages;
+ cb->compressed_len = compressed_len;
+ cb->orig_bio = NULL;
+ cb->nr_pages = nr_pages;
+
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ if (!bio) {
+ kfree(cb);
+ return -ENOMEM;
+ }
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ atomic_inc(&cb->pending_bios);
+
+ /* create and submit bios for the compressed pages */
+ bytes_left = compressed_len;
+ for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
+ page = compressed_pages[pg_index];
+ page->mapping = inode->i_mapping;
+ if (bio->bi_iter.bi_size)
+ ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
+ PAGE_CACHE_SIZE,
+ bio, 0);
+ else
+ ret = 0;
+
+ page->mapping = NULL;
+ if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(bio);
+
+ /*
+ * inc the count before we submit the bio so
+ * we know the end IO handler won't happen before
+ * we inc the count. Otherwise, the cb might get
+ * freed before we're done setting it up
+ */
+ atomic_inc(&cb->pending_bios);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DATA);
+ BUG_ON(ret); /* -ENOMEM */
+
+ if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio,
+ start, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
+
+ bio_put(bio);
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ BUG_ON(!bio);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ if (bytes_left < PAGE_CACHE_SIZE) {
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "bytes left %lu compress len %lu nr %lu",
+ bytes_left, cb->compressed_len, cb->nr_pages);
+ }
+ bytes_left -= PAGE_CACHE_SIZE;
+ first_byte += PAGE_CACHE_SIZE;
+ cond_resched();
+ }
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ BUG_ON(ret); /* -ENOMEM */
+
+ if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret); /* -ENOMEM */
+
+ bio_put(bio);
+ return 0;
+}
+
+static noinline int add_ra_bio_pages(struct inode *inode,
+ u64 compressed_end,
+ struct compressed_bio *cb)
+{
+ unsigned long end_index;
+ unsigned long pg_index;
+ u64 last_offset;
+ u64 isize = i_size_read(inode);
+ int ret;
+ struct page *page;
+ unsigned long nr_pages = 0;
+ struct extent_map *em;
+ struct address_space *mapping = inode->i_mapping;
+ struct extent_map_tree *em_tree;
+ struct extent_io_tree *tree;
+ u64 end;
+ int misses = 0;
+
+ page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
+ last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ tree = &BTRFS_I(inode)->io_tree;
+
+ if (isize == 0)
+ return 0;
+
+ end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+ while (last_offset < compressed_end) {
+ pg_index = last_offset >> PAGE_CACHE_SHIFT;
+
+ if (pg_index > end_index)
+ break;
+
+ rcu_read_lock();
+ page = radix_tree_lookup(&mapping->page_tree, pg_index);
+ rcu_read_unlock();
+ if (page && !radix_tree_exceptional_entry(page)) {
+ misses++;
+ if (misses > 4)
+ break;
+ goto next;
+ }
+
+ page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+ ~__GFP_FS);
+ if (!page)
+ break;
+
+ if (add_to_page_cache_lru(page, mapping, pg_index,
+ GFP_NOFS)) {
+ page_cache_release(page);
+ goto next;
+ }
+
+ end = last_offset + PAGE_CACHE_SIZE - 1;
+ /*
+ * at this point, we have a locked page in the page cache
+ * for these bytes in the file. But, we have to make
+ * sure they map to this compressed extent on disk.
+ */
+ set_page_extent_mapped(page);
+ lock_extent(tree, last_offset, end);
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, last_offset,
+ PAGE_CACHE_SIZE);
+ read_unlock(&em_tree->lock);
+
+ if (!em || last_offset < em->start ||
+ (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+ (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
+ free_extent_map(em);
+ unlock_extent(tree, last_offset, end);
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+ free_extent_map(em);
+
+ if (page->index == end_index) {
+ char *userpage;
+ size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+
+ if (zero_offset) {
+ int zeros;
+ zeros = PAGE_CACHE_SIZE - zero_offset;
+ userpage = kmap_atomic(page);
+ memset(userpage + zero_offset, 0, zeros);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage);
+ }
+ }
+
+ ret = bio_add_page(cb->orig_bio, page,
+ PAGE_CACHE_SIZE, 0);
+
+ if (ret == PAGE_CACHE_SIZE) {
+ nr_pages++;
+ page_cache_release(page);
+ } else {
+ unlock_extent(tree, last_offset, end);
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+next:
+ last_offset += PAGE_CACHE_SIZE;
+ }
+ return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it. We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_iter.bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *em_tree;
+ struct compressed_bio *cb;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ unsigned long compressed_len;
+ unsigned long nr_pages;
+ unsigned long pg_index;
+ struct page *page;
+ struct block_device *bdev;
+ struct bio *comp_bio;
+ u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
+ u64 em_len;
+ u64 em_start;
+ struct extent_map *em;
+ int ret = -ENOMEM;
+ int faili = 0;
+ u32 *sums;
+
+ tree = &BTRFS_I(inode)->io_tree;
+ em_tree = &BTRFS_I(inode)->extent_tree;
+
+ /* we need the actual starting offset of this extent in the file */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree,
+ page_offset(bio->bi_io_vec->bv_page),
+ PAGE_CACHE_SIZE);
+ read_unlock(&em_tree->lock);
+ if (!em)
+ return -EIO;
+
+ compressed_len = em->block_len;
+ cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ if (!cb)
+ goto out;
+
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+ cb->mirror_num = mirror_num;
+ sums = &cb->sums;
+
+ cb->start = em->orig_start;
+ em_len = em->len;
+ em_start = em->start;
+
+ free_extent_map(em);
+ em = NULL;
+
+ cb->len = uncompressed_len;
+ cb->compressed_len = compressed_len;
+ cb->compress_type = extent_compress_type(bio_flags);
+ cb->orig_bio = bio;
+
+ nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
+ cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
+ GFP_NOFS);
+ if (!cb->compressed_pages)
+ goto fail1;
+
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+ cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
+ __GFP_HIGHMEM);
+ if (!cb->compressed_pages[pg_index]) {
+ faili = pg_index - 1;
+ ret = -ENOMEM;
+ goto fail2;
+ }
+ }
+ faili = nr_pages - 1;
+ cb->nr_pages = nr_pages;
+
+ /* In the parent-locked case, we only locked the range we are
+ * interested in. In all other cases, we can opportunistically
+ * cache decompressed data that goes beyond the requested range. */
+ if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
+ add_ra_bio_pages(inode, em_start + em_len, cb);
+
+ /* include any pages we added in add_ra-bio_pages */
+ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ cb->len = uncompressed_len;
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+ if (!comp_bio)
+ goto fail2;
+ comp_bio->bi_private = cb;
+ comp_bio->bi_end_io = end_compressed_bio_read;
+ atomic_inc(&cb->pending_bios);
+
+ for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+ page = cb->compressed_pages[pg_index];
+ page->mapping = inode->i_mapping;
+ page->index = em_start >> PAGE_CACHE_SHIFT;
+
+ if (comp_bio->bi_iter.bi_size)
+ ret = tree->ops->merge_bio_hook(READ, page, 0,
+ PAGE_CACHE_SIZE,
+ comp_bio, 0);
+ else
+ ret = 0;
+
+ page->mapping = NULL;
+ if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
+ BUG_ON(ret); /* -ENOMEM */
+
+ /*
+ * inc the count before we submit the bio so
+ * we know the end IO handler won't happen before
+ * we inc the count. Otherwise, the cb might get
+ * freed before we're done setting it up
+ */
+ atomic_inc(&cb->pending_bios);
+
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = btrfs_lookup_bio_sums(root, inode,
+ comp_bio, sums);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+ sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+ root->sectorsize);
+
+ ret = btrfs_map_bio(root, READ, comp_bio,
+ mirror_num, 0);
+ if (ret)
+ bio_endio(comp_bio, ret);
+
+ bio_put(comp_bio);
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+ GFP_NOFS);
+ BUG_ON(!comp_bio);
+ comp_bio->bi_private = cb;
+ comp_bio->bi_end_io = end_compressed_bio_read;
+
+ bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ cur_disk_byte += PAGE_CACHE_SIZE;
+ }
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
+ BUG_ON(ret); /* -ENOMEM */
+
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+
+ ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
+ if (ret)
+ bio_endio(comp_bio, ret);
+
+ bio_put(comp_bio);
+ return 0;
+
+fail2:
+ while (faili >= 0) {
+ __free_page(cb->compressed_pages[faili]);
+ faili--;
+ }
+
+ kfree(cb->compressed_pages);
+fail1:
+ kfree(cb);
+out:
+ free_extent_map(em);
+ return ret;
+}
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+ &btrfs_zlib_compress,
+ &btrfs_lzo_compress,
+};
+
+void __init btrfs_init_compress(void)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+ INIT_LIST_HEAD(&comp_idle_workspace[i]);
+ spin_lock_init(&comp_workspace_lock[i]);
+ atomic_set(&comp_alloc_workspace[i], 0);
+ init_waitqueue_head(&comp_workspace_wait[i]);
+ }
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+ struct list_head *workspace;
+ int cpus = num_online_cpus();
+ int idx = type - 1;
+
+ struct list_head *idle_workspace = &comp_idle_workspace[idx];
+ spinlock_t *workspace_lock = &comp_workspace_lock[idx];
+ atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
+ wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
+ int *num_workspace = &comp_num_workspace[idx];
+again:
+ spin_lock(workspace_lock);
+ if (!list_empty(idle_workspace)) {
+ workspace = idle_workspace->next;
+ list_del(workspace);
+ (*num_workspace)--;
+ spin_unlock(workspace_lock);
+ return workspace;
+
+ }
+ if (atomic_read(alloc_workspace) > cpus) {
+ DEFINE_WAIT(wait);
+
+ spin_unlock(workspace_lock);
+ prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+ schedule();
+ finish_wait(workspace_wait, &wait);
+ goto again;
+ }
+ atomic_inc(alloc_workspace);
+ spin_unlock(workspace_lock);
+
+ workspace = btrfs_compress_op[idx]->alloc_workspace();
+ if (IS_ERR(workspace)) {
+ atomic_dec(alloc_workspace);
+ wake_up(workspace_wait);
+ }
+ return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+ int idx = type - 1;
+ struct list_head *idle_workspace = &comp_idle_workspace[idx];
+ spinlock_t *workspace_lock = &comp_workspace_lock[idx];
+ atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
+ wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
+ int *num_workspace = &comp_num_workspace[idx];
+
+ spin_lock(workspace_lock);
+ if (*num_workspace < num_online_cpus()) {
+ list_add(workspace, idle_workspace);
+ (*num_workspace)++;
+ spin_unlock(workspace_lock);
+ goto wake;
+ }
+ spin_unlock(workspace_lock);
+
+ btrfs_compress_op[idx]->free_workspace(workspace);
+ atomic_dec(alloc_workspace);
+wake:
+ smp_mb();
+ if (waitqueue_active(workspace_wait))
+ wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+ struct list_head *workspace;
+ int i;
+
+ for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+ while (!list_empty(&comp_idle_workspace[i])) {
+ workspace = comp_idle_workspace[i].next;
+ list_del(workspace);
+ btrfs_compress_op[i]->free_workspace(workspace);
+ atomic_dec(&comp_alloc_workspace[i]);
+ }
+ }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated. There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read. It
+ * may be smaller then len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out)
+{
+ struct list_head *workspace;
+ int ret;
+
+ workspace = find_workspace(type);
+ if (IS_ERR(workspace))
+ return PTR_ERR(workspace);
+
+ ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+ start, len, pages,
+ nr_dest_pages, out_pages,
+ total_in, total_out,
+ max_out);
+ free_workspace(type, workspace);
+ return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous. They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+ u64 disk_start, struct bio_vec *bvec,
+ int vcnt, size_t srclen)
+{
+ struct list_head *workspace;
+ int ret;
+
+ workspace = find_workspace(type);
+ if (IS_ERR(workspace))
+ return PTR_ERR(workspace);
+
+ ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+ disk_start,
+ bvec, vcnt, srclen);
+ free_workspace(type, workspace);
+ return ret;
+}
+
+/*
+ * a less complex decompression routine. Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen)
+{
+ struct list_head *workspace;
+ int ret;
+
+ workspace = find_workspace(type);
+ if (IS_ERR(workspace))
+ return PTR_ERR(workspace);
+
+ ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+ dest_page, start_byte,
+ srclen, destlen);
+
+ free_workspace(type, workspace);
+ return ret;
+}
+
+void btrfs_exit_compress(void)
+{
+ free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset we're of the start of our workspace buffer.
+ *
+ * total_out is the last byte of the buffer
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+ unsigned long total_out, u64 disk_start,
+ struct bio_vec *bvec, int vcnt,
+ unsigned long *pg_index,
+ unsigned long *pg_offset)
+{
+ unsigned long buf_offset;
+ unsigned long current_buf_start;
+ unsigned long start_byte;
+ unsigned long working_bytes = total_out - buf_start;
+ unsigned long bytes;
+ char *kaddr;
+ struct page *page_out = bvec[*pg_index].bv_page;
+
+ /*
+ * start byte is the first byte of the page we're currently
+ * copying into relative to the start of the compressed data.
+ */
+ start_byte = page_offset(page_out) - disk_start;
+
+ /* we haven't yet hit data corresponding to this page */
+ if (total_out <= start_byte)
+ return 1;
+
+ /*
+ * the start of the data we care about is offset into
+ * the middle of our working buffer
+ */
+ if (total_out > start_byte && buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes -= buf_offset;
+ } else {
+ buf_offset = 0;
+ }
+ current_buf_start = buf_start;
+
+ /* copy bytes from the working buffer into the pages */
+ while (working_bytes > 0) {
+ bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+ PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(bytes, working_bytes);
+ kaddr = kmap_atomic(page_out);
+ memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(page_out);
+
+ *pg_offset += bytes;
+ buf_offset += bytes;
+ working_bytes -= bytes;
+ current_buf_start += bytes;
+
+ /* check if we need to pick another page */
+ if (*pg_offset == PAGE_CACHE_SIZE) {
+ (*pg_index)++;
+ if (*pg_index >= vcnt)
+ return 0;
+
+ page_out = bvec[*pg_index].bv_page;
+ *pg_offset = 0;
+ start_byte = page_offset(page_out) - disk_start;
+
+ /*
+ * make sure our new page is covered by this
+ * working buffer
+ */
+ if (total_out <= start_byte)
+ return 1;
+
+ /*
+ * the next page in the biovec might not be adjacent
+ * to the last page, but it might still be found
+ * inside this working buffer. bump our offset pointer
+ */
+ if (total_out > start_byte &&
+ current_buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes = total_out - start_byte;
+ current_buf_start = buf_start + buf_offset;
+ }
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * When uncompressing data, we need to make sure and zero any parts of
+ * the biovec that were not filled in by the decompression code. pg_index
+ * and pg_offset indicate the last page and the last offset of that page
+ * that have been filled in. This will zero everything remaining in the
+ * biovec.
+ */
+void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
+ unsigned long pg_index,
+ unsigned long pg_offset)
+{
+ while (pg_index < vcnt) {
+ struct page *page = bvec[pg_index].bv_page;
+ unsigned long off = bvec[pg_index].bv_offset;
+ unsigned long len = bvec[pg_index].bv_len;
+
+ if (pg_offset < off)
+ pg_offset = off;
+ if (pg_offset < off + len) {
+ unsigned long bytes = off + len - pg_offset;
+ char *kaddr;
+
+ kaddr = kmap_atomic(page);
+ memset(kaddr + pg_offset, 0, bytes);
+ kunmap_atomic(kaddr);
+ }
+ pg_index++;
+ pg_offset = 0;
+ }
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000..13a4dc043
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+void btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+ unsigned long total_out, u64 disk_start,
+ struct bio_vec *bvec, int vcnt,
+ unsigned long *pg_index,
+ unsigned long *pg_offset);
+
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
+ unsigned long pg_index,
+ unsigned long pg_offset);
+struct btrfs_compress_op {
+ struct list_head *(*alloc_workspace)(void);
+
+ void (*free_workspace)(struct list_head *workspace);
+
+ int (*compress_pages)(struct list_head *workspace,
+ struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out);
+
+ int (*decompress_biovec)(struct list_head *workspace,
+ struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen);
+
+ int (*decompress)(struct list_head *workspace,
+ unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen);
+};
+
+extern const struct btrfs_compress_op btrfs_zlib_compress;
+extern const struct btrfs_compress_op btrfs_lzo_compress;
+
+#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000..0f11ebc92
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,5910 @@
+/*
+ * Copyright (C) 2007,2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "locking.h"
+
+static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int level);
+static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *ins_key,
+ struct btrfs_path *path, int data_size, int extend);
+static int push_node_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *dst,
+ struct extent_buffer *src, int empty);
+static int balance_node_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *dst_buf,
+ struct extent_buffer *src_buf);
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+ int level, int slot);
+static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb);
+
+struct btrfs_path *btrfs_alloc_path(void)
+{
+ struct btrfs_path *path;
+ path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+ return path;
+}
+
+/*
+ * set all locked nodes in the path to blocking locks. This should
+ * be done before scheduling
+ */
+noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+ int i;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (!p->nodes[i] || !p->locks[i])
+ continue;
+ btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
+ if (p->locks[i] == BTRFS_READ_LOCK)
+ p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+ else if (p->locks[i] == BTRFS_WRITE_LOCK)
+ p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+ }
+}
+
+/*
+ * reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy, when lockdep is enabled
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path. You can safely use NULL
+ * for held
+ */
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+ struct extent_buffer *held, int held_rw)
+{
+ int i;
+
+ if (held) {
+ btrfs_set_lock_blocking_rw(held, held_rw);
+ if (held_rw == BTRFS_WRITE_LOCK)
+ held_rw = BTRFS_WRITE_LOCK_BLOCKING;
+ else if (held_rw == BTRFS_READ_LOCK)
+ held_rw = BTRFS_READ_LOCK_BLOCKING;
+ }
+ btrfs_set_path_blocking(p);
+
+ for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
+ if (p->nodes[i] && p->locks[i]) {
+ btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
+ if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
+ p->locks[i] = BTRFS_WRITE_LOCK;
+ else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
+ p->locks[i] = BTRFS_READ_LOCK;
+ }
+ }
+
+ if (held)
+ btrfs_clear_lock_blocking_rw(held, held_rw);
+}
+
+/* this also releases the path */
+void btrfs_free_path(struct btrfs_path *p)
+{
+ if (!p)
+ return;
+ btrfs_release_path(p);
+ kmem_cache_free(btrfs_path_cachep, p);
+}
+
+/*
+ * path release drops references on the extent buffers in the path
+ * and it drops any locks held by this path
+ *
+ * It is safe to call this on paths that no locks or extent buffers held.
+ */
+noinline void btrfs_release_path(struct btrfs_path *p)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ p->slots[i] = 0;
+ if (!p->nodes[i])
+ continue;
+ if (p->locks[i]) {
+ btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
+ p->locks[i] = 0;
+ }
+ free_extent_buffer(p->nodes[i]);
+ p->nodes[i] = NULL;
+ }
+}
+
+/*
+ * safely gets a reference on the root node of a tree. A lock
+ * is not taken, so a concurrent writer may put a different node
+ * at the root of the tree. See btrfs_lock_root_node for the
+ * looping required.
+ *
+ * The extent buffer returned by this has a reference taken, so
+ * it won't disappear. It may stop being the root of the tree
+ * at any time because there are no locks held.
+ */
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ rcu_read_lock();
+ eb = rcu_dereference(root->node);
+
+ /*
+ * RCU really hurts here, we could free up the root node because
+ * it was cow'ed but we may not get the new root node yet so do
+ * the inc_not_zero dance and if it doesn't work then
+ * synchronize_rcu and try again.
+ */
+ if (atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ synchronize_rcu();
+ }
+ return eb;
+}
+
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root. A locked buffer
+ * is returned, with a reference held.
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ btrfs_tree_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root. A locked buffer
+ * is returned, with a reference held.
+ */
+static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ btrfs_tree_read_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/* cowonly root (everything not a reference counted cow subvolume), just get
+ * put onto a simple dirty list. transaction.c walks this to make sure they
+ * get properly updated on disk.
+ */
+static void add_root_to_dirty_list(struct btrfs_root *root)
+{
+ if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
+ !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
+ return;
+
+ spin_lock(&root->fs_info->trans_lock);
+ if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
+ /* Want the extent tree to be the last on the list */
+ if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ list_move_tail(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
+ else
+ list_move(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
+ }
+ spin_unlock(&root->fs_info->trans_lock);
+}
+
+/*
+ * used by snapshot creation to make a copy of a root for a tree with
+ * a given objectid. The buffer with the new root node is returned in
+ * cow_ret, and this func returns zero on success or a negative error code.
+ */
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer **cow_ret, u64 new_root_objectid)
+{
+ struct extent_buffer *cow;
+ int ret = 0;
+ int level;
+ struct btrfs_disk_key disk_key;
+
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
+
+ level = btrfs_header_level(buf);
+ if (level == 0)
+ btrfs_item_key(buf, &disk_key, 0);
+ else
+ btrfs_node_key(buf, &disk_key, 0);
+
+ cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
+ &disk_key, level, buf->start, 0);
+ if (IS_ERR(cow))
+ return PTR_ERR(cow);
+
+ copy_extent_buffer(cow, buf, 0, 0, cow->len);
+ btrfs_set_header_bytenr(cow, cow->start);
+ btrfs_set_header_generation(cow, trans->transid);
+ btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+ btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+ BTRFS_HEADER_FLAG_RELOC);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+ else
+ btrfs_set_header_owner(cow, new_root_objectid);
+
+ write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
+ BTRFS_FSID_SIZE);
+
+ WARN_ON(btrfs_header_generation(buf) > trans->transid);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ else
+ ret = btrfs_inc_ref(trans, root, cow, 0);
+
+ if (ret)
+ return ret;
+
+ btrfs_mark_buffer_dirty(cow);
+ *cow_ret = cow;
+ return 0;
+}
+
+enum mod_log_op {
+ MOD_LOG_KEY_REPLACE,
+ MOD_LOG_KEY_ADD,
+ MOD_LOG_KEY_REMOVE,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+ MOD_LOG_MOVE_KEYS,
+ MOD_LOG_ROOT_REPLACE,
+};
+
+struct tree_mod_move {
+ int dst_slot;
+ int nr_items;
+};
+
+struct tree_mod_root {
+ u64 logical;
+ u8 level;
+};
+
+struct tree_mod_elem {
+ struct rb_node node;
+ u64 index; /* shifted logical */
+ u64 seq;
+ enum mod_log_op op;
+
+ /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+ int slot;
+
+ /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+ u64 generation;
+
+ /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+ struct btrfs_disk_key key;
+ u64 blockptr;
+
+ /* this is used for op == MOD_LOG_MOVE_KEYS */
+ struct tree_mod_move move;
+
+ /* this is used for op == MOD_LOG_ROOT_REPLACE */
+ struct tree_mod_root old_root;
+};
+
+static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
+{
+ read_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
+{
+ read_unlock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
+{
+ write_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
+{
+ write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * Pull a new tree mod seq number for our operation.
+ */
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+ return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ tree_mod_log_write_lock(fs_info);
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!elem->seq) {
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ tree_mod_log_write_unlock(fs_info);
+
+ return elem->seq;
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct rb_node *next;
+ struct seq_list *cur_elem;
+ struct tree_mod_elem *tm;
+ u64 min_seq = (u64)-1;
+ u64 seq_putting = elem->seq;
+
+ if (!seq_putting)
+ return;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ list_del(&elem->list);
+ elem->seq = 0;
+
+ list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+ if (cur_elem->seq < min_seq) {
+ if (seq_putting > cur_elem->seq) {
+ /*
+ * blocker with lower sequence number exists, we
+ * cannot remove anything from the log
+ */
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return;
+ }
+ min_seq = cur_elem->seq;
+ }
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ /*
+ * anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+ tree_mod_log_write_lock(fs_info);
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+ tm = container_of(node, struct tree_mod_elem, node);
+ if (tm->seq > min_seq)
+ continue;
+ rb_erase(node, tm_root);
+ kfree(tm);
+ }
+ tree_mod_log_write_unlock(fs_info);
+}
+
+/*
+ * key order of the log:
+ * index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ *
+ * Note: must be called with write lock (tree_mod_log_write_lock).
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+ struct rb_root *tm_root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct tree_mod_elem *cur;
+
+ BUG_ON(!tm);
+
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+
+ tm_root = &fs_info->tree_mod_log;
+ new = &tm_root->rb_node;
+ while (*new) {
+ cur = container_of(*new, struct tree_mod_elem, node);
+ parent = *new;
+ if (cur->index < tm->index)
+ new = &((*new)->rb_left);
+ else if (cur->index > tm->index)
+ new = &((*new)->rb_right);
+ else if (cur->seq < tm->seq)
+ new = &((*new)->rb_left);
+ else if (cur->seq > tm->seq)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&tm->node, parent, new);
+ rb_insert_color(&tm->node, tm_root);
+ return 0;
+}
+
+/*
+ * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
+ * returns zero with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * call tree_mod_log_write_unlock() to release.
+ */
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb) {
+ smp_mb();
+ if (list_empty(&(fs_info)->tree_mod_seq_list))
+ return 1;
+ if (eb && btrfs_header_level(eb) == 0)
+ return 1;
+
+ tree_mod_log_write_lock(fs_info);
+ if (list_empty(&(fs_info)->tree_mod_seq_list)) {
+ tree_mod_log_write_unlock(fs_info);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
+static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ smp_mb();
+ if (list_empty(&(fs_info)->tree_mod_seq_list))
+ return 0;
+ if (eb && btrfs_header_level(eb) == 0)
+ return 0;
+
+ return 1;
+}
+
+static struct tree_mod_elem *
+alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm)
+ return NULL;
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ if (op != MOD_LOG_KEY_ADD) {
+ btrfs_node_key(eb, &tm->key, slot);
+ tm->blockptr = btrfs_node_blockptr(eb, slot);
+ }
+ tm->op = op;
+ tm->slot = slot;
+ tm->generation = btrfs_node_ptr_generation(eb, slot);
+ RB_CLEAR_NODE(&tm->node);
+
+ return tm;
+}
+
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ if (!tree_mod_need_log(fs_info, eb))
+ return 0;
+
+ tm = alloc_tree_mod_elem(eb, slot, op, flags);
+ if (!tm)
+ return -ENOMEM;
+
+ if (tree_mod_dont_log(fs_info, eb)) {
+ kfree(tm);
+ return 0;
+ }
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ tree_mod_log_write_unlock(fs_info);
+ if (ret)
+ kfree(tm);
+
+ return ret;
+}
+
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int dst_slot, int src_slot,
+ int nr_items, gfp_t flags)
+{
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int ret = 0;
+ int i;
+ int locked = 0;
+
+ if (!tree_mod_need_log(fs_info, eb))
+ return 0;
+
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = MOD_LOG_MOVE_KEYS;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, eb))
+ goto free_tms;
+ locked = 1;
+
+ /*
+ * When we override something during the move, we log these removals.
+ * This can only happen when we move towards the beginning of the
+ * buffer, i.e. dst_slot < src_slot.
+ */
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ if (ret)
+ goto free_tms;
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+
+ return 0;
+free_tms:
+ for (i = 0; i < nr_items; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+ kfree(tm);
+
+ return ret;
+}
+
+static inline int
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
+{
+ int i, j;
+ int ret;
+
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret) {
+ for (j = nritems - 1; j > i; j--)
+ rb_erase(&tm_list[j]->node,
+ &fs_info->tree_mod_log);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *old_root,
+ struct extent_buffer *new_root, gfp_t flags,
+ int log_removal)
+{
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int ret = 0;
+ int i;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (log_removal && btrfs_header_level(old_root) > 0) {
+ nritems = btrfs_header_nritems(old_root);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
+ flags);
+ if (!tm_list) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(old_root, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+ }
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+ tm->old_root.logical = old_root->start;
+ tm->old_root.level = btrfs_header_level(old_root);
+ tm->generation = btrfs_header_generation(old_root);
+ tm->op = MOD_LOG_ROOT_REPLACE;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+
+ if (tm_list)
+ ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ if (!ret)
+ ret = __tree_mod_log_insert(fs_info, tm);
+
+ tree_mod_log_write_unlock(fs_info);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return ret;
+
+free_tms:
+ if (tm_list) {
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+ }
+ kfree(tm);
+
+ return ret;
+}
+
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+ int smallest)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct tree_mod_elem *cur = NULL;
+ struct tree_mod_elem *found = NULL;
+ u64 index = start >> PAGE_CACHE_SHIFT;
+
+ tree_mod_log_read_lock(fs_info);
+ tm_root = &fs_info->tree_mod_log;
+ node = tm_root->rb_node;
+ while (node) {
+ cur = container_of(node, struct tree_mod_elem, node);
+ if (cur->index < index) {
+ node = node->rb_left;
+ } else if (cur->index > index) {
+ node = node->rb_right;
+ } else if (cur->seq < min_seq) {
+ node = node->rb_left;
+ } else if (!smallest) {
+ /* we want the node with the highest seq */
+ if (found)
+ BUG_ON(found->seq > cur->seq);
+ found = cur;
+ node = node->rb_left;
+ } else if (cur->seq > min_seq) {
+ /* we want the node with the smallest seq */
+ if (found)
+ BUG_ON(found->seq < cur->seq);
+ found = cur;
+ node = node->rb_right;
+ } else {
+ found = cur;
+ break;
+ }
+ }
+ tree_mod_log_read_unlock(fs_info);
+
+ return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+ u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+static noinline int
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ struct extent_buffer *src, unsigned long dst_offset,
+ unsigned long src_offset, int nr_items)
+{
+ int ret = 0;
+ struct tree_mod_elem **tm_list = NULL;
+ struct tree_mod_elem **tm_list_add, **tm_list_rem;
+ int i;
+ int locked = 0;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ return 0;
+
+ tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm_list_add = tm_list;
+ tm_list_rem = tm_list + nr_items;
+ for (i = 0; i < nr_items; i++) {
+ tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
+ MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ if (!tm_list_rem[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
+ MOD_LOG_KEY_ADD, GFP_NOFS);
+ if (!tm_list_add[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+ locked = 1;
+
+ for (i = 0; i < nr_items; i++) {
+ ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
+ if (ret)
+ goto free_tms;
+ ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items * 2; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+
+ return ret;
+}
+
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ int dst_offset, int src_offset, int nr_items)
+{
+ int ret;
+ ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
+ nr_items, GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static noinline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot, int atomic)
+{
+ int ret;
+
+ ret = tree_mod_log_insert_key(fs_info, eb, slot,
+ MOD_LOG_KEY_REPLACE,
+ atomic ? GFP_ATOMIC : GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static noinline int
+tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int i;
+ int ret = 0;
+
+ if (btrfs_header_level(eb) == 0)
+ return 0;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ nritems = btrfs_header_nritems(eb);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, eb))
+ goto free_tms;
+
+ ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ tree_mod_log_write_unlock(fs_info);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+
+ return ret;
+}
+
+static noinline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+ struct extent_buffer *new_root_node,
+ int log_removal)
+{
+ int ret;
+ ret = tree_mod_log_insert_root(root->fs_info, root->node,
+ new_root_node, GFP_NOFS, log_removal);
+ BUG_ON(ret < 0);
+}
+
+/*
+ * check if the tree block can be shared by multiple trees
+ */
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+ struct extent_buffer *buf)
+{
+ /*
+ * Tree blocks not in refernece counted trees and tree roots
+ * are never shared. If a block was allocated after the last
+ * snapshot and the block was not allocated by tree relocation,
+ * we know the block is not shared.
+ */
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ buf != root->node && buf != root->commit_root &&
+ (btrfs_header_generation(buf) <=
+ btrfs_root_last_snapshot(&root->root_item) ||
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+ return 1;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+ return 1;
+#endif
+ return 0;
+}
+
+static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *cow,
+ int *last_ref)
+{
+ u64 refs;
+ u64 owner;
+ u64 flags;
+ u64 new_flags = 0;
+ int ret;
+
+ /*
+ * Backrefs update rules:
+ *
+ * Always use full backrefs for extent pointers in tree block
+ * allocated by tree relocation.
+ *
+ * If a shared tree block is no longer referenced by its owner
+ * tree (btrfs_header_owner(buf) == root->root_key.objectid),
+ * use full backrefs for extent pointers in tree block.
+ *
+ * If a tree block is been relocating
+ * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
+ * use full backrefs for extent pointers in tree block.
+ * The reason for this is some operations (such as drop tree)
+ * are only allowed for blocks use full backrefs.
+ */
+
+ if (btrfs_block_can_be_shared(root, buf)) {
+ ret = btrfs_lookup_extent_info(trans, root, buf->start,
+ btrfs_header_level(buf), 1,
+ &refs, &flags);
+ if (ret)
+ return ret;
+ if (refs == 0) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
+ } else {
+ refs = 1;
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+ flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ else
+ flags = 0;
+ }
+
+ owner = btrfs_header_owner(buf);
+ BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
+ if (refs > 1) {
+ if ((owner == root->root_key.objectid ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
+ !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+ ret = btrfs_inc_ref(trans, root, buf, 1);
+ BUG_ON(ret); /* -ENOMEM */
+
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_dec_ref(trans, root, buf, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+ new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ } else {
+
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ else
+ ret = btrfs_inc_ref(trans, root, cow, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+ if (new_flags != 0) {
+ int level = btrfs_header_level(buf);
+
+ ret = btrfs_set_disk_extent_flags(trans, root,
+ buf->start,
+ buf->len,
+ new_flags, level, 0);
+ if (ret)
+ return ret;
+ }
+ } else {
+ if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_inc_ref(trans, root, cow, 1);
+ else
+ ret = btrfs_inc_ref(trans, root, cow, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_dec_ref(trans, root, buf, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+ clean_tree_block(trans, root->fs_info, buf);
+ *last_ref = 1;
+ }
+ return 0;
+}
+
+/*
+ * does the dirty work in cow of a single block. The parent block (if
+ * supplied) is updated to point to the new cow copy. The new buffer is marked
+ * dirty and returned locked. If you modify the block it needs to be marked
+ * dirty again.
+ *
+ * search_start -- an allocation hint for the new block
+ *
+ * empty_size -- a hint that you plan on doing more cow. This is the size in
+ * bytes the allocator should try to find free next to the block it returns.
+ * This is just a hint and may be ignored by the allocator.
+ */
+static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *cow;
+ int level, ret;
+ int last_ref = 0;
+ int unlock_orig = 0;
+ u64 parent_start;
+
+ if (*cow_ret == buf)
+ unlock_orig = 1;
+
+ btrfs_assert_tree_locked(buf);
+
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
+
+ level = btrfs_header_level(buf);
+
+ if (level == 0)
+ btrfs_item_key(buf, &disk_key, 0);
+ else
+ btrfs_node_key(buf, &disk_key, 0);
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent)
+ parent_start = parent->start;
+ else
+ parent_start = 0;
+ } else
+ parent_start = 0;
+
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size);
+ if (IS_ERR(cow))
+ return PTR_ERR(cow);
+
+ /* cow is set to blocking by btrfs_init_new_buffer */
+
+ copy_extent_buffer(cow, buf, 0, 0, cow->len);
+ btrfs_set_header_bytenr(cow, cow->start);
+ btrfs_set_header_generation(cow, trans->transid);
+ btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+ btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+ BTRFS_HEADER_FLAG_RELOC);
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+ else
+ btrfs_set_header_owner(cow, root->root_key.objectid);
+
+ write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
+ BTRFS_FSID_SIZE);
+
+ ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+ ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+ if (ret)
+ return ret;
+ }
+
+ if (buf == root->node) {
+ WARN_ON(parent && parent != buf);
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+ parent_start = buf->start;
+ else
+ parent_start = 0;
+
+ extent_buffer_get(cow);
+ tree_mod_log_set_root_pointer(root, cow, 1);
+ rcu_assign_pointer(root->node, cow);
+
+ btrfs_free_tree_block(trans, root, buf, parent_start,
+ last_ref);
+ free_extent_buffer(buf);
+ add_root_to_dirty_list(root);
+ } else {
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ parent_start = parent->start;
+ else
+ parent_start = 0;
+
+ WARN_ON(trans->transid != btrfs_header_generation(parent));
+ tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ btrfs_set_node_blockptr(parent, parent_slot,
+ cow->start);
+ btrfs_set_node_ptr_generation(parent, parent_slot,
+ trans->transid);
+ btrfs_mark_buffer_dirty(parent);
+ if (last_ref) {
+ ret = tree_mod_log_free_eb(root->fs_info, buf);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ }
+ btrfs_free_tree_block(trans, root, buf, parent_start,
+ last_ref);
+ }
+ if (unlock_orig)
+ btrfs_tree_unlock(buf);
+ free_extent_buffer_stale(buf);
+ btrfs_mark_buffer_dirty(cow);
+ *cow_ret = cow;
+ return 0;
+}
+
+/*
+ * returns the logical address of the oldest predecessor of the given root.
+ * entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb_root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct tree_mod_elem *found = NULL;
+ u64 root_logical = eb_root->start;
+ int looped = 0;
+
+ if (!time_seq)
+ return NULL;
+
+ /*
+ * the very last operation that's logged for a root is the replacement
+ * operation (if it is replaced at all). this has the index of the *new*
+ * root, making it the very first operation that's logged for this root.
+ */
+ while (1) {
+ tm = tree_mod_log_search_oldest(fs_info, root_logical,
+ time_seq);
+ if (!looped && !tm)
+ return NULL;
+ /*
+ * if there are no tree operation for the oldest root, we simply
+ * return it. this should only happen if that (old) root is at
+ * level 0.
+ */
+ if (!tm)
+ break;
+
+ /*
+ * if there's an operation that's not a root replacement, we
+ * found the oldest version of our root. normally, we'll find a
+ * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+ */
+ if (tm->op != MOD_LOG_ROOT_REPLACE)
+ break;
+
+ found = tm;
+ root_logical = tm->old_root.logical;
+ looped = 1;
+ }
+
+ /* if there's no old root to return, return what we found instead */
+ if (!found)
+ found = tm;
+
+ return found;
+}
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewinded (until we reach something older than
+ * time_seq).
+ */
+static void
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ u64 time_seq, struct tree_mod_elem *first_tm)
+{
+ u32 n;
+ struct rb_node *next;
+ struct tree_mod_elem *tm = first_tm;
+ unsigned long o_dst;
+ unsigned long o_src;
+ unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+ n = btrfs_header_nritems(eb);
+ tree_mod_log_read_lock(fs_info);
+ while (tm && tm->seq >= time_seq) {
+ /*
+ * all the operations are recorded with the operator used for
+ * the modification. as we're going backwards, we do the
+ * opposite of each operation here.
+ */
+ switch (tm->op) {
+ case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+ BUG_ON(tm->slot < n);
+ /* Fallthrough */
+ case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case MOD_LOG_KEY_REMOVE:
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ n++;
+ break;
+ case MOD_LOG_KEY_REPLACE:
+ BUG_ON(tm->slot >= n);
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ break;
+ case MOD_LOG_KEY_ADD:
+ /* if a move operation is needed it's in the log */
+ n--;
+ break;
+ case MOD_LOG_MOVE_KEYS:
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ memmove_extent_buffer(eb, o_dst, o_src,
+ tm->move.nr_items * p_size);
+ break;
+ case MOD_LOG_ROOT_REPLACE:
+ /*
+ * this operation is special. for roots, this must be
+ * handled explicitly before rewinding.
+ * for non-roots, this operation may exist if the node
+ * was a root: root A -> child B; then A gets empty and
+ * B is promoted to the new root. in the mod log, we'll
+ * have a root-replace operation for B, a tree block
+ * that is no root. we simply ignore that operation.
+ */
+ break;
+ }
+ next = rb_next(&tm->node);
+ if (!next)
+ break;
+ tm = container_of(next, struct tree_mod_elem, node);
+ if (tm->index != first_tm->index)
+ break;
+ }
+ tree_mod_log_read_unlock(fs_info);
+ btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+ struct extent_buffer *eb, u64 time_seq)
+{
+ struct extent_buffer *eb_rewin;
+ struct tree_mod_elem *tm;
+
+ if (!time_seq)
+ return eb;
+
+ if (btrfs_header_level(eb) == 0)
+ return eb;
+
+ tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+ if (!tm)
+ return eb;
+
+ btrfs_set_path_blocking(path);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+ if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ BUG_ON(tm->slot != 0);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ btrfs_set_header_bytenr(eb_rewin, eb->start);
+ btrfs_set_header_backref_rev(eb_rewin,
+ btrfs_header_backref_rev(eb));
+ btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+ } else {
+ eb_rewin = btrfs_clone_extent_buffer(eb);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ }
+
+ btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
+ btrfs_tree_read_unlock_blocking(eb);
+ free_extent_buffer(eb);
+
+ extent_buffer_get(eb_rewin);
+ btrfs_tree_read_lock(eb_rewin);
+ __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
+ WARN_ON(btrfs_header_nritems(eb_rewin) >
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
+
+ return eb_rewin;
+}
+
+/*
+ * get_old_root() rewinds the state of @root's root node to the given @time_seq
+ * value. If there are no changes, the current root->root_node is returned. If
+ * anything changed in between, there's a fresh buffer allocated on which the
+ * rewind operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct extent_buffer *eb = NULL;
+ struct extent_buffer *eb_root;
+ struct extent_buffer *old;
+ struct tree_mod_root *old_root = NULL;
+ u64 old_generation = 0;
+ u64 logical;
+
+ eb_root = btrfs_read_lock_root_node(root);
+ tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+ if (!tm)
+ return eb_root;
+
+ if (tm->op == MOD_LOG_ROOT_REPLACE) {
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+ logical = old_root->logical;
+ } else {
+ logical = eb_root->start;
+ }
+
+ tm = tree_mod_log_search(root->fs_info, logical, time_seq);
+ if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ old = read_tree_block(root, logical, 0);
+ if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
+ free_extent_buffer(old);
+ btrfs_warn(root->fs_info,
+ "failed to read tree block %llu from get_old_root", logical);
+ } else {
+ eb = btrfs_clone_extent_buffer(old);
+ free_extent_buffer(old);
+ }
+ } else if (old_root) {
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ eb = alloc_dummy_extent_buffer(root->fs_info, logical);
+ } else {
+ btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+ eb = btrfs_clone_extent_buffer(eb_root);
+ btrfs_tree_read_unlock_blocking(eb_root);
+ free_extent_buffer(eb_root);
+ }
+
+ if (!eb)
+ return NULL;
+ extent_buffer_get(eb);
+ btrfs_tree_read_lock(eb);
+ if (old_root) {
+ btrfs_set_header_bytenr(eb, eb->start);
+ btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
+ }
+ if (tm)
+ __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
+ else
+ WARN_ON(btrfs_header_level(eb) != 0);
+ WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
+
+ return eb;
+}
+
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ int level;
+ struct extent_buffer *eb_root = btrfs_root_node(root);
+
+ tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+ if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
+ level = tm->old_root.level;
+ } else {
+ level = btrfs_header_level(eb_root);
+ }
+ free_extent_buffer(eb_root);
+
+ return level;
+}
+
+static inline int should_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf)
+{
+ if (btrfs_test_is_dummy_root(root))
+ return 0;
+
+ /* ensure we can see the force_cow */
+ smp_rmb();
+
+ /*
+ * We do not need to cow a block if
+ * 1) this block is not created or changed in this transaction;
+ * 2) this block does not belong to TREE_RELOC tree;
+ * 3) the root is not forced COW.
+ *
+ * What is forced COW:
+ * when we create snapshot during commiting the transaction,
+ * after we've finished coping src root, we must COW the shared
+ * block to ensure the metadata consistency.
+ */
+ if (btrfs_header_generation(buf) == trans->transid &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
+ !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+ !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
+ return 0;
+ return 1;
+}
+
+/*
+ * cows a single block, see __btrfs_cow_block for the real work.
+ * This version of it has extra checks so that a block isn't cow'd more than
+ * once per transaction, as long as it hasn't been written yet
+ */
+noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret)
+{
+ u64 search_start;
+ int ret;
+
+ if (trans->transaction != root->fs_info->running_transaction)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
+ trans->transid,
+ root->fs_info->running_transaction->transid);
+
+ if (trans->transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "trans %llu running %llu\n",
+ trans->transid, root->fs_info->generation);
+
+ if (!should_cow_block(trans, root, buf)) {
+ *cow_ret = buf;
+ return 0;
+ }
+
+ search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+
+ if (parent)
+ btrfs_set_lock_blocking(parent);
+ btrfs_set_lock_blocking(buf);
+
+ ret = __btrfs_cow_block(trans, root, buf, parent,
+ parent_slot, cow_ret, search_start, 0);
+
+ trace_btrfs_cow_block(root, buf, *cow_ret);
+
+ return ret;
+}
+
+/*
+ * helper function for defrag to decide if two blocks pointed to by a
+ * node are actually close by
+ */
+static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+ if (blocknr < other && other - (blocknr + blocksize) < 32768)
+ return 1;
+ if (blocknr > other && blocknr - (other + blocksize) < 32768)
+ return 1;
+ return 0;
+}
+
+/*
+ * compare two keys in a memcmp fashion
+ */
+static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
+{
+ struct btrfs_key k1;
+
+ btrfs_disk_key_to_cpu(&k1, disk);
+
+ return btrfs_comp_cpu_keys(&k1, k2);
+}
+
+/*
+ * same as comp_keys only with two btrfs_key's
+ */
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+{
+ if (k1->objectid > k2->objectid)
+ return 1;
+ if (k1->objectid < k2->objectid)
+ return -1;
+ if (k1->type > k2->type)
+ return 1;
+ if (k1->type < k2->type)
+ return -1;
+ if (k1->offset > k2->offset)
+ return 1;
+ if (k1->offset < k2->offset)
+ return -1;
+ return 0;
+}
+
+/*
+ * this is used by the defrag code to go through all the
+ * leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order
+ */
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *parent,
+ int start_slot, u64 *last_ret,
+ struct btrfs_key *progress)
+{
+ struct extent_buffer *cur;
+ u64 blocknr;
+ u64 gen;
+ u64 search_start = *last_ret;
+ u64 last_block = 0;
+ u64 other;
+ u32 parent_nritems;
+ int end_slot;
+ int i;
+ int err = 0;
+ int parent_level;
+ int uptodate;
+ u32 blocksize;
+ int progress_passed = 0;
+ struct btrfs_disk_key disk_key;
+
+ parent_level = btrfs_header_level(parent);
+
+ WARN_ON(trans->transaction != root->fs_info->running_transaction);
+ WARN_ON(trans->transid != root->fs_info->generation);
+
+ parent_nritems = btrfs_header_nritems(parent);
+ blocksize = root->nodesize;
+ end_slot = parent_nritems - 1;
+
+ if (parent_nritems <= 1)
+ return 0;
+
+ btrfs_set_lock_blocking(parent);
+
+ for (i = start_slot; i <= end_slot; i++) {
+ int close = 1;
+
+ btrfs_node_key(parent, &disk_key, i);
+ if (!progress_passed && comp_keys(&disk_key, progress) < 0)
+ continue;
+
+ progress_passed = 1;
+ blocknr = btrfs_node_blockptr(parent, i);
+ gen = btrfs_node_ptr_generation(parent, i);
+ if (last_block == 0)
+ last_block = blocknr;
+
+ if (i > 0) {
+ other = btrfs_node_blockptr(parent, i - 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (!close && i < end_slot) {
+ other = btrfs_node_blockptr(parent, i + 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (close) {
+ last_block = blocknr;
+ continue;
+ }
+
+ cur = btrfs_find_tree_block(root->fs_info, blocknr);
+ if (cur)
+ uptodate = btrfs_buffer_uptodate(cur, gen, 0);
+ else
+ uptodate = 0;
+ if (!cur || !uptodate) {
+ if (!cur) {
+ cur = read_tree_block(root, blocknr, gen);
+ if (!cur || !extent_buffer_uptodate(cur)) {
+ free_extent_buffer(cur);
+ return -EIO;
+ }
+ } else if (!uptodate) {
+ err = btrfs_read_buffer(cur, gen);
+ if (err) {
+ free_extent_buffer(cur);
+ return err;
+ }
+ }
+ }
+ if (search_start == 0)
+ search_start = last_block;
+
+ btrfs_tree_lock(cur);
+ btrfs_set_lock_blocking(cur);
+ err = __btrfs_cow_block(trans, root, cur, parent, i,
+ &cur, search_start,
+ min(16 * blocksize,
+ (end_slot - i) * blocksize));
+ if (err) {
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ break;
+ }
+ search_start = cur->start;
+ last_block = cur->start;
+ *last_ret = search_start;
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ }
+ return err;
+}
+
+/*
+ * The leaf data grows from end-to-front in the node.
+ * this returns the address of the start of the last item,
+ * which is the stop of the leaf data stack
+ */
+static inline unsigned int leaf_data_end(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ u32 nr = btrfs_header_nritems(leaf);
+ if (nr == 0)
+ return BTRFS_LEAF_DATA_SIZE(root);
+ return btrfs_item_offset_nr(leaf, nr - 1);
+}
+
+
+/*
+ * search for key in the extent_buffer. The items start at offset p,
+ * and they are item_size apart. There are 'max' items in p.
+ *
+ * the slot in the array is returned via slot, and it points to
+ * the place where you would insert key if it is not found in
+ * the array.
+ *
+ * slot may point to max if the key is bigger than all of the keys
+ */
+static noinline int generic_bin_search(struct extent_buffer *eb,
+ unsigned long p,
+ int item_size, struct btrfs_key *key,
+ int max, int *slot)
+{
+ int low = 0;
+ int high = max;
+ int mid;
+ int ret;
+ struct btrfs_disk_key *tmp = NULL;
+ struct btrfs_disk_key unaligned;
+ unsigned long offset;
+ char *kaddr = NULL;
+ unsigned long map_start = 0;
+ unsigned long map_len = 0;
+ int err;
+
+ while (low < high) {
+ mid = (low + high) / 2;
+ offset = p + mid * item_size;
+
+ if (!kaddr || offset < map_start ||
+ (offset + sizeof(struct btrfs_disk_key)) >
+ map_start + map_len) {
+
+ err = map_private_extent_buffer(eb, offset,
+ sizeof(struct btrfs_disk_key),
+ &kaddr, &map_start, &map_len);
+
+ if (!err) {
+ tmp = (struct btrfs_disk_key *)(kaddr + offset -
+ map_start);
+ } else {
+ read_extent_buffer(eb, &unaligned,
+ offset, sizeof(unaligned));
+ tmp = &unaligned;
+ }
+
+ } else {
+ tmp = (struct btrfs_disk_key *)(kaddr + offset -
+ map_start);
+ }
+ ret = comp_keys(tmp, key);
+
+ if (ret < 0)
+ low = mid + 1;
+ else if (ret > 0)
+ high = mid;
+ else {
+ *slot = mid;
+ return 0;
+ }
+ }
+ *slot = low;
+ return 1;
+}
+
+/*
+ * simple bin_search frontend that does the right thing for
+ * leaves vs nodes
+ */
+static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+ int level, int *slot)
+{
+ if (level == 0)
+ return generic_bin_search(eb,
+ offsetof(struct btrfs_leaf, items),
+ sizeof(struct btrfs_item),
+ key, btrfs_header_nritems(eb),
+ slot);
+ else
+ return generic_bin_search(eb,
+ offsetof(struct btrfs_node, ptrs),
+ sizeof(struct btrfs_key_ptr),
+ key, btrfs_header_nritems(eb),
+ slot);
+}
+
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+ int level, int *slot)
+{
+ return bin_search(eb, key, level, slot);
+}
+
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) + size);
+ spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) - size);
+ spin_unlock(&root->accounting_lock);
+}
+
+/* given a node and slot number, this reads the blocks it points to. The
+ * extent buffer is returned with a reference taken (but unlocked).
+ * NULL is returned on error.
+ */
+static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
+ struct extent_buffer *parent, int slot)
+{
+ int level = btrfs_header_level(parent);
+ struct extent_buffer *eb;
+
+ if (slot < 0)
+ return NULL;
+ if (slot >= btrfs_header_nritems(parent))
+ return NULL;
+
+ BUG_ON(level == 0);
+
+ eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
+ btrfs_node_ptr_generation(parent, slot));
+ if (eb && !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ eb = NULL;
+ }
+
+ return eb;
+}
+
+/*
+ * node level balancing, used to make sure nodes are in proper order for
+ * item deletion. We balance from the top down, so we have to make sure
+ * that a deletion won't leave an node completely empty later on.
+ */
+static noinline int balance_level(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct extent_buffer *right = NULL;
+ struct extent_buffer *mid;
+ struct extent_buffer *left = NULL;
+ struct extent_buffer *parent = NULL;
+ int ret = 0;
+ int wret;
+ int pslot;
+ int orig_slot = path->slots[level];
+ u64 orig_ptr;
+
+ if (level == 0)
+ return 0;
+
+ mid = path->nodes[level];
+
+ WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
+ path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
+ WARN_ON(btrfs_header_generation(mid) != trans->transid);
+
+ orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+
+ if (level < BTRFS_MAX_LEVEL - 1) {
+ parent = path->nodes[level + 1];
+ pslot = path->slots[level + 1];
+ }
+
+ /*
+ * deal with the case where there is only one pointer in the root
+ * by promoting the node below to a root
+ */
+ if (!parent) {
+ struct extent_buffer *child;
+
+ if (btrfs_header_nritems(mid) != 1)
+ return 0;
+
+ /* promote the child to a root */
+ child = read_node_slot(root, mid, 0);
+ if (!child) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
+
+ btrfs_tree_lock(child);
+ btrfs_set_lock_blocking(child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ if (ret) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ goto enospc;
+ }
+
+ tree_mod_log_set_root_pointer(root, child, 1);
+ rcu_assign_pointer(root->node, child);
+
+ add_root_to_dirty_list(root);
+ btrfs_tree_unlock(child);
+
+ path->locks[level] = 0;
+ path->nodes[level] = NULL;
+ clean_tree_block(trans, root->fs_info, mid);
+ btrfs_tree_unlock(mid);
+ /* once for the path */
+ free_extent_buffer(mid);
+
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
+ /* once for the root ptr */
+ free_extent_buffer_stale(mid);
+ return 0;
+ }
+ if (btrfs_header_nritems(mid) >
+ BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
+ return 0;
+
+ left = read_node_slot(root, parent, pslot - 1);
+ if (left) {
+ btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+ wret = btrfs_cow_block(trans, root, left,
+ parent, pslot - 1, &left);
+ if (wret) {
+ ret = wret;
+ goto enospc;
+ }
+ }
+ right = read_node_slot(root, parent, pslot + 1);
+ if (right) {
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+ wret = btrfs_cow_block(trans, root, right,
+ parent, pslot + 1, &right);
+ if (wret) {
+ ret = wret;
+ goto enospc;
+ }
+ }
+
+ /* first, try to make some room in the middle buffer */
+ if (left) {
+ orig_slot += btrfs_header_nritems(left);
+ wret = push_node_left(trans, root, left, mid, 1);
+ if (wret < 0)
+ ret = wret;
+ }
+
+ /*
+ * then try to empty the right most buffer into the middle
+ */
+ if (right) {
+ wret = push_node_left(trans, root, mid, right, 1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+ if (btrfs_header_nritems(right) == 0) {
+ clean_tree_block(trans, root->fs_info, right);
+ btrfs_tree_unlock(right);
+ del_ptr(root, path, level + 1, pslot + 1);
+ root_sub_used(root, right->len);
+ btrfs_free_tree_block(trans, root, right, 0, 1);
+ free_extent_buffer_stale(right);
+ right = NULL;
+ } else {
+ struct btrfs_disk_key right_key;
+ btrfs_node_key(right, &right_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ pslot + 1, 0);
+ btrfs_set_node_key(parent, &right_key, pslot + 1);
+ btrfs_mark_buffer_dirty(parent);
+ }
+ }
+ if (btrfs_header_nritems(mid) == 1) {
+ /*
+ * we're not allowed to leave a node with one item in the
+ * tree during a delete. A deletion from lower in the tree
+ * could try to delete the only pointer in this node.
+ * So, pull some keys from the left.
+ * There has to be a left pointer at this point because
+ * otherwise we would have pulled some pointers from the
+ * right
+ */
+ if (!left) {
+ ret = -EROFS;
+ btrfs_std_error(root->fs_info, ret);
+ goto enospc;
+ }
+ wret = balance_node_right(trans, root, mid, left);
+ if (wret < 0) {
+ ret = wret;
+ goto enospc;
+ }
+ if (wret == 1) {
+ wret = push_node_left(trans, root, left, mid, 1);
+ if (wret < 0)
+ ret = wret;
+ }
+ BUG_ON(wret == 1);
+ }
+ if (btrfs_header_nritems(mid) == 0) {
+ clean_tree_block(trans, root->fs_info, mid);
+ btrfs_tree_unlock(mid);
+ del_ptr(root, path, level + 1, pslot);
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
+ free_extent_buffer_stale(mid);
+ mid = NULL;
+ } else {
+ /* update the parent key to reflect our changes */
+ struct btrfs_disk_key mid_key;
+ btrfs_node_key(mid, &mid_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ pslot, 0);
+ btrfs_set_node_key(parent, &mid_key, pslot);
+ btrfs_mark_buffer_dirty(parent);
+ }
+
+ /* update the path */
+ if (left) {
+ if (btrfs_header_nritems(left) > orig_slot) {
+ extent_buffer_get(left);
+ /* left was locked after cow */
+ path->nodes[level] = left;
+ path->slots[level + 1] -= 1;
+ path->slots[level] = orig_slot;
+ if (mid) {
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ }
+ } else {
+ orig_slot -= btrfs_header_nritems(left);
+ path->slots[level] = orig_slot;
+ }
+ }
+ /* double check we haven't messed things up */
+ if (orig_ptr !=
+ btrfs_node_blockptr(path->nodes[level], path->slots[level]))
+ BUG();
+enospc:
+ if (right) {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ if (left) {
+ if (path->nodes[level] != left)
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ return ret;
+}
+
+/* Node balancing for insertion. Here we only split or push nodes around
+ * when they are completely full. This is also done top down, so we
+ * have to be pessimistic.
+ */
+static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct extent_buffer *right = NULL;
+ struct extent_buffer *mid;
+ struct extent_buffer *left = NULL;
+ struct extent_buffer *parent = NULL;
+ int ret = 0;
+ int wret;
+ int pslot;
+ int orig_slot = path->slots[level];
+
+ if (level == 0)
+ return 1;
+
+ mid = path->nodes[level];
+ WARN_ON(btrfs_header_generation(mid) != trans->transid);
+
+ if (level < BTRFS_MAX_LEVEL - 1) {
+ parent = path->nodes[level + 1];
+ pslot = path->slots[level + 1];
+ }
+
+ if (!parent)
+ return 1;
+
+ left = read_node_slot(root, parent, pslot - 1);
+
+ /* first, try to make some room in the middle buffer */
+ if (left) {
+ u32 left_nr;
+
+ btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+
+ left_nr = btrfs_header_nritems(left);
+ if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+ wret = 1;
+ } else {
+ ret = btrfs_cow_block(trans, root, left, parent,
+ pslot - 1, &left);
+ if (ret)
+ wret = 1;
+ else {
+ wret = push_node_left(trans, root,
+ left, mid, 0);
+ }
+ }
+ if (wret < 0)
+ ret = wret;
+ if (wret == 0) {
+ struct btrfs_disk_key disk_key;
+ orig_slot += left_nr;
+ btrfs_node_key(mid, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ pslot, 0);
+ btrfs_set_node_key(parent, &disk_key, pslot);
+ btrfs_mark_buffer_dirty(parent);
+ if (btrfs_header_nritems(left) > orig_slot) {
+ path->nodes[level] = left;
+ path->slots[level + 1] -= 1;
+ path->slots[level] = orig_slot;
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ } else {
+ orig_slot -=
+ btrfs_header_nritems(left);
+ path->slots[level] = orig_slot;
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ return 0;
+ }
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ right = read_node_slot(root, parent, pslot + 1);
+
+ /*
+ * then try to empty the right most buffer into the middle
+ */
+ if (right) {
+ u32 right_nr;
+
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
+ right_nr = btrfs_header_nritems(right);
+ if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+ wret = 1;
+ } else {
+ ret = btrfs_cow_block(trans, root, right,
+ parent, pslot + 1,
+ &right);
+ if (ret)
+ wret = 1;
+ else {
+ wret = balance_node_right(trans, root,
+ right, mid);
+ }
+ }
+ if (wret < 0)
+ ret = wret;
+ if (wret == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(right, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ pslot + 1, 0);
+ btrfs_set_node_key(parent, &disk_key, pslot + 1);
+ btrfs_mark_buffer_dirty(parent);
+
+ if (btrfs_header_nritems(mid) <= orig_slot) {
+ path->nodes[level] = right;
+ path->slots[level + 1] += 1;
+ path->slots[level] = orig_slot -
+ btrfs_header_nritems(mid);
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 0;
+ }
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 1;
+}
+
+/*
+ * readahead one full node of leaves, finding things that are close
+ * to the block in 'slot', and triggering ra on them.
+ */
+static void reada_for_search(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int level, int slot, u64 objectid)
+{
+ struct extent_buffer *node;
+ struct btrfs_disk_key disk_key;
+ u32 nritems;
+ u64 search;
+ u64 target;
+ u64 nread = 0;
+ u64 gen;
+ int direction = path->reada;
+ struct extent_buffer *eb;
+ u32 nr;
+ u32 blocksize;
+ u32 nscan = 0;
+
+ if (level != 1)
+ return;
+
+ if (!path->nodes[level])
+ return;
+
+ node = path->nodes[level];
+
+ search = btrfs_node_blockptr(node, slot);
+ blocksize = root->nodesize;
+ eb = btrfs_find_tree_block(root->fs_info, search);
+ if (eb) {
+ free_extent_buffer(eb);
+ return;
+ }
+
+ target = search;
+
+ nritems = btrfs_header_nritems(node);
+ nr = slot;
+
+ while (1) {
+ if (direction < 0) {
+ if (nr == 0)
+ break;
+ nr--;
+ } else if (direction > 0) {
+ nr++;
+ if (nr >= nritems)
+ break;
+ }
+ if (path->reada < 0 && objectid) {
+ btrfs_node_key(node, &disk_key, nr);
+ if (btrfs_disk_key_objectid(&disk_key) != objectid)
+ break;
+ }
+ search = btrfs_node_blockptr(node, nr);
+ if ((search <= target && target - search <= 65536) ||
+ (search > target && search - target <= 65536)) {
+ gen = btrfs_node_ptr_generation(node, nr);
+ readahead_tree_block(root, search);
+ nread += blocksize;
+ }
+ nscan++;
+ if ((nread > 65536 || nscan > 32))
+ break;
+ }
+}
+
+static noinline void reada_for_balance(struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ int slot;
+ int nritems;
+ struct extent_buffer *parent;
+ struct extent_buffer *eb;
+ u64 gen;
+ u64 block1 = 0;
+ u64 block2 = 0;
+
+ parent = path->nodes[level + 1];
+ if (!parent)
+ return;
+
+ nritems = btrfs_header_nritems(parent);
+ slot = path->slots[level + 1];
+
+ if (slot > 0) {
+ block1 = btrfs_node_blockptr(parent, slot - 1);
+ gen = btrfs_node_ptr_generation(parent, slot - 1);
+ eb = btrfs_find_tree_block(root->fs_info, block1);
+ /*
+ * if we get -eagain from btrfs_buffer_uptodate, we
+ * don't want to return eagain here. That will loop
+ * forever
+ */
+ if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
+ block1 = 0;
+ free_extent_buffer(eb);
+ }
+ if (slot + 1 < nritems) {
+ block2 = btrfs_node_blockptr(parent, slot + 1);
+ gen = btrfs_node_ptr_generation(parent, slot + 1);
+ eb = btrfs_find_tree_block(root->fs_info, block2);
+ if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
+ block2 = 0;
+ free_extent_buffer(eb);
+ }
+
+ if (block1)
+ readahead_tree_block(root, block1);
+ if (block2)
+ readahead_tree_block(root, block2);
+}
+
+
+/*
+ * when we walk down the tree, it is usually safe to unlock the higher layers
+ * in the tree. The exceptions are when our path goes through slot 0, because
+ * operations on the tree might require changing key pointers higher up in the
+ * tree.
+ *
+ * callers might also have set path->keep_locks, which tells this code to keep
+ * the lock if the path points to the last slot in the block. This is part of
+ * walking through the tree, and selecting the next slot in the higher block.
+ *
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
+ * if lowest_unlock is 1, level 0 won't be unlocked
+ */
+static noinline void unlock_up(struct btrfs_path *path, int level,
+ int lowest_unlock, int min_write_lock_level,
+ int *write_lock_level)
+{
+ int i;
+ int skip_level = level;
+ int no_skips = 0;
+ struct extent_buffer *t;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ break;
+ if (!path->locks[i])
+ break;
+ if (!no_skips && path->slots[i] == 0) {
+ skip_level = i + 1;
+ continue;
+ }
+ if (!no_skips && path->keep_locks) {
+ u32 nritems;
+ t = path->nodes[i];
+ nritems = btrfs_header_nritems(t);
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
+ skip_level = i + 1;
+ continue;
+ }
+ }
+ if (skip_level < i && i >= lowest_unlock)
+ no_skips = 1;
+
+ t = path->nodes[i];
+ if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+ btrfs_tree_unlock_rw(t, path->locks[i]);
+ path->locks[i] = 0;
+ if (write_lock_level &&
+ i > min_write_lock_level &&
+ i <= *write_lock_level) {
+ *write_lock_level = i - 1;
+ }
+ }
+ }
+}
+
+/*
+ * This releases any locks held in the path starting at level and
+ * going all the way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few
+ * corner cases, such as COW of the block at slot zero in the node. This
+ * ignores those rules, and it should only be called when there are no
+ * more updates to be done higher up in the tree.
+ */
+noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+ int i;
+
+ if (path->keep_locks)
+ return;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ continue;
+ if (!path->locks[i])
+ continue;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ path->locks[i] = 0;
+ }
+}
+
+/*
+ * helper function for btrfs_search_slot. The goal is to find a block
+ * in cache without setting the path to blocking. If we find the block
+ * we return zero and the path is unchanged.
+ *
+ * If we can't find the block, we set the path blocking and do some
+ * reada. -EAGAIN is returned and the search must be repeated.
+ */
+static int
+read_block_for_search(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p,
+ struct extent_buffer **eb_ret, int level, int slot,
+ struct btrfs_key *key, u64 time_seq)
+{
+ u64 blocknr;
+ u64 gen;
+ struct extent_buffer *b = *eb_ret;
+ struct extent_buffer *tmp;
+ int ret;
+
+ blocknr = btrfs_node_blockptr(b, slot);
+ gen = btrfs_node_ptr_generation(b, slot);
+
+ tmp = btrfs_find_tree_block(root->fs_info, blocknr);
+ if (tmp) {
+ /* first we do an atomic uptodate check */
+ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ *eb_ret = tmp;
+ return 0;
+ }
+
+ /* the pages were up to date, but we failed
+ * the generation number check. Do a full
+ * read for the generation number that is correct.
+ * We must do this without dropping locks so
+ * we can trust our generation number
+ */
+ btrfs_set_path_blocking(p);
+
+ /* now we're allowed to do a blocking uptodate check */
+ ret = btrfs_read_buffer(tmp, gen);
+ if (!ret) {
+ *eb_ret = tmp;
+ return 0;
+ }
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
+ }
+
+ /*
+ * reduce lock contention at high levels
+ * of the btree by dropping locks before
+ * we read. Don't release the lock on the current
+ * level because we need to walk this node to figure
+ * out which blocks to read.
+ */
+ btrfs_unlock_up_safe(p, level + 1);
+ btrfs_set_path_blocking(p);
+
+ free_extent_buffer(tmp);
+ if (p->reada)
+ reada_for_search(root, p, level, slot, key->objectid);
+
+ btrfs_release_path(p);
+
+ ret = -EAGAIN;
+ tmp = read_tree_block(root, blocknr, 0);
+ if (tmp) {
+ /*
+ * If the read above didn't mark this buffer up to date,
+ * it will never end up being up to date. Set ret to EIO now
+ * and give up so that our caller doesn't loop forever
+ * on our EAGAINs.
+ */
+ if (!btrfs_buffer_uptodate(tmp, 0, 0))
+ ret = -EIO;
+ free_extent_buffer(tmp);
+ }
+ return ret;
+}
+
+/*
+ * helper function for btrfs_search_slot. This does all of the checks
+ * for node-level blocks and does any balancing required based on
+ * the ins_len.
+ *
+ * If no extra work was required, zero is returned. If we had to
+ * drop the path, -EAGAIN is returned and btrfs_search_slot must
+ * start over
+ */
+static int
+setup_nodes_for_search(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p,
+ struct extent_buffer *b, int level, int ins_len,
+ int *write_lock_level)
+{
+ int ret;
+ if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
+ BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+ int sret;
+
+ if (*write_lock_level < level + 1) {
+ *write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
+ sret = split_node(trans, root, p, level);
+ btrfs_clear_path_blocking(p, NULL, 0);
+
+ BUG_ON(sret > 0);
+ if (sret) {
+ ret = sret;
+ goto done;
+ }
+ b = p->nodes[level];
+ } else if (ins_len < 0 && btrfs_header_nritems(b) <
+ BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
+ int sret;
+
+ if (*write_lock_level < level + 1) {
+ *write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ btrfs_set_path_blocking(p);
+ reada_for_balance(root, p, level);
+ sret = balance_level(trans, root, p, level);
+ btrfs_clear_path_blocking(p, NULL, 0);
+
+ if (sret) {
+ ret = sret;
+ goto done;
+ }
+ b = p->nodes[level];
+ if (!b) {
+ btrfs_release_path(p);
+ goto again;
+ }
+ BUG_ON(btrfs_header_nritems(b) == 1);
+ }
+ return 0;
+
+again:
+ ret = -EAGAIN;
+done:
+ return ret;
+}
+
+static void key_search_validate(struct extent_buffer *b,
+ struct btrfs_key *key,
+ int level)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+ struct btrfs_disk_key disk_key;
+
+ btrfs_cpu_key_to_disk(&disk_key, key);
+
+ if (level == 0)
+ ASSERT(!memcmp_extent_buffer(b, &disk_key,
+ offsetof(struct btrfs_leaf, items[0].key),
+ sizeof(disk_key)));
+ else
+ ASSERT(!memcmp_extent_buffer(b, &disk_key,
+ offsetof(struct btrfs_node, ptrs[0].key),
+ sizeof(disk_key)));
+#endif
+}
+
+static int key_search(struct extent_buffer *b, struct btrfs_key *key,
+ int level, int *prev_cmp, int *slot)
+{
+ if (*prev_cmp != 0) {
+ *prev_cmp = bin_search(b, key, level, slot);
+ return *prev_cmp;
+ }
+
+ key_search_validate(b, key, level);
+ *slot = 0;
+
+ return 0;
+}
+
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u64 iobjectid, u64 ioff, u8 key_type,
+ struct btrfs_key *found_key)
+{
+ int ret;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+
+ ASSERT(path);
+ ASSERT(found_key);
+
+ key.type = key_type;
+ key.objectid = iobjectid;
+ key.offset = ioff;
+
+ ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ eb = path->nodes[0];
+ if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(fs_root, path);
+ if (ret)
+ return ret;
+ eb = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
+ if (found_key->type != key.type ||
+ found_key->objectid != key.objectid)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * look for key in the tree. path is filled in with nodes along the way
+ * if key is found, we return zero and you can find the item in the leaf
+ * level of the path (level 0)
+ *
+ * If the key isn't found, the path points to the slot where it should
+ * be inserted, and 1 is returned. If there are other errors during the
+ * search a negative error number is returned.
+ *
+ * if ins_len > 0, nodes and leaves will be split as we walk down the
+ * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
+ * possible)
+ */
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_path *p, int
+ ins_len, int cow)
+{
+ struct extent_buffer *b;
+ int slot;
+ int ret;
+ int err;
+ int level;
+ int lowest_unlock = 1;
+ int root_lock;
+ /* everything at write_lock_level or lower must be write locked */
+ int write_lock_level = 0;
+ u8 lowest_level = 0;
+ int min_write_lock_level;
+ int prev_cmp;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(lowest_level && ins_len > 0);
+ WARN_ON(p->nodes[0] != NULL);
+ BUG_ON(!cow && ins_len);
+
+ if (ins_len < 0) {
+ lowest_unlock = 2;
+
+ /* when we are removing items, we might have to go up to level
+ * two as we update tree pointers Make sure we keep write
+ * for those levels as well
+ */
+ write_lock_level = 2;
+ } else if (ins_len > 0) {
+ /*
+ * for inserting items, make sure we have a write lock on
+ * level 1 so we can update keys
+ */
+ write_lock_level = 1;
+ }
+
+ if (!cow)
+ write_lock_level = -1;
+
+ if (cow && (p->keep_locks || p->lowest_level))
+ write_lock_level = BTRFS_MAX_LEVEL;
+
+ min_write_lock_level = write_lock_level;
+
+again:
+ prev_cmp = -1;
+ /*
+ * we try very hard to do read locks on the root
+ */
+ root_lock = BTRFS_READ_LOCK;
+ level = 0;
+ if (p->search_commit_root) {
+ /*
+ * the commit roots are read only
+ * so we always do read locks
+ */
+ if (p->need_commit_sem)
+ down_read(&root->fs_info->commit_root_sem);
+ b = root->commit_root;
+ extent_buffer_get(b);
+ level = btrfs_header_level(b);
+ if (p->need_commit_sem)
+ up_read(&root->fs_info->commit_root_sem);
+ if (!p->skip_locking)
+ btrfs_tree_read_lock(b);
+ } else {
+ if (p->skip_locking) {
+ b = btrfs_root_node(root);
+ level = btrfs_header_level(b);
+ } else {
+ /* we don't know the level of the root node
+ * until we actually have it read locked
+ */
+ b = btrfs_read_lock_root_node(root);
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ /* whoops, must trade for write lock */
+ btrfs_tree_read_unlock(b);
+ free_extent_buffer(b);
+ b = btrfs_lock_root_node(root);
+ root_lock = BTRFS_WRITE_LOCK;
+
+ /* the level might have changed, check again */
+ level = btrfs_header_level(b);
+ }
+ }
+ }
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = root_lock;
+
+ while (b) {
+ level = btrfs_header_level(b);
+
+ /*
+ * setup the path here so we can release it under lock
+ * contention with the cow code
+ */
+ if (cow) {
+ /*
+ * if we don't really need to cow this block
+ * then we don't want to set the path blocking,
+ * so we test it here
+ */
+ if (!should_cow_block(trans, root, b))
+ goto cow_done;
+
+ /*
+ * must have write locks on this node and the
+ * parent
+ */
+ if (level > write_lock_level ||
+ (level + 1 > write_lock_level &&
+ level + 1 < BTRFS_MAX_LEVEL &&
+ p->nodes[level + 1])) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ btrfs_set_path_blocking(p);
+ err = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b);
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ }
+cow_done:
+ p->nodes[level] = b;
+ btrfs_clear_path_blocking(p, NULL, 0);
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way to for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ *
+ * If we're inserting or deleting (ins_len != 0), then we might
+ * be changing slot zero, which may require changing the parent.
+ * So, we can't drop the lock until after we know which slot
+ * we're operating on.
+ */
+ if (!ins_len && !p->keep_locks) {
+ int u = level + 1;
+
+ if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
+ btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
+ p->locks[u] = 0;
+ }
+ }
+
+ ret = key_search(b, key, level, &prev_cmp, &slot);
+
+ if (level != 0) {
+ int dec = 0;
+ if (ret && slot > 0) {
+ dec = 1;
+ slot -= 1;
+ }
+ p->slots[level] = slot;
+ err = setup_nodes_for_search(trans, root, p, b, level,
+ ins_len, &write_lock_level);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ b = p->nodes[level];
+ slot = p->slots[level];
+
+ /*
+ * slot 0 is special, if we change the key
+ * we have to update the parent pointer
+ * which means we must have a write lock
+ * on the parent
+ */
+ if (slot == 0 && ins_len &&
+ write_lock_level < level + 1) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(trans, root, p,
+ &b, level, slot, key, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ if (!p->skip_locking) {
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ err = btrfs_try_tree_write_lock(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_WRITE_LOCK);
+ }
+ p->locks[level] = BTRFS_WRITE_LOCK;
+ } else {
+ err = btrfs_tree_read_lock_atomic(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_READ_LOCK);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ }
+ p->nodes[level] = b;
+ }
+ } else {
+ p->slots[level] = slot;
+ if (ins_len > 0 &&
+ btrfs_leaf_free_space(root, b) < ins_len) {
+ if (write_lock_level < 1) {
+ write_lock_level = 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ btrfs_set_path_blocking(p);
+ err = split_leaf(trans, root, key,
+ p, ins_len, ret == 0);
+ btrfs_clear_path_blocking(p, NULL, 0);
+
+ BUG_ON(err > 0);
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ }
+ if (!p->search_for_split)
+ unlock_up(p, level, lowest_unlock,
+ min_write_lock_level, &write_lock_level);
+ goto done;
+ }
+ }
+ ret = 1;
+done:
+ /*
+ * we don't really know what they plan on doing with the path
+ * from here on, so for now just mark it as blocking
+ */
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
+ if (ret < 0 && !p->skip_release_on_error)
+ btrfs_release_path(p);
+ return ret;
+}
+
+/*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq)
+{
+ struct extent_buffer *b;
+ int slot;
+ int ret;
+ int err;
+ int level;
+ int lowest_unlock = 1;
+ u8 lowest_level = 0;
+ int prev_cmp = -1;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(p->nodes[0] != NULL);
+
+ if (p->search_commit_root) {
+ BUG_ON(time_seq);
+ return btrfs_search_slot(NULL, root, key, p, 0, 0);
+ }
+
+again:
+ b = get_old_root(root, time_seq);
+ level = btrfs_header_level(b);
+ p->locks[level] = BTRFS_READ_LOCK;
+
+ while (b) {
+ level = btrfs_header_level(b);
+ p->nodes[level] = b;
+ btrfs_clear_path_blocking(p, NULL, 0);
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way to for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ */
+ btrfs_unlock_up_safe(p, level + 1);
+
+ /*
+ * Since we can unwind eb's we want to do a real search every
+ * time.
+ */
+ prev_cmp = -1;
+ ret = key_search(b, key, level, &prev_cmp, &slot);
+
+ if (level != 0) {
+ int dec = 0;
+ if (ret && slot > 0) {
+ dec = 1;
+ slot -= 1;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(NULL, root, p, &b, level,
+ slot, key, time_seq);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ err = btrfs_tree_read_lock_atomic(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_READ_LOCK);
+ }
+ b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
+ if (!b) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
+ } else {
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+ goto done;
+ }
+ }
+ ret = 1;
+done:
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
+ if (ret < 0)
+ btrfs_release_path(p);
+
+ return ret;
+}
+
+/*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int find_higher, int return_any)
+{
+ int ret;
+ struct extent_buffer *leaf;
+
+again:
+ ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+ if (ret <= 0)
+ return ret;
+ /*
+ * a return value of 1 means the path is at the position where the
+ * item should be inserted. Normally this is the next bigger item,
+ * but in case the previous item is the last in a leaf, path points
+ * to the first free slot in the previous leaf, i.e. at an invalid
+ * item.
+ */
+ leaf = p->nodes[0];
+
+ if (find_higher) {
+ if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, p);
+ if (ret <= 0)
+ return ret;
+ if (!return_any)
+ return 1;
+ /*
+ * no higher item found, return the next
+ * lower instead
+ */
+ return_any = 0;
+ find_higher = 0;
+ btrfs_release_path(p);
+ goto again;
+ }
+ } else {
+ if (p->slots[0] == 0) {
+ ret = btrfs_prev_leaf(root, p);
+ if (ret < 0)
+ return ret;
+ if (!ret) {
+ leaf = p->nodes[0];
+ if (p->slots[0] == btrfs_header_nritems(leaf))
+ p->slots[0]--;
+ return 0;
+ }
+ if (!return_any)
+ return 1;
+ /*
+ * no lower item found, return the next
+ * higher instead
+ */
+ return_any = 0;
+ find_higher = 1;
+ btrfs_release_path(p);
+ goto again;
+ } else {
+ --p->slots[0];
+ }
+ }
+ return 0;
+}
+
+/*
+ * adjust the pointers going up the tree, starting at level
+ * making sure the right key of each node is points to 'key'.
+ * This is used after shifting pointers to the left, so it stops
+ * fixing up pointers when a given leaf/node is not in slot 0 of the
+ * higher levels
+ *
+ */
+static void fixup_low_keys(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_disk_key *key, int level)
+{
+ int i;
+ struct extent_buffer *t;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ int tslot = path->slots[i];
+ if (!path->nodes[i])
+ break;
+ t = path->nodes[i];
+ tree_mod_log_set_node_key(fs_info, t, tslot, 1);
+ btrfs_set_node_key(t, key, tslot);
+ btrfs_mark_buffer_dirty(path->nodes[i]);
+ if (tslot != 0)
+ break;
+ }
+}
+
+/*
+ * update item key.
+ *
+ * This function isn't completely safe. It's the caller's responsibility
+ * that the new key won't break the order
+ */
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *eb;
+ int slot;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot > 0) {
+ btrfs_item_key(eb, &disk_key, slot - 1);
+ BUG_ON(comp_keys(&disk_key, new_key) >= 0);
+ }
+ if (slot < btrfs_header_nritems(eb) - 1) {
+ btrfs_item_key(eb, &disk_key, slot + 1);
+ BUG_ON(comp_keys(&disk_key, new_key) <= 0);
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, new_key);
+ btrfs_set_item_key(eb, &disk_key, slot);
+ btrfs_mark_buffer_dirty(eb);
+ if (slot == 0)
+ fixup_low_keys(fs_info, path, &disk_key, 1);
+}
+
+/*
+ * try to push data from one node into the next node left in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the left hand block.
+ */
+static int push_node_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *dst,
+ struct extent_buffer *src, int empty)
+{
+ int push_items = 0;
+ int src_nritems;
+ int dst_nritems;
+ int ret = 0;
+
+ src_nritems = btrfs_header_nritems(src);
+ dst_nritems = btrfs_header_nritems(dst);
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+ WARN_ON(btrfs_header_generation(src) != trans->transid);
+ WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+ if (!empty && src_nritems <= 8)
+ return 1;
+
+ if (push_items <= 0)
+ return 1;
+
+ if (empty) {
+ push_items = min(src_nritems, push_items);
+ if (push_items < src_nritems) {
+ /* leave at least 8 pointers in the node if
+ * we aren't going to empty it
+ */
+ if (src_nritems - push_items < 8) {
+ if (push_items <= 8)
+ return 1;
+ push_items -= 8;
+ }
+ }
+ } else
+ push_items = min(src_nritems - 8, push_items);
+
+ ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+ push_items);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ copy_extent_buffer(dst, src,
+ btrfs_node_key_ptr_offset(dst_nritems),
+ btrfs_node_key_ptr_offset(0),
+ push_items * sizeof(struct btrfs_key_ptr));
+
+ if (push_items < src_nritems) {
+ /*
+ * don't call tree_mod_log_eb_move here, key removal was already
+ * fully logged by tree_mod_log_eb_copy above.
+ */
+ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(push_items),
+ (src_nritems - push_items) *
+ sizeof(struct btrfs_key_ptr));
+ }
+ btrfs_set_header_nritems(src, src_nritems - push_items);
+ btrfs_set_header_nritems(dst, dst_nritems + push_items);
+ btrfs_mark_buffer_dirty(src);
+ btrfs_mark_buffer_dirty(dst);
+
+ return ret;
+}
+
+/*
+ * try to push data from one node into the next node right in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the right hand block.
+ *
+ * this will only push up to 1/2 the contents of the left node over
+ */
+static int balance_node_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *dst,
+ struct extent_buffer *src)
+{
+ int push_items = 0;
+ int max_push;
+ int src_nritems;
+ int dst_nritems;
+ int ret = 0;
+
+ WARN_ON(btrfs_header_generation(src) != trans->transid);
+ WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+ src_nritems = btrfs_header_nritems(src);
+ dst_nritems = btrfs_header_nritems(dst);
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+ if (push_items <= 0)
+ return 1;
+
+ if (src_nritems < 4)
+ return 1;
+
+ max_push = src_nritems / 2 + 1;
+ /* don't try to empty the node */
+ if (max_push >= src_nritems)
+ return 1;
+
+ if (max_push < push_items)
+ push_items = max_push;
+
+ tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
+ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
+ btrfs_node_key_ptr_offset(0),
+ (dst_nritems) *
+ sizeof(struct btrfs_key_ptr));
+
+ ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+ src_nritems - push_items, push_items);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ copy_extent_buffer(dst, src,
+ btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(src_nritems - push_items),
+ push_items * sizeof(struct btrfs_key_ptr));
+
+ btrfs_set_header_nritems(src, src_nritems - push_items);
+ btrfs_set_header_nritems(dst, dst_nritems + push_items);
+
+ btrfs_mark_buffer_dirty(src);
+ btrfs_mark_buffer_dirty(dst);
+
+ return ret;
+}
+
+/*
+ * helper function to insert a new root level in the tree.
+ * A new node is allocated, and a single item is inserted to
+ * point to the existing root
+ *
+ * returns zero on success or < 0 on failure.
+ */
+static noinline int insert_new_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ u64 lower_gen;
+ struct extent_buffer *lower;
+ struct extent_buffer *c;
+ struct extent_buffer *old;
+ struct btrfs_disk_key lower_key;
+
+ BUG_ON(path->nodes[level]);
+ BUG_ON(path->nodes[level-1] != root->node);
+
+ lower = path->nodes[level-1];
+ if (level == 1)
+ btrfs_item_key(lower, &lower_key, 0);
+ else
+ btrfs_node_key(lower, &lower_key, 0);
+
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+
+ root_add_used(root, root->nodesize);
+
+ memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_nritems(c, 1);
+ btrfs_set_header_level(c, level);
+ btrfs_set_header_bytenr(c, c->start);
+ btrfs_set_header_generation(c, trans->transid);
+ btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(c, root->root_key.objectid);
+
+ write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
+ BTRFS_FSID_SIZE);
+
+ write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
+ btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
+
+ btrfs_set_node_key(c, &lower_key, 0);
+ btrfs_set_node_blockptr(c, 0, lower->start);
+ lower_gen = btrfs_header_generation(lower);
+ WARN_ON(lower_gen != trans->transid);
+
+ btrfs_set_node_ptr_generation(c, 0, lower_gen);
+
+ btrfs_mark_buffer_dirty(c);
+
+ old = root->node;
+ tree_mod_log_set_root_pointer(root, c, 0);
+ rcu_assign_pointer(root->node, c);
+
+ /* the super has an extra ref to root->node */
+ free_extent_buffer(old);
+
+ add_root_to_dirty_list(root);
+ extent_buffer_get(c);
+ path->nodes[level] = c;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->slots[level] = 0;
+ return 0;
+}
+
+/*
+ * worker function to insert a single pointer in a node.
+ * the node should have enough room for the pointer already
+ *
+ * slot and level indicate where you want the key to go, and
+ * blocknr is the block the key points to.
+ */
+static void insert_ptr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, u64 bytenr,
+ int slot, int level)
+{
+ struct extent_buffer *lower;
+ int nritems;
+ int ret;
+
+ BUG_ON(!path->nodes[level]);
+ btrfs_assert_tree_locked(path->nodes[level]);
+ lower = path->nodes[level];
+ nritems = btrfs_header_nritems(lower);
+ BUG_ON(slot > nritems);
+ BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
+ if (slot != nritems) {
+ if (level)
+ tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+ slot, nritems - slot);
+ memmove_extent_buffer(lower,
+ btrfs_node_key_ptr_offset(slot + 1),
+ btrfs_node_key_ptr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_key_ptr));
+ }
+ if (level) {
+ ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+ MOD_LOG_KEY_ADD, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
+ btrfs_set_node_key(lower, key, slot);
+ btrfs_set_node_blockptr(lower, slot, bytenr);
+ WARN_ON(trans->transid == 0);
+ btrfs_set_node_ptr_generation(lower, slot, trans->transid);
+ btrfs_set_header_nritems(lower, nritems + 1);
+ btrfs_mark_buffer_dirty(lower);
+}
+
+/*
+ * split the node at the specified level in path in two.
+ * The path is corrected to point to the appropriate node after the split
+ *
+ * Before splitting this tries to make some room in the node by pushing
+ * left and right, if either one works, it returns right away.
+ *
+ * returns 0 on success and < 0 on failure
+ */
+static noinline int split_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct extent_buffer *c;
+ struct extent_buffer *split;
+ struct btrfs_disk_key disk_key;
+ int mid;
+ int ret;
+ u32 c_nritems;
+
+ c = path->nodes[level];
+ WARN_ON(btrfs_header_generation(c) != trans->transid);
+ if (c == root->node) {
+ /*
+ * trying to split the root, lets make a new one
+ *
+ * tree mod log: We don't log_removal old root in
+ * insert_new_root, because that root buffer will be kept as a
+ * normal node. We are going to log removal of half of the
+ * elements below with tree_mod_log_eb_copy. We're holding a
+ * tree lock on the buffer, which is why we cannot race with
+ * other tree_mod_log users.
+ */
+ ret = insert_new_root(trans, root, path, level + 1);
+ if (ret)
+ return ret;
+ } else {
+ ret = push_nodes_for_insert(trans, root, path, level);
+ c = path->nodes[level];
+ if (!ret && btrfs_header_nritems(c) <
+ BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
+ return 0;
+ if (ret < 0)
+ return ret;
+ }
+
+ c_nritems = btrfs_header_nritems(c);
+ mid = (c_nritems + 1) / 2;
+ btrfs_node_key(c, &disk_key, mid);
+
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0);
+ if (IS_ERR(split))
+ return PTR_ERR(split);
+
+ root_add_used(root, root->nodesize);
+
+ memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_level(split, btrfs_header_level(c));
+ btrfs_set_header_bytenr(split, split->start);
+ btrfs_set_header_generation(split, trans->transid);
+ btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(split, root->root_key.objectid);
+ write_extent_buffer(split, root->fs_info->fsid,
+ btrfs_header_fsid(), BTRFS_FSID_SIZE);
+ write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
+ btrfs_header_chunk_tree_uuid(split),
+ BTRFS_UUID_SIZE);
+
+ ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
+ mid, c_nritems - mid);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ copy_extent_buffer(split, c,
+ btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(mid),
+ (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
+ btrfs_set_header_nritems(split, c_nritems - mid);
+ btrfs_set_header_nritems(c, mid);
+ ret = 0;
+
+ btrfs_mark_buffer_dirty(c);
+ btrfs_mark_buffer_dirty(split);
+
+ insert_ptr(trans, root, path, &disk_key, split->start,
+ path->slots[level + 1] + 1, level + 1);
+
+ if (path->slots[level] >= mid) {
+ path->slots[level] -= mid;
+ btrfs_tree_unlock(c);
+ free_extent_buffer(c);
+ path->nodes[level] = split;
+ path->slots[level + 1] += 1;
+ } else {
+ btrfs_tree_unlock(split);
+ free_extent_buffer(split);
+ }
+ return ret;
+}
+
+/*
+ * how many bytes are required to store the items in a leaf. start
+ * and nr indicate which items in the leaf to check. This totals up the
+ * space used both by the item structs and the item data
+ */
+static int leaf_space_used(struct extent_buffer *l, int start, int nr)
+{
+ struct btrfs_item *start_item;
+ struct btrfs_item *end_item;
+ struct btrfs_map_token token;
+ int data_len;
+ int nritems = btrfs_header_nritems(l);
+ int end = min(nritems, start + nr) - 1;
+
+ if (!nr)
+ return 0;
+ btrfs_init_map_token(&token);
+ start_item = btrfs_item_nr(start);
+ end_item = btrfs_item_nr(end);
+ data_len = btrfs_token_item_offset(l, start_item, &token) +
+ btrfs_token_item_size(l, start_item, &token);
+ data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
+ data_len += sizeof(struct btrfs_item) * nr;
+ WARN_ON(data_len < 0);
+ return data_len;
+}
+
+/*
+ * The space between the end of the leaf items and
+ * the start of the leaf data. IOW, how much room
+ * the leaf has left for both items and data
+ */
+noinline int btrfs_leaf_free_space(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ int nritems = btrfs_header_nritems(leaf);
+ int ret;
+ ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
+ if (ret < 0) {
+ btrfs_crit(root->fs_info,
+ "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
+ ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
+ leaf_space_used(leaf, 0, nritems), nritems);
+ }
+ return ret;
+}
+
+/*
+ * min slot controls the lowest index we're willing to push to the
+ * right. We'll push up to and including min_slot, but no lower
+ */
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int data_size, int empty,
+ struct extent_buffer *right,
+ int free_space, u32 left_nritems,
+ u32 min_slot)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *upper = path->nodes[1];
+ struct btrfs_map_token token;
+ struct btrfs_disk_key disk_key;
+ int slot;
+ u32 i;
+ int push_space = 0;
+ int push_items = 0;
+ struct btrfs_item *item;
+ u32 nr;
+ u32 right_nritems;
+ u32 data_end;
+ u32 this_item_size;
+
+ btrfs_init_map_token(&token);
+
+ if (empty)
+ nr = 0;
+ else
+ nr = max_t(u32, 1, min_slot);
+
+ if (path->slots[0] >= left_nritems)
+ push_space += data_size;
+
+ slot = path->slots[1];
+ i = left_nritems - 1;
+ while (i >= nr) {
+ item = btrfs_item_nr(i);
+
+ if (!empty && push_items > 0) {
+ if (path->slots[0] > i)
+ break;
+ if (path->slots[0] == i) {
+ int space = btrfs_leaf_free_space(root, left);
+ if (space + push_space * 2 > free_space)
+ break;
+ }
+ }
+
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+ this_item_size = btrfs_item_size(left, item);
+ if (this_item_size + sizeof(*item) + push_space > free_space)
+ break;
+
+ push_items++;
+ push_space += this_item_size + sizeof(*item);
+ if (i == 0)
+ break;
+ i--;
+ }
+
+ if (push_items == 0)
+ goto out_unlock;
+
+ WARN_ON(!empty && push_items == left_nritems);
+
+ /* push left to right */
+ right_nritems = btrfs_header_nritems(right);
+
+ push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+ push_space -= leaf_data_end(root, left);
+
+ /* make room in the right data area */
+ data_end = leaf_data_end(root, right);
+ memmove_extent_buffer(right,
+ btrfs_leaf_data(right) + data_end - push_space,
+ btrfs_leaf_data(right) + data_end,
+ BTRFS_LEAF_DATA_SIZE(root) - data_end);
+
+ /* copy from the left data area */
+ copy_extent_buffer(right, left, btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_SIZE(root) - push_space,
+ btrfs_leaf_data(left) + leaf_data_end(root, left),
+ push_space);
+
+ memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
+ btrfs_item_nr_offset(0),
+ right_nritems * sizeof(struct btrfs_item));
+
+ /* copy the items from left to right */
+ copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(left_nritems - push_items),
+ push_items * sizeof(struct btrfs_item));
+
+ /* update the item pointers */
+ right_nritems += push_items;
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(root);
+ for (i = 0; i < right_nritems; i++) {
+ item = btrfs_item_nr(i);
+ push_space -= btrfs_token_item_size(right, item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
+ }
+
+ left_nritems -= push_items;
+ btrfs_set_header_nritems(left, left_nritems);
+
+ if (left_nritems)
+ btrfs_mark_buffer_dirty(left);
+ else
+ clean_tree_block(trans, root->fs_info, left);
+
+ btrfs_mark_buffer_dirty(right);
+
+ btrfs_item_key(right, &disk_key, 0);
+ btrfs_set_node_key(upper, &disk_key, slot + 1);
+ btrfs_mark_buffer_dirty(upper);
+
+ /* then fixup the leaf pointer in the path */
+ if (path->slots[0] >= left_nritems) {
+ path->slots[0] -= left_nritems;
+ if (btrfs_header_nritems(path->nodes[0]) == 0)
+ clean_tree_block(trans, root->fs_info, path->nodes[0]);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 0;
+
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf. It won't
+ * push any slot lower than min_slot
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ int min_data_size, int data_size,
+ int empty, u32 min_slot)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *right;
+ struct extent_buffer *upper;
+ int slot;
+ int free_space;
+ u32 left_nritems;
+ int ret;
+
+ if (!path->nodes[1])
+ return 1;
+
+ slot = path->slots[1];
+ upper = path->nodes[1];
+ if (slot >= btrfs_header_nritems(upper) - 1)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ right = read_node_slot(root, upper, slot + 1);
+ if (right == NULL)
+ return 1;
+
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, right, upper,
+ slot + 1, &right);
+ if (ret)
+ goto out_unlock;
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ left_nritems = btrfs_header_nritems(left);
+ if (left_nritems == 0)
+ goto out_unlock;
+
+ if (path->slots[0] == left_nritems && !empty) {
+ /* Key greater than all keys in the leaf, right neighbor has
+ * enough room for it and we're not emptying our leaf to delete
+ * it, therefore use right neighbor to insert the new item and
+ * no need to touch/dirty our left leaft. */
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ path->slots[1]++;
+ return 0;
+ }
+
+ return __push_leaf_right(trans, root, path, min_data_size, empty,
+ right, free_space, left_nritems, min_slot);
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items. The
+ * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
+ * items
+ */
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int data_size,
+ int empty, struct extent_buffer *left,
+ int free_space, u32 right_nritems,
+ u32 max_slot)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *right = path->nodes[0];
+ int i;
+ int push_space = 0;
+ int push_items = 0;
+ struct btrfs_item *item;
+ u32 old_left_nritems;
+ u32 nr;
+ int ret = 0;
+ u32 this_item_size;
+ u32 old_left_item_size;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ if (empty)
+ nr = min(right_nritems, max_slot);
+ else
+ nr = min(right_nritems - 1, max_slot);
+
+ for (i = 0; i < nr; i++) {
+ item = btrfs_item_nr(i);
+
+ if (!empty && push_items > 0) {
+ if (path->slots[0] < i)
+ break;
+ if (path->slots[0] == i) {
+ int space = btrfs_leaf_free_space(root, right);
+ if (space + push_space * 2 > free_space)
+ break;
+ }
+ }
+
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+ this_item_size = btrfs_item_size(right, item);
+ if (this_item_size + sizeof(*item) + push_space > free_space)
+ break;
+
+ push_items++;
+ push_space += this_item_size + sizeof(*item);
+ }
+
+ if (push_items == 0) {
+ ret = 1;
+ goto out;
+ }
+ WARN_ON(!empty && push_items == btrfs_header_nritems(right));
+
+ /* push data from right to left */
+ copy_extent_buffer(left, right,
+ btrfs_item_nr_offset(btrfs_header_nritems(left)),
+ btrfs_item_nr_offset(0),
+ push_items * sizeof(struct btrfs_item));
+
+ push_space = BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_item_offset_nr(right, push_items - 1);
+
+ copy_extent_buffer(left, right, btrfs_leaf_data(left) +
+ leaf_data_end(root, left) - push_space,
+ btrfs_leaf_data(right) +
+ btrfs_item_offset_nr(right, push_items - 1),
+ push_space);
+ old_left_nritems = btrfs_header_nritems(left);
+ BUG_ON(old_left_nritems <= 0);
+
+ old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+ for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(i);
+
+ ioff = btrfs_token_item_offset(left, item, &token);
+ btrfs_set_token_item_offset(left, item,
+ ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
+ &token);
+ }
+ btrfs_set_header_nritems(left, old_left_nritems + push_items);
+
+ /* fixup right node */
+ if (push_items > right_nritems)
+ WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
+ right_nritems);
+
+ if (push_items < right_nritems) {
+ push_space = btrfs_item_offset_nr(right, push_items - 1) -
+ leaf_data_end(root, right);
+ memmove_extent_buffer(right, btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_SIZE(root) - push_space,
+ btrfs_leaf_data(right) +
+ leaf_data_end(root, right), push_space);
+
+ memmove_extent_buffer(right, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(push_items),
+ (btrfs_header_nritems(right) - push_items) *
+ sizeof(struct btrfs_item));
+ }
+ right_nritems -= push_items;
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(root);
+ for (i = 0; i < right_nritems; i++) {
+ item = btrfs_item_nr(i);
+
+ push_space = push_space - btrfs_token_item_size(right,
+ item, &token);
+ btrfs_set_token_item_offset(right, item, push_space, &token);
+ }
+
+ btrfs_mark_buffer_dirty(left);
+ if (right_nritems)
+ btrfs_mark_buffer_dirty(right);
+ else
+ clean_tree_block(trans, root->fs_info, right);
+
+ btrfs_item_key(right, &disk_key, 0);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
+
+ /* then fixup the leaf pointer in the path */
+ if (path->slots[0] < push_items) {
+ path->slots[0] += old_left_nritems;
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = left;
+ path->slots[1] -= 1;
+ } else {
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ path->slots[0] -= push_items;
+ }
+ BUG_ON(path->slots[0] < 0);
+ return ret;
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items. The
+ * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
+ * items
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int min_data_size,
+ int data_size, int empty, u32 max_slot)
+{
+ struct extent_buffer *right = path->nodes[0];
+ struct extent_buffer *left;
+ int slot;
+ int free_space;
+ u32 right_nritems;
+ int ret = 0;
+
+ slot = path->slots[1];
+ if (slot == 0)
+ return 1;
+ if (!path->nodes[1])
+ return 1;
+
+ right_nritems = btrfs_header_nritems(right);
+ if (right_nritems == 0)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ left = read_node_slot(root, path->nodes[1], slot - 1);
+ if (left == NULL)
+ return 1;
+
+ btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, left,
+ path->nodes[1], slot - 1, &left);
+ if (ret) {
+ /* we hit -ENOSPC, but it isn't fatal here */
+ if (ret == -ENOSPC)
+ ret = 1;
+ goto out;
+ }
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ return __push_leaf_left(trans, root, path, min_data_size,
+ empty, left, free_space, right_nritems,
+ max_slot);
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ */
+static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
+{
+ int data_copy_size;
+ int rt_data_off;
+ int i;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ nritems = nritems - mid;
+ btrfs_set_header_nritems(right, nritems);
+ data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+ copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(mid),
+ nritems * sizeof(struct btrfs_item));
+
+ copy_extent_buffer(right, l,
+ btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+ data_copy_size, btrfs_leaf_data(l) +
+ leaf_data_end(root, l), data_copy_size);
+
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_item_end_nr(l, mid);
+
+ for (i = 0; i < nritems; i++) {
+ struct btrfs_item *item = btrfs_item_nr(i);
+ u32 ioff;
+
+ ioff = btrfs_token_item_offset(right, item, &token);
+ btrfs_set_token_item_offset(right, item,
+ ioff + rt_data_off, &token);
+ }
+
+ btrfs_set_header_nritems(l, mid);
+ btrfs_item_key(right, &disk_key, 0);
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
+
+ btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(l);
+ BUG_ON(path->slots[0] != slot);
+
+ if (mid <= slot) {
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] -= mid;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+
+ BUG_ON(path->slots[0] < 0);
+}
+
+/*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf. A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ * A B C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves. If all goes well we can avoid the double split
+ * completely.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int data_size)
+{
+ int ret;
+ int progress = 0;
+ int slot;
+ u32 nritems;
+ int space_needed = data_size;
+
+ slot = path->slots[0];
+ if (slot < btrfs_header_nritems(path->nodes[0]))
+ space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
+
+ /*
+ * try to push all the items after our slot into the
+ * right leaf
+ */
+ ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0)
+ progress++;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ /*
+ * our goal is to get our slot at the start or end of a leaf. If
+ * we've done so we're done
+ */
+ if (path->slots[0] == 0 || path->slots[0] == nritems)
+ return 0;
+
+ if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+ return 0;
+
+ /* try to push all the items before our slot into the next leaf */
+ slot = path->slots[0];
+ ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0)
+ progress++;
+
+ if (progress)
+ return 0;
+ return 1;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int split_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *ins_key,
+ struct btrfs_path *path, int data_size,
+ int extend)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *l;
+ u32 nritems;
+ int mid;
+ int slot;
+ struct extent_buffer *right;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
+ int wret;
+ int split;
+ int num_doubles = 0;
+ int tried_avoid_double = 0;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+ return -EOVERFLOW;
+
+ /* first try to make some room by pushing left and right */
+ if (data_size && path->nodes[1]) {
+ int space_needed = data_size;
+
+ if (slot < btrfs_header_nritems(l))
+ space_needed -= btrfs_leaf_free_space(root, l);
+
+ wret = push_leaf_right(trans, root, path, space_needed,
+ space_needed, 0, 0);
+ if (wret < 0)
+ return wret;
+ if (wret) {
+ wret = push_leaf_left(trans, root, path, space_needed,
+ space_needed, 0, (u32)-1);
+ if (wret < 0)
+ return wret;
+ }
+ l = path->nodes[0];
+
+ /* did the pushes work? */
+ if (btrfs_leaf_free_space(root, l) >= data_size)
+ return 0;
+ }
+
+ if (!path->nodes[1]) {
+ ret = insert_new_root(trans, root, path, 1);
+ if (ret)
+ return ret;
+ }
+again:
+ split = 1;
+ l = path->nodes[0];
+ slot = path->slots[0];
+ nritems = btrfs_header_nritems(l);
+ mid = (nritems + 1) / 2;
+
+ if (mid <= slot) {
+ if (nritems == 1 ||
+ leaf_space_used(l, mid, nritems - mid) + data_size >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ if (slot >= nritems) {
+ split = 0;
+ } else {
+ mid = slot;
+ if (mid != nritems &&
+ leaf_space_used(l, mid, nritems - mid) +
+ data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ if (data_size && !tried_avoid_double)
+ goto push_for_double;
+ split = 2;
+ }
+ }
+ }
+ } else {
+ if (leaf_space_used(l, 0, mid) + data_size >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ if (!extend && data_size && slot == 0) {
+ split = 0;
+ } else if ((extend || !data_size) && slot == 0) {
+ mid = 1;
+ } else {
+ mid = slot;
+ if (mid != nritems &&
+ leaf_space_used(l, mid, nritems - mid) +
+ data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ if (data_size && !tried_avoid_double)
+ goto push_for_double;
+ split = 2;
+ }
+ }
+ }
+ }
+
+ if (split == 0)
+ btrfs_cpu_key_to_disk(&disk_key, ins_key);
+ else
+ btrfs_item_key(l, &disk_key, mid);
+
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0);
+ if (IS_ERR(right))
+ return PTR_ERR(right);
+
+ root_add_used(root, root->nodesize);
+
+ memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(right, right->start);
+ btrfs_set_header_generation(right, trans->transid);
+ btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(right, root->root_key.objectid);
+ btrfs_set_header_level(right, 0);
+ write_extent_buffer(right, fs_info->fsid,
+ btrfs_header_fsid(), BTRFS_FSID_SIZE);
+
+ write_extent_buffer(right, fs_info->chunk_tree_uuid,
+ btrfs_header_chunk_tree_uuid(right),
+ BTRFS_UUID_SIZE);
+
+ if (split == 0) {
+ if (mid <= slot) {
+ btrfs_set_header_nritems(right, 0);
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ path->slots[1] += 1;
+ } else {
+ btrfs_set_header_nritems(right, 0);
+ insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1], 1);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ if (path->slots[1] == 0)
+ fixup_low_keys(fs_info, path, &disk_key, 1);
+ }
+ btrfs_mark_buffer_dirty(right);
+ return ret;
+ }
+
+ copy_for_split(trans, root, path, l, right, slot, mid, nritems);
+
+ if (split == 2) {
+ BUG_ON(num_doubles != 0);
+ num_doubles++;
+ goto again;
+ }
+
+ return 0;
+
+push_for_double:
+ push_for_double_split(trans, root, path, data_size);
+ tried_avoid_double = 1;
+ if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+ return 0;
+ goto again;
+}
+
+static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int ins_len)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_len = 0;
+ u32 item_size;
+ int ret;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+ key.type != BTRFS_EXTENT_CSUM_KEY);
+
+ if (btrfs_leaf_free_space(root, leaf) >= ins_len)
+ return 0;
+
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+ }
+ btrfs_release_path(path);
+
+ path->keep_locks = 1;
+ path->search_for_split = 1;
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ path->search_for_split = 0;
+ if (ret > 0)
+ ret = -EAGAIN;
+ if (ret < 0)
+ goto err;
+
+ ret = -EAGAIN;
+ leaf = path->nodes[0];
+ /* if our item isn't there, return now */
+ if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ goto err;
+
+ /* the leaf has changed, it now has room. return now */
+ if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+ goto err;
+
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
+ goto err;
+ }
+
+ btrfs_set_path_blocking(path);
+ ret = split_leaf(trans, root, &key, path, ins_len, 1);
+ if (ret)
+ goto err;
+
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+ return 0;
+err:
+ path->keep_locks = 0;
+ return ret;
+}
+
+static noinline int split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ struct btrfs_item *new_item;
+ int slot;
+ char *buf;
+ u32 nritems;
+ u32 item_size;
+ u32 orig_offset;
+ struct btrfs_disk_key disk_key;
+
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+ btrfs_set_path_blocking(path);
+
+ item = btrfs_item_nr(path->slots[0]);
+ orig_offset = btrfs_item_offset(leaf, item);
+ item_size = btrfs_item_size(leaf, item);
+
+ buf = kmalloc(item_size, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
+ path->slots[0]), item_size);
+
+ slot = path->slots[0] + 1;
+ nritems = btrfs_header_nritems(leaf);
+ if (slot != nritems) {
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, new_key);
+ btrfs_set_item_key(leaf, &disk_key, slot);
+
+ new_item = btrfs_item_nr(slot);
+
+ btrfs_set_item_offset(leaf, new_item, orig_offset);
+ btrfs_set_item_size(leaf, new_item, item_size - split_offset);
+
+ btrfs_set_item_offset(leaf, item,
+ orig_offset + item_size - split_offset);
+ btrfs_set_item_size(leaf, item, split_offset);
+
+ btrfs_set_header_nritems(leaf, nritems + 1);
+
+ /* write the data for the start of the original item */
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ split_offset);
+
+ /* write the data for the new item */
+ write_extent_buffer(leaf, buf + split_offset,
+ btrfs_item_ptr_offset(leaf, slot),
+ item_size - split_offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
+ kfree(buf);
+ return 0;
+}
+
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation. After
+ * the split, the path is pointing to the old item. The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be smaller enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset)
+{
+ int ret;
+ ret = setup_leaf_for_split(trans, root, path,
+ sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ ret = split_item(trans, root, path, new_key, split_offset);
+ return ret;
+}
+
+/*
+ * This function duplicate a item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item
+ * is contiguous with the original item.
+ *
+ * This allows us to split file extent in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ setup_items_for_insert(root, path, new_key, &item_size,
+ item_size, item_size +
+ sizeof(struct btrfs_item), 1);
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
+ * make the item pointed to by the path smaller. new_size indicates
+ * how small to make it, and from_end tells us if we just chop bytes
+ * off the end of the item or if we shift the item to chop bytes off
+ * the front.
+ */
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
+ u32 new_size, int from_end)
+{
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data_start;
+ unsigned int old_size;
+ unsigned int size_diff;
+ int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ old_size = btrfs_item_size_nr(leaf, slot);
+ if (old_size == new_size)
+ return;
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ old_data_start = btrfs_item_offset_nr(leaf, slot);
+
+ size_diff = old_size - new_size;
+
+ BUG_ON(slot < 0);
+ BUG_ON(slot >= nritems);
+
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+ item = btrfs_item_nr(i);
+
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + size_diff, &token);
+ }
+
+ /* shift the data */
+ if (from_end) {
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end + size_diff, btrfs_leaf_data(leaf) +
+ data_end, old_data_start + new_size - data_end);
+ } else {
+ struct btrfs_disk_key disk_key;
+ u64 offset;
+
+ btrfs_item_key(leaf, &disk_key, slot);
+
+ if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
+ unsigned long ptr;
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ fi = (struct btrfs_file_extent_item *)(
+ (unsigned long)fi - size_diff);
+
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ memmove_extent_buffer(leaf, ptr,
+ (unsigned long)fi,
+ BTRFS_FILE_EXTENT_INLINE_DATA_START);
+ }
+ }
+
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end + size_diff, btrfs_leaf_data(leaf) +
+ data_end, old_data_start - data_end);
+
+ offset = btrfs_disk_key_offset(&disk_key);
+ btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
+ btrfs_set_item_key(leaf, &disk_key, slot);
+ if (slot == 0)
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
+ }
+
+ item = btrfs_item_nr(slot);
+ btrfs_set_item_size(leaf, item, new_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+}
+
+/*
+ * make the item pointed to by the path bigger, data_size is the added size.
+ */
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size)
+{
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data;
+ unsigned int old_size;
+ int i;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ leaf = path->nodes[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < data_size) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+ slot = path->slots[0];
+ old_data = btrfs_item_end_nr(leaf, slot);
+
+ BUG_ON(slot < 0);
+ if (slot >= nritems) {
+ btrfs_print_leaf(root, leaf);
+ btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
+ slot, nritems);
+ BUG_ON(1);
+ }
+
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+ item = btrfs_item_nr(i);
+
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - data_size, &token);
+ }
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end - data_size, btrfs_leaf_data(leaf) +
+ data_end, old_data - data_end);
+
+ data_end = old_data;
+ old_size = btrfs_item_size_nr(leaf, slot);
+ item = btrfs_item_nr(slot);
+ btrfs_set_item_size(leaf, item, old_size + data_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+}
+
+/*
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
+ */
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
+{
+ struct btrfs_item *item;
+ int i;
+ u32 nritems;
+ unsigned int data_end;
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_map_token token;
+
+ if (path->slots[0] == 0) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
+ }
+ btrfs_unlock_up_safe(path, 1);
+
+ btrfs_init_map_token(&token);
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < total_size) {
+ btrfs_print_leaf(root, leaf);
+ btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
+ total_size, btrfs_leaf_free_space(root, leaf));
+ BUG();
+ }
+
+ if (slot != nritems) {
+ unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+
+ if (old_data < data_end) {
+ btrfs_print_leaf(root, leaf);
+ btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
+ slot, old_data, data_end);
+ BUG_ON(1);
+ }
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr( i);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff - total_data, &token);
+ }
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end - total_data, btrfs_leaf_data(leaf) +
+ data_end, old_data - data_end);
+ data_end = old_data;
+ }
+
+ /* setup the item for the new data */
+ for (i = 0; i < nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ btrfs_set_item_key(leaf, &disk_key, slot + i);
+ item = btrfs_item_nr(slot + i);
+ btrfs_set_token_item_offset(leaf, item,
+ data_end - data_size[i], &token);
+ data_end -= data_size[i];
+ btrfs_set_token_item_size(leaf, item, data_size[i], &token);
+ }
+
+ btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr)
+{
+ int ret = 0;
+ int slot;
+ int i;
+ u32 total_size = 0;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ if (ret == 0)
+ return -EEXIST;
+ if (ret < 0)
+ return ret;
+
+ slot = path->slots[0];
+ BUG_ON(slot < 0);
+
+ setup_items_for_insert(root, path, cpu_key, data_size,
+ total_data, total_size, nr);
+ return 0;
+}
+
+/*
+ * Given a key and some data, insert an item into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *cpu_key, void *data, u32
+ data_size)
+{
+ int ret = 0;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+ if (!ret) {
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, data, ptr, data_size);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * delete the pointer from a given node.
+ *
+ * the tree should have been previously balanced so the deletion does not
+ * empty a node.
+ */
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+ int level, int slot)
+{
+ struct extent_buffer *parent = path->nodes[level];
+ u32 nritems;
+ int ret;
+
+ nritems = btrfs_header_nritems(parent);
+ if (slot != nritems - 1) {
+ if (level)
+ tree_mod_log_eb_move(root->fs_info, parent, slot,
+ slot + 1, nritems - slot - 1);
+ memmove_extent_buffer(parent,
+ btrfs_node_key_ptr_offset(slot),
+ btrfs_node_key_ptr_offset(slot + 1),
+ sizeof(struct btrfs_key_ptr) *
+ (nritems - slot - 1));
+ } else if (level) {
+ ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+ MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
+
+ nritems--;
+ btrfs_set_header_nritems(parent, nritems);
+ if (nritems == 0 && parent == root->node) {
+ BUG_ON(btrfs_header_level(root->node) != 1);
+ /* just turn the root into a leaf and break */
+ btrfs_set_header_level(root->node, 0);
+ } else if (slot == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(parent, &disk_key, 0);
+ fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
+ }
+ btrfs_mark_buffer_dirty(parent);
+}
+
+/*
+ * a helper function to delete the leaf pointed to by path->slots[1] and
+ * path->nodes[1].
+ *
+ * This deletes the pointer in path->nodes[1] and frees the leaf
+ * block extent. zero is returned if it all worked out, < 0 otherwise.
+ *
+ * The path must have already been setup for deleting the leaf, including
+ * all the proper balancing. path->nodes[1] must be locked.
+ */
+static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *leaf)
+{
+ WARN_ON(btrfs_header_generation(leaf) != trans->transid);
+ del_ptr(root, path, 1, path->slots[1]);
+
+ /*
+ * btrfs_free_extent is expensive, we want to make sure we
+ * aren't holding any locks when we call it
+ */
+ btrfs_unlock_up_safe(path, 0);
+
+ root_sub_used(root, leaf->len);
+
+ extent_buffer_get(leaf);
+ btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ free_extent_buffer_stale(leaf);
+}
+/*
+ * delete the item at the leaf level in path. If that empties
+ * the leaf, remove it from the tree
+ */
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int slot, int nr)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ int last_off;
+ int dsize = 0;
+ int ret = 0;
+ int wret;
+ int i;
+ u32 nritems;
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ leaf = path->nodes[0];
+ last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size_nr(leaf, slot + i);
+
+ nritems = btrfs_header_nritems(leaf);
+
+ if (slot + nr != nritems) {
+ int data_end = leaf_data_end(root, leaf);
+
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end + dsize,
+ btrfs_leaf_data(leaf) + data_end,
+ last_off - data_end);
+
+ for (i = slot + nr; i < nritems; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(i);
+ ioff = btrfs_token_item_offset(leaf, item, &token);
+ btrfs_set_token_item_offset(leaf, item,
+ ioff + dsize, &token);
+ }
+
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
+ btrfs_item_nr_offset(slot + nr),
+ sizeof(struct btrfs_item) *
+ (nritems - slot - nr));
+ }
+ btrfs_set_header_nritems(leaf, nritems - nr);
+ nritems -= nr;
+
+ /* delete the leaf if we've emptied it */
+ if (nritems == 0) {
+ if (leaf == root->node) {
+ btrfs_set_header_level(leaf, 0);
+ } else {
+ btrfs_set_path_blocking(path);
+ clean_tree_block(trans, root->fs_info, leaf);
+ btrfs_del_leaf(trans, root, path, leaf);
+ }
+ } else {
+ int used = leaf_space_used(leaf, 0, nritems);
+ if (slot == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_item_key(leaf, &disk_key, 0);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
+ }
+
+ /* delete the leaf if it is mostly empty */
+ if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
+ /* push_leaf_left fixes the path.
+ * make sure the path still points to our leaf
+ * for possible call to del_ptr below
+ */
+ slot = path->slots[1];
+ extent_buffer_get(leaf);
+
+ btrfs_set_path_blocking(path);
+ wret = push_leaf_left(trans, root, path, 1, 1,
+ 1, (u32)-1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+
+ if (path->nodes[0] == leaf &&
+ btrfs_header_nritems(leaf)) {
+ wret = push_leaf_right(trans, root, path, 1,
+ 1, 1, 0);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+ }
+
+ if (btrfs_header_nritems(leaf) == 0) {
+ path->slots[1] = slot;
+ btrfs_del_leaf(trans, root, path, leaf);
+ free_extent_buffer(leaf);
+ ret = 0;
+ } else {
+ /* if we're still in the path, make sure
+ * we're dirty. Otherwise, one of the
+ * push_leaf functions must have already
+ * dirtied this buffer
+ */
+ if (path->nodes[0] == leaf)
+ btrfs_mark_buffer_dirty(leaf);
+ free_extent_buffer(leaf);
+ }
+ } else {
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ }
+ return ret;
+}
+
+/*
+ * search the tree again to find a leaf with lesser keys
+ * returns 0 if it found something or 1 if there are no lesser leaves.
+ * returns < 0 on io errors.
+ *
+ * This may release the path, and so you may lose any locks held at the
+ * time you call it.
+ */
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ struct btrfs_disk_key found_key;
+ int ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
+
+ if (key.offset > 0) {
+ key.offset--;
+ } else if (key.type > 0) {
+ key.type--;
+ key.offset = (u64)-1;
+ } else if (key.objectid > 0) {
+ key.objectid--;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+ } else {
+ return 1;
+ }
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ btrfs_item_key(path->nodes[0], &found_key, 0);
+ ret = comp_keys(&found_key, &key);
+ /*
+ * We might have had an item with the previous key in the tree right
+ * before we released our path. And after we released our path, that
+ * item might have been pushed to the first slot (0) of the leaf we
+ * were holding due to a tree balance. Alternatively, an item with the
+ * previous key can exist as the only element of a leaf (big fat item).
+ * Therefore account for these 2 cases, so that our callers (like
+ * btrfs_previous_item) don't miss an existing item with a key matching
+ * the previous key we computed above.
+ */
+ if (ret <= 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * A helper function to walk down the tree starting at min_key, and looking
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
+ *
+ * This does not cow, but it does stuff the starting key it finds back
+ * into min_key, so you can call btrfs_search_slot with cow=1 on the
+ * key and get a writable path.
+ *
+ * This does lock as it descends, and path->keep_locks should be set
+ * to 1 by the caller.
+ *
+ * This honors path->lowest_level to prevent descent past a given level
+ * of the tree.
+ *
+ * min_trans indicates the oldest transaction that you are interested
+ * in walking through. Any nodes or leaves older than min_trans are
+ * skipped over (without reading them).
+ *
+ * returns zero if something useful was found, < 0 on error and 1 if there
+ * was nothing in the tree that matched the search criteria.
+ */
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+ struct btrfs_path *path,
+ u64 min_trans)
+{
+ struct extent_buffer *cur;
+ struct btrfs_key found_key;
+ int slot;
+ int sret;
+ u32 nritems;
+ int level;
+ int ret = 1;
+ int keep_locks = path->keep_locks;
+
+ path->keep_locks = 1;
+again:
+ cur = btrfs_read_lock_root_node(root);
+ level = btrfs_header_level(cur);
+ WARN_ON(path->nodes[level]);
+ path->nodes[level] = cur;
+ path->locks[level] = BTRFS_READ_LOCK;
+
+ if (btrfs_header_generation(cur) < min_trans) {
+ ret = 1;
+ goto out;
+ }
+ while (1) {
+ nritems = btrfs_header_nritems(cur);
+ level = btrfs_header_level(cur);
+ sret = bin_search(cur, min_key, level, &slot);
+
+ /* at the lowest level, we're done, setup the path and exit */
+ if (level == path->lowest_level) {
+ if (slot >= nritems)
+ goto find_next_key;
+ ret = 0;
+ path->slots[level] = slot;
+ btrfs_item_key_to_cpu(cur, &found_key, slot);
+ goto out;
+ }
+ if (sret && slot > 0)
+ slot--;
+ /*
+ * check this node pointer against the min_trans parameters.
+ * If it is too old, old, skip to the next one.
+ */
+ while (slot < nritems) {
+ u64 gen;
+
+ gen = btrfs_node_ptr_generation(cur, slot);
+ if (gen < min_trans) {
+ slot++;
+ continue;
+ }
+ break;
+ }
+find_next_key:
+ /*
+ * we didn't find a candidate key in this node, walk forward
+ * and find another one
+ */
+ if (slot >= nritems) {
+ path->slots[level] = slot;
+ btrfs_set_path_blocking(path);
+ sret = btrfs_find_next_key(root, path, min_key, level,
+ min_trans);
+ if (sret == 0) {
+ btrfs_release_path(path);
+ goto again;
+ } else {
+ goto out;
+ }
+ }
+ /* save our key for returning back */
+ btrfs_node_key_to_cpu(cur, &found_key, slot);
+ path->slots[level] = slot;
+ if (level == path->lowest_level) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_set_path_blocking(path);
+ cur = read_node_slot(root, cur, slot);
+ BUG_ON(!cur); /* -ENOMEM */
+
+ btrfs_tree_read_lock(cur);
+
+ path->locks[level - 1] = BTRFS_READ_LOCK;
+ path->nodes[level - 1] = cur;
+ unlock_up(path, level, 1, 0, NULL);
+ btrfs_clear_path_blocking(path, NULL, 0);
+ }
+out:
+ path->keep_locks = keep_locks;
+ if (ret == 0) {
+ btrfs_unlock_up_safe(path, path->lowest_level + 1);
+ btrfs_set_path_blocking(path);
+ memcpy(min_key, &found_key, sizeof(found_key));
+ }
+ return ret;
+}
+
+static void tree_move_down(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int root_level)
+{
+ BUG_ON(*level == 0);
+ path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
+ path->slots[*level]);
+ path->slots[*level - 1] = 0;
+ (*level)--;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int root_level)
+{
+ int ret = 0;
+ int nritems;
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+
+ path->slots[*level]++;
+
+ while (path->slots[*level] >= nritems) {
+ if (*level == root_level)
+ return -1;
+
+ /* move upnext */
+ path->slots[*level] = 0;
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ (*level)++;
+ path->slots[*level]++;
+
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int root_level,
+ int allow_down,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ if (*level == 0 || !allow_down) {
+ ret = tree_move_next_or_upnext(root, path, level, root_level);
+ } else {
+ tree_move_down(root, path, level, root_level);
+ ret = 0;
+ }
+ if (ret >= 0) {
+ if (*level == 0)
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ else
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ }
+ return ret;
+}
+
+static int tree_compare_item(struct btrfs_root *left_root,
+ struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ char *tmp_buf)
+{
+ int cmp;
+ int len1, len2;
+ unsigned long off1, off2;
+
+ len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+ len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+ if (len1 != len2)
+ return 1;
+
+ off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+ off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+ right_path->slots[0]);
+
+ read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+ cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+ if (cmp)
+ return 1;
+ return 0;
+}
+
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ btrfs_changed_cb_t changed_cb, void *ctx)
+{
+ int ret;
+ int cmp;
+ struct btrfs_path *left_path = NULL;
+ struct btrfs_path *right_path = NULL;
+ struct btrfs_key left_key;
+ struct btrfs_key right_key;
+ char *tmp_buf = NULL;
+ int left_root_level;
+ int right_root_level;
+ int left_level;
+ int right_level;
+ int left_end_reached;
+ int right_end_reached;
+ int advance_left;
+ int advance_right;
+ u64 left_blockptr;
+ u64 right_blockptr;
+ u64 left_gen;
+ u64 right_gen;
+
+ left_path = btrfs_alloc_path();
+ if (!left_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ right_path = btrfs_alloc_path();
+ if (!right_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
+ if (!tmp_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ left_path->search_commit_root = 1;
+ left_path->skip_locking = 1;
+ right_path->search_commit_root = 1;
+ right_path->skip_locking = 1;
+
+ /*
+ * Strategy: Go to the first items of both trees. Then do
+ *
+ * If both trees are at level 0
+ * Compare keys of current items
+ * If left < right treat left item as new, advance left tree
+ * and repeat
+ * If left > right treat right item as deleted, advance right tree
+ * and repeat
+ * If left == right do deep compare of items, treat as changed if
+ * needed, advance both trees and repeat
+ * If both trees are at the same level but not at level 0
+ * Compare keys of current nodes/leafs
+ * If left < right advance left tree and repeat
+ * If left > right advance right tree and repeat
+ * If left == right compare blockptrs of the next nodes/leafs
+ * If they match advance both trees but stay at the same level
+ * and repeat
+ * If they don't match advance both trees while allowing to go
+ * deeper and repeat
+ * If tree levels are different
+ * Advance the tree that needs it and repeat
+ *
+ * Advancing a tree means:
+ * If we are at level 0, try to go to the next slot. If that's not
+ * possible, go one level up and repeat. Stop when we found a level
+ * where we could go to the next slot. We may at this point be on a
+ * node or a leaf.
+ *
+ * If we are not at level 0 and not on shared tree blocks, go one
+ * level deeper.
+ *
+ * If we are not at level 0 and on shared tree blocks, go one slot to
+ * the right if possible or go up and right.
+ */
+
+ down_read(&left_root->fs_info->commit_root_sem);
+ left_level = btrfs_header_level(left_root->commit_root);
+ left_root_level = left_level;
+ left_path->nodes[left_level] = left_root->commit_root;
+ extent_buffer_get(left_path->nodes[left_level]);
+
+ right_level = btrfs_header_level(right_root->commit_root);
+ right_root_level = right_level;
+ right_path->nodes[right_level] = right_root->commit_root;
+ extent_buffer_get(right_path->nodes[right_level]);
+ up_read(&left_root->fs_info->commit_root_sem);
+
+ if (left_level == 0)
+ btrfs_item_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ else
+ btrfs_node_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ if (right_level == 0)
+ btrfs_item_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+ else
+ btrfs_node_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+
+ left_end_reached = right_end_reached = 0;
+ advance_left = advance_right = 0;
+
+ while (1) {
+ if (advance_left && !left_end_reached) {
+ ret = tree_advance(left_root, left_path, &left_level,
+ left_root_level,
+ advance_left != ADVANCE_ONLY_NEXT,
+ &left_key);
+ if (ret < 0)
+ left_end_reached = ADVANCE;
+ advance_left = 0;
+ }
+ if (advance_right && !right_end_reached) {
+ ret = tree_advance(right_root, right_path, &right_level,
+ right_root_level,
+ advance_right != ADVANCE_ONLY_NEXT,
+ &right_key);
+ if (ret < 0)
+ right_end_reached = ADVANCE;
+ advance_right = 0;
+ }
+
+ if (left_end_reached && right_end_reached) {
+ ret = 0;
+ goto out;
+ } else if (left_end_reached) {
+ if (right_level == 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_right = ADVANCE;
+ continue;
+ } else if (right_end_reached) {
+ if (left_level == 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_left = ADVANCE;
+ continue;
+ }
+
+ if (left_level == 0 && right_level == 0) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_right = ADVANCE;
+ } else {
+ enum btrfs_compare_tree_result result;
+
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+ ret = tree_compare_item(left_root, left_path,
+ right_path, tmp_buf);
+ if (ret)
+ result = BTRFS_COMPARE_TREE_CHANGED;
+ else
+ result = BTRFS_COMPARE_TREE_SAME;
+ ret = changed_cb(left_root, right_root,
+ left_path, right_path,
+ &left_key, result, ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ } else if (left_level == right_level) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ advance_right = ADVANCE;
+ } else {
+ left_blockptr = btrfs_node_blockptr(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_blockptr = btrfs_node_blockptr(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ left_gen = btrfs_node_ptr_generation(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_gen = btrfs_node_ptr_generation(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr &&
+ left_gen == right_gen) {
+ /*
+ * As we're on a shared block, don't
+ * allow to go deeper.
+ */
+ advance_left = ADVANCE_ONLY_NEXT;
+ advance_right = ADVANCE_ONLY_NEXT;
+ } else {
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ }
+ } else if (left_level < right_level) {
+ advance_right = ADVANCE;
+ } else {
+ advance_left = ADVANCE;
+ }
+ }
+
+out:
+ btrfs_free_path(left_path);
+ btrfs_free_path(right_path);
+ kfree(tmp_buf);
+ return ret;
+}
+
+/*
+ * this is similar to btrfs_next_leaf, but does not try to preserve
+ * and fixup the path. It looks for and returns the next key in the
+ * tree based on the current path and the min_trans parameters.
+ *
+ * 0 is returned if another key is found, < 0 if there are any errors
+ * and 1 is returned if there are no higher keys in the tree
+ *
+ * path->keep_locks should be set to 1 on the search made before
+ * calling this function.
+ */
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, int level, u64 min_trans)
+{
+ int slot;
+ struct extent_buffer *c;
+
+ WARN_ON(!path->keep_locks);
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level])
+ return 1;
+
+ slot = path->slots[level] + 1;
+ c = path->nodes[level];
+next:
+ if (slot >= btrfs_header_nritems(c)) {
+ int ret;
+ int orig_lowest;
+ struct btrfs_key cur_key;
+ if (level + 1 >= BTRFS_MAX_LEVEL ||
+ !path->nodes[level + 1])
+ return 1;
+
+ if (path->locks[level + 1]) {
+ level++;
+ continue;
+ }
+
+ slot = btrfs_header_nritems(c) - 1;
+ if (level == 0)
+ btrfs_item_key_to_cpu(c, &cur_key, slot);
+ else
+ btrfs_node_key_to_cpu(c, &cur_key, slot);
+
+ orig_lowest = path->lowest_level;
+ btrfs_release_path(path);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &cur_key, path,
+ 0, 0);
+ path->lowest_level = orig_lowest;
+ if (ret < 0)
+ return ret;
+
+ c = path->nodes[level];
+ slot = path->slots[level];
+ if (ret == 0)
+ slot++;
+ goto next;
+ }
+
+ if (level == 0)
+ btrfs_item_key_to_cpu(c, key, slot);
+ else {
+ u64 gen = btrfs_node_ptr_generation(c, slot);
+
+ if (gen < min_trans) {
+ slot++;
+ goto next;
+ }
+ btrfs_node_key_to_cpu(c, key, slot);
+ }
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * search the tree again to find a leaf with greater keys
+ * returns 0 if it found something or 1 if there are no greater leaves.
+ * returns < 0 on io errors.
+ */
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ return btrfs_next_old_leaf(root, path, 0);
+}
+
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq)
+{
+ int slot;
+ int level;
+ struct extent_buffer *c;
+ struct extent_buffer *next;
+ struct btrfs_key key;
+ u32 nritems;
+ int ret;
+ int old_spinning = path->leave_spinning;
+ int next_rw_lock = 0;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (nritems == 0)
+ return 1;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+ level = 1;
+ next = NULL;
+ next_rw_lock = 0;
+ btrfs_release_path(path);
+
+ path->keep_locks = 1;
+ path->leave_spinning = 1;
+
+ if (time_seq)
+ ret = btrfs_search_old_slot(root, &key, path, time_seq);
+ else
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ path->keep_locks = 0;
+
+ if (ret < 0)
+ return ret;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ /*
+ * by releasing the path above we dropped all our locks. A balance
+ * could have added more items next to the key that used to be
+ * at the very end of the block. So, check again here and
+ * advance the path if there are now more items available.
+ */
+ if (nritems > 0 && path->slots[0] < nritems - 1) {
+ if (ret == 0)
+ path->slots[0]++;
+ ret = 0;
+ goto done;
+ }
+ /*
+ * So the above check misses one case:
+ * - after releasing the path above, someone has removed the item that
+ * used to be at the very end of the block, and balance between leafs
+ * gets another one with bigger key.offset to replace it.
+ *
+ * This one should be returned as well, or we can get leaf corruption
+ * later(esp. in __btrfs_drop_extents()).
+ *
+ * And a bit more explanation about this check,
+ * with ret > 0, the key isn't found, the path points to the slot
+ * where it should be inserted, so the path->slots[0] item must be the
+ * bigger one.
+ */
+ if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+ ret = 0;
+ goto done;
+ }
+
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level]) {
+ ret = 1;
+ goto done;
+ }
+
+ slot = path->slots[level] + 1;
+ c = path->nodes[level];
+ if (slot >= btrfs_header_nritems(c)) {
+ level++;
+ if (level == BTRFS_MAX_LEVEL) {
+ ret = 1;
+ goto done;
+ }
+ continue;
+ }
+
+ if (next) {
+ btrfs_tree_unlock_rw(next, next_rw_lock);
+ free_extent_buffer(next);
+ }
+
+ next = c;
+ next_rw_lock = path->locks[level];
+ ret = read_block_for_search(NULL, root, path, &next, level,
+ slot, &key, 0);
+ if (ret == -EAGAIN)
+ goto again;
+
+ if (ret < 0) {
+ btrfs_release_path(path);
+ goto done;
+ }
+
+ if (!path->skip_locking) {
+ ret = btrfs_try_tree_read_lock(next);
+ if (!ret && time_seq) {
+ /*
+ * If we don't get the lock, we may be racing
+ * with push_leaf_left, holding that lock while
+ * itself waiting for the leaf we've currently
+ * locked. To solve this situation, we give up
+ * on our lock and cycle.
+ */
+ free_extent_buffer(next);
+ btrfs_release_path(path);
+ cond_resched();
+ goto again;
+ }
+ if (!ret) {
+ btrfs_set_path_blocking(path);
+ btrfs_tree_read_lock(next);
+ btrfs_clear_path_blocking(path, next,
+ BTRFS_READ_LOCK);
+ }
+ next_rw_lock = BTRFS_READ_LOCK;
+ }
+ break;
+ }
+ path->slots[level] = slot;
+ while (1) {
+ level--;
+ c = path->nodes[level];
+ if (path->locks[level])
+ btrfs_tree_unlock_rw(c, path->locks[level]);
+
+ free_extent_buffer(c);
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ if (!path->skip_locking)
+ path->locks[level] = next_rw_lock;
+ if (!level)
+ break;
+
+ ret = read_block_for_search(NULL, root, path, &next, level,
+ 0, &key, 0);
+ if (ret == -EAGAIN)
+ goto again;
+
+ if (ret < 0) {
+ btrfs_release_path(path);
+ goto done;
+ }
+
+ if (!path->skip_locking) {
+ ret = btrfs_try_tree_read_lock(next);
+ if (!ret) {
+ btrfs_set_path_blocking(path);
+ btrfs_tree_read_lock(next);
+ btrfs_clear_path_blocking(path, next,
+ BTRFS_READ_LOCK);
+ }
+ next_rw_lock = BTRFS_READ_LOCK;
+ }
+ }
+ ret = 0;
+done:
+ unlock_up(path, 0, 1, 0, NULL);
+ path->leave_spinning = old_spinning;
+ if (!old_spinning)
+ btrfs_set_path_blocking(path);
+
+ return ret;
+}
+
+/*
+ * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
+ * searching until it gets past min_objectid or finds an item of 'type'
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid,
+ int type)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+
+ while (1) {
+ if (path->slots[0] == 0) {
+ btrfs_set_path_blocking(path);
+ ret = btrfs_prev_leaf(root, path);
+ if (ret != 0)
+ return ret;
+ } else {
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (nritems == 0)
+ return 1;
+ if (path->slots[0] == nritems)
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid < min_objectid)
+ break;
+ if (found_key.type == type)
+ return 0;
+ if (found_key.objectid == min_objectid &&
+ found_key.type < type)
+ break;
+ }
+ return 1;
+}
+
+/*
+ * search in extent tree to find a previous Metadata/Data extent item with
+ * min objecitd.
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_extent_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+
+ while (1) {
+ if (path->slots[0] == 0) {
+ btrfs_set_path_blocking(path);
+ ret = btrfs_prev_leaf(root, path);
+ if (ret != 0)
+ return ret;
+ } else {
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (nritems == 0)
+ return 1;
+ if (path->slots[0] == nritems)
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid < min_objectid)
+ break;
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ found_key.type == BTRFS_METADATA_ITEM_KEY)
+ return 0;
+ if (found_key.objectid == min_objectid &&
+ found_key.type < BTRFS_EXTENT_ITEM_KEY)
+ break;
+ }
+ return 1;
+}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000..6f364e1d8
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,4247 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_CTREE__
+#define __BTRFS_CTREE__
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
+#include <linux/completion.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/kobject.h>
+#include <trace/events/btrfs.h>
+#include <asm/kmap_types.h>
+#include <linux/pagemap.h>
+#include <linux/btrfs.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "async-thread.h"
+
+struct btrfs_trans_handle;
+struct btrfs_transaction;
+struct btrfs_pending_snapshot;
+extern struct kmem_cache *btrfs_trans_handle_cachep;
+extern struct kmem_cache *btrfs_transaction_cachep;
+extern struct kmem_cache *btrfs_bit_radix_cachep;
+extern struct kmem_cache *btrfs_path_cachep;
+extern struct kmem_cache *btrfs_free_space_cachep;
+struct btrfs_ordered_sum;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define STATIC noinline
+#else
+#define STATIC static noinline
+#endif
+
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
+
+#define BTRFS_MAX_MIRRORS 3
+
+#define BTRFS_MAX_LEVEL 8
+
+#define BTRFS_COMPAT_EXTENT_TREE_V0
+
+/* holds pointers to all of the tree roots */
+#define BTRFS_ROOT_TREE_OBJECTID 1ULL
+
+/* stores information about which extents are in use, and reference counts */
+#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+
+/*
+ * chunk tree stores translations from logical -> physical block numbering
+ * the super block points to the chunk tree
+ */
+#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
+
+/*
+ * stores information about which areas of a given device are in use.
+ * one per device. The tree of tree roots points to the device tree
+ */
+#define BTRFS_DEV_TREE_OBJECTID 4ULL
+
+/* one per subvolume, storing files and directories */
+#define BTRFS_FS_TREE_OBJECTID 5ULL
+
+/* directory objectid inside the root tree */
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
+
+/* holds checksums of all the data extents */
+#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
+/* for storing items that use the BTRFS_UUID_KEY* types */
+#define BTRFS_UUID_TREE_OBJECTID 9ULL
+
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
+/* orhpan objectid for tracking unlinked/truncated files */
+#define BTRFS_ORPHAN_OBJECTID -5ULL
+
+/* does write ahead logging to speed up fsyncs */
+#define BTRFS_TREE_LOG_OBJECTID -6ULL
+#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
+
+/* for space balancing */
+#define BTRFS_TREE_RELOC_OBJECTID -8ULL
+#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
+
+/*
+ * extent checksums all have this objectid
+ * this allows them to share the logging tree
+ * for fsyncs
+ */
+#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+
+/* For storing free space cache */
+#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+
+/*
+ * The inode number assigned to the special inode for storing
+ * free ino cache
+ */
+#define BTRFS_FREE_INO_OBJECTID -12ULL
+
+/* dummy objectid represents multiple objectids */
+#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
+
+/*
+ * All files have objectids in this range.
+ */
+#define BTRFS_FIRST_FREE_OBJECTID 256ULL
+#define BTRFS_LAST_FREE_OBJECTID -256ULL
+#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
+
+
+/*
+ * the device items go into the chunk tree. The key is in the form
+ * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
+ */
+#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
+#define BTRFS_DEV_REPLACE_DEVID 0ULL
+
+/*
+ * the max metadata block size. This limit is somewhat artificial,
+ * but the memmove costs go through the roof for larger blocks.
+ */
+#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
+
+/*
+ * we can actually store much bigger names, but lets not confuse the rest
+ * of linux
+ */
+#define BTRFS_NAME_LEN 255
+
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
+/* 32 bytes in various csum fields */
+#define BTRFS_CSUM_SIZE 32
+
+/* csum types */
+#define BTRFS_CSUM_TYPE_CRC32 0
+
+static int btrfs_csum_sizes[] = { 4, 0 };
+
+/* four bytes for CRC32 */
+#define BTRFS_EMPTY_DIR_SIZE 0
+
+/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS (1 << 30)
+
+#define BTRFS_FT_UNKNOWN 0
+#define BTRFS_FT_REG_FILE 1
+#define BTRFS_FT_DIR 2
+#define BTRFS_FT_CHRDEV 3
+#define BTRFS_FT_BLKDEV 4
+#define BTRFS_FT_FIFO 5
+#define BTRFS_FT_SOCK 6
+#define BTRFS_FT_SYMLINK 7
+#define BTRFS_FT_XATTR 8
+#define BTRFS_FT_MAX 9
+
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
+#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+
+#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+
+/*
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
+ *
+ * offset is the starting byte offset for this key in the stream.
+ *
+ * btrfs_disk_key is in disk byte order. struct btrfs_key is always
+ * in cpu native order. Otherwise they are identical and their sizes
+ * should be the same (ie both packed)
+ */
+struct btrfs_disk_key {
+ __le64 objectid;
+ u8 type;
+ __le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_key {
+ u64 objectid;
+ u8 type;
+ u64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_mapping_tree {
+ struct extent_map_tree map_tree;
+};
+
+struct btrfs_dev_item {
+ /* the internal btrfs device id */
+ __le64 devid;
+
+ /* size of the device */
+ __le64 total_bytes;
+
+ /* bytes used */
+ __le64 bytes_used;
+
+ /* optimal io alignment for this device */
+ __le32 io_align;
+
+ /* optimal io width for this device */
+ __le32 io_width;
+
+ /* minimal io size for this device */
+ __le32 sector_size;
+
+ /* type and info about this device */
+ __le64 type;
+
+ /* expected generation for this device */
+ __le64 generation;
+
+ /*
+ * starting byte of this partition on the device,
+ * to allow for stripe alignment in the future
+ */
+ __le64 start_offset;
+
+ /* grouping information for allocation decisions */
+ __le32 dev_group;
+
+ /* seek speed 0-100 where 100 is fastest */
+ u8 seek_speed;
+
+ /* bandwidth 0-100 where 100 is fastest */
+ u8 bandwidth;
+
+ /* btrfs generated uuid for this device */
+ u8 uuid[BTRFS_UUID_SIZE];
+
+ /* uuid of FS who owns this device */
+ u8 fsid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_stripe {
+ __le64 devid;
+ __le64 offset;
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_chunk {
+ /* size of this chunk in bytes */
+ __le64 length;
+
+ /* objectid of the root referencing this chunk */
+ __le64 owner;
+
+ __le64 stripe_len;
+ __le64 type;
+
+ /* optimal io alignment for this chunk */
+ __le32 io_align;
+
+ /* optimal io width for this chunk */
+ __le32 io_width;
+
+ /* minimal io size for this chunk */
+ __le32 sector_size;
+
+ /* 2^16 stripes is quite a lot, a second limit is the size of a single
+ * item in the btree
+ */
+ __le16 num_stripes;
+
+ /* sub stripes only matter for raid10 */
+ __le16 sub_stripes;
+ struct btrfs_stripe stripe;
+ /* additional stripes go here */
+} __attribute__ ((__packed__));
+
+#define BTRFS_FREE_SPACE_EXTENT 1
+#define BTRFS_FREE_SPACE_BITMAP 2
+
+struct btrfs_free_space_entry {
+ __le64 offset;
+ __le64 bytes;
+ u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_free_space_header {
+ struct btrfs_disk_key location;
+ __le64 generation;
+ __le64 num_entries;
+ __le64 num_bitmaps;
+} __attribute__ ((__packed__));
+
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+ BUG_ON(num_stripes == 0);
+ return sizeof(struct btrfs_chunk) +
+ sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
+#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
+#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
+
+/*
+ * File system states
+ */
+#define BTRFS_FS_STATE_ERROR 0
+#define BTRFS_FS_STATE_REMOUNTING 1
+#define BTRFS_FS_STATE_TRANS_ABORTED 2
+#define BTRFS_FS_STATE_DEV_REPLACING 3
+
+/* Super block flags */
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
+
+#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
+#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
+
+#define BTRFS_BACKREF_REV_MAX 256
+#define BTRFS_BACKREF_REV_SHIFT 56
+#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
+ BTRFS_BACKREF_REV_SHIFT)
+
+#define BTRFS_OLD_BACKREF_REV 0
+#define BTRFS_MIXED_BACKREF_REV 1
+
+/*
+ * every tree block (leaf or node) starts with this header.
+ */
+struct btrfs_header {
+ /* these first four must match the super block */
+ u8 csum[BTRFS_CSUM_SIZE];
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ __le64 bytenr; /* which block this node is supposed to live in */
+ __le64 flags;
+
+ /* allowed to be different from the super from here on down */
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ __le64 generation;
+ __le64 owner;
+ __le32 nritems;
+ u8 level;
+} __attribute__ ((__packed__));
+
+#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
+ sizeof(struct btrfs_header)) / \
+ sizeof(struct btrfs_key_ptr))
+#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
+ (offsetof(struct btrfs_file_extent_item, disk_bytenr))
+#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) - \
+ BTRFS_FILE_EXTENT_INLINE_DATA_START)
+#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) -\
+ sizeof(struct btrfs_dir_item))
+
+
+/*
+ * this is a very generous portion of the super block, giving us
+ * room to translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+#define BTRFS_LABEL_SIZE 256
+
+/*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+ __le64 tree_root;
+ __le64 tree_root_gen;
+
+ __le64 chunk_root;
+ __le64 chunk_root_gen;
+
+ __le64 extent_root;
+ __le64 extent_root_gen;
+
+ __le64 fs_root;
+ __le64 fs_root_gen;
+
+ __le64 dev_root;
+ __le64 dev_root_gen;
+
+ __le64 csum_root;
+ __le64 csum_root_gen;
+
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 num_devices;
+ /* future */
+ __le64 unused_64[4];
+
+ u8 tree_root_level;
+ u8 chunk_root_level;
+ u8 extent_root_level;
+ u8 fs_root_level;
+ u8 dev_root_level;
+ u8 csum_root_level;
+ /* future and to align */
+ u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
+ * the super block basically lists the main trees of the FS
+ * it currently lacks any block count etc etc
+ */
+struct btrfs_super_block {
+ u8 csum[BTRFS_CSUM_SIZE];
+ /* the first 4 fields must match struct btrfs_header */
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ __le64 bytenr; /* this block number */
+ __le64 flags;
+
+ /* allowed to be different from the btrfs_header from here own down */
+ __le64 magic;
+ __le64 generation;
+ __le64 root;
+ __le64 chunk_root;
+ __le64 log_root;
+
+ /* this will help find the new super based on the log root */
+ __le64 log_root_transid;
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 root_dir_objectid;
+ __le64 num_devices;
+ __le32 sectorsize;
+ __le32 nodesize;
+ __le32 __unused_leafsize;
+ __le32 stripesize;
+ __le32 sys_chunk_array_size;
+ __le64 chunk_root_generation;
+ __le64 compat_flags;
+ __le64 compat_ro_flags;
+ __le64 incompat_flags;
+ __le16 csum_type;
+ u8 root_level;
+ u8 chunk_root_level;
+ u8 log_root_level;
+ struct btrfs_dev_item dev_item;
+
+ char label[BTRFS_LABEL_SIZE];
+
+ __le64 cache_generation;
+ __le64 uuid_tree_generation;
+
+ /* future expansion */
+ __le64 reserved[30];
+ u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+ struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+} __attribute__ ((__packed__));
+
+/*
+ * Compat flags that we support. If any incompat flags are set other than the
+ * ones specified below then we will fail to mount
+ */
+#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
+#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
+/*
+ * some patches floated around with a second compression method
+ * lets save that incompat here for when they do get in
+ * Note we don't actually support it, we're just reserving the
+ * number
+ */
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
+
+/*
+ * older kernels tried to do bigger metadata blocks, but the
+ * code was pretty buggy. Lets not let them try anymore.
+ */
+#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
+
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
+#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
+#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9)
+
+#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES)
+
+#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
+ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
+
+/*
+ * A leaf is full of items. offset and size tell us where to find
+ * the item in the leaf (relative to the start of the data area)
+ */
+struct btrfs_item {
+ struct btrfs_disk_key key;
+ __le32 offset;
+ __le32 size;
+} __attribute__ ((__packed__));
+
+/*
+ * leaves have an item area and a data area:
+ * [item0, item1....itemN] [free space] [dataN...data1, data0]
+ *
+ * The data is separate from the items to get the keys closer together
+ * during searches.
+ */
+struct btrfs_leaf {
+ struct btrfs_header header;
+ struct btrfs_item items[];
+} __attribute__ ((__packed__));
+
+/*
+ * all non-leaf blocks are nodes, they hold only keys and pointers to
+ * other blocks
+ */
+struct btrfs_key_ptr {
+ struct btrfs_disk_key key;
+ __le64 blockptr;
+ __le64 generation;
+} __attribute__ ((__packed__));
+
+struct btrfs_node {
+ struct btrfs_header header;
+ struct btrfs_key_ptr ptrs[];
+} __attribute__ ((__packed__));
+
+/*
+ * btrfs_paths remember the path taken from the root down to the leaf.
+ * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
+ * to any other levels that are present.
+ *
+ * The slots array records the index of the item or block pointer
+ * used while walking the tree.
+ */
+struct btrfs_path {
+ struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
+ int slots[BTRFS_MAX_LEVEL];
+ /* if there is real range locking, this locks field will change */
+ int locks[BTRFS_MAX_LEVEL];
+ int reada;
+ /* keep some upper locks as we walk down */
+ int lowest_level;
+
+ /*
+ * set by btrfs_split_item, tells search_slot to keep all locks
+ * and to force calls to keep space in the nodes
+ */
+ unsigned int search_for_split:1;
+ unsigned int keep_locks:1;
+ unsigned int skip_locking:1;
+ unsigned int leave_spinning:1;
+ unsigned int search_commit_root:1;
+ unsigned int need_commit_sem:1;
+ unsigned int skip_release_on_error:1;
+};
+
+/*
+ * items in the extent btree are used to record the objectid of the
+ * owner of the block and the number of references
+ */
+
+struct btrfs_extent_item {
+ __le64 refs;
+ __le64 generation;
+ __le64 flags;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_item_v0 {
+ __le32 refs;
+} __attribute__ ((__packed__));
+
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
+ sizeof(struct btrfs_item))
+
+#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0)
+#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1)
+
+/* following flags only apply to tree blocks */
+
+/* use full backrefs for extent pointers in the block */
+#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
+
+/*
+ * this flag is only used internally by scrub and may be changed at any time
+ * it is only declared here to avoid collisions
+ */
+#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
+
+struct btrfs_tree_block_info {
+ struct btrfs_disk_key key;
+ u8 level;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_data_ref {
+ __le64 root;
+ __le64 objectid;
+ __le64 offset;
+ __le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_shared_data_ref {
+ __le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_inline_ref {
+ u8 type;
+ __le64 offset;
+} __attribute__ ((__packed__));
+
+/* old style backrefs item */
+struct btrfs_extent_ref_v0 {
+ __le64 root;
+ __le64 generation;
+ __le64 objectid;
+ __le32 count;
+} __attribute__ ((__packed__));
+
+
+/* dev extents record free space on individual devices. The owner
+ * field points back to the chunk allocation mapping tree that allocated
+ * the extent. The chunk tree uuid field is a way to double check the owner
+ */
+struct btrfs_dev_extent {
+ __le64 chunk_tree;
+ __le64 chunk_objectid;
+ __le64 chunk_offset;
+ __le64 length;
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_ref {
+ __le64 index;
+ __le16 name_len;
+ /* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_extref {
+ __le64 parent_objectid;
+ __le64 index;
+ __le16 name_len;
+ __u8 name[0];
+ /* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_timespec {
+ __le64 sec;
+ __le32 nsec;
+} __attribute__ ((__packed__));
+
+enum btrfs_compression_type {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_TYPES = 2,
+ BTRFS_COMPRESS_LAST = 3,
+};
+
+struct btrfs_inode_item {
+ /* nfs style generation number */
+ __le64 generation;
+ /* transid that last touched this inode */
+ __le64 transid;
+ __le64 size;
+ __le64 nbytes;
+ __le64 block_group;
+ __le32 nlink;
+ __le32 uid;
+ __le32 gid;
+ __le32 mode;
+ __le64 rdev;
+ __le64 flags;
+
+ /* modification sequence number for NFS */
+ __le64 sequence;
+
+ /*
+ * a little future expansion, for more than this we can
+ * just grow the inode item and version it
+ */
+ __le64 reserved[4];
+ struct btrfs_timespec atime;
+ struct btrfs_timespec ctime;
+ struct btrfs_timespec mtime;
+ struct btrfs_timespec otime;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_log_item {
+ __le64 end;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_item {
+ struct btrfs_disk_key location;
+ __le64 transid;
+ __le16 data_len;
+ __le16 name_len;
+ u8 type;
+} __attribute__ ((__packed__));
+
+#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
+
+/*
+ * Internal in-memory flag that a subvolume has been marked for deletion but
+ * still visible as a directory
+ */
+#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48)
+
+struct btrfs_root_item {
+ struct btrfs_inode_item inode;
+ __le64 generation;
+ __le64 root_dirid;
+ __le64 bytenr;
+ __le64 byte_limit;
+ __le64 bytes_used;
+ __le64 last_snapshot;
+ __le64 flags;
+ __le32 refs;
+ struct btrfs_disk_key drop_progress;
+ u8 drop_level;
+ u8 level;
+
+ /*
+ * The following fields appear after subvol_uuids+subvol_times
+ * were introduced.
+ */
+
+ /*
+ * This generation number is used to test if the new fields are valid
+ * and up to date while reading the root item. Everytime the root item
+ * is written out, the "generation" field is copied into this field. If
+ * anyone ever mounted the fs with an older kernel, we will have
+ * mismatching generation values here and thus must invalidate the
+ * new fields. See btrfs_update_root and btrfs_find_last_root for
+ * details.
+ * the offset of generation_v2 is also used as the start for the memset
+ * when invalidating the fields.
+ */
+ __le64 generation_v2;
+ u8 uuid[BTRFS_UUID_SIZE];
+ u8 parent_uuid[BTRFS_UUID_SIZE];
+ u8 received_uuid[BTRFS_UUID_SIZE];
+ __le64 ctransid; /* updated when an inode changes */
+ __le64 otransid; /* trans when created */
+ __le64 stransid; /* trans when sent. non-zero for received subvol */
+ __le64 rtransid; /* trans when received. non-zero for received subvol */
+ struct btrfs_timespec ctime;
+ struct btrfs_timespec otime;
+ struct btrfs_timespec stime;
+ struct btrfs_timespec rtime;
+ __le64 reserved[8]; /* for future */
+} __attribute__ ((__packed__));
+
+/*
+ * this is used for both forward and backward root refs
+ */
+struct btrfs_root_ref {
+ __le64 dirid;
+ __le64 sequence;
+ __le16 name_len;
+} __attribute__ ((__packed__));
+
+struct btrfs_disk_balance_args {
+ /*
+ * profiles to operate on, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 profiles;
+
+ /* usage filter */
+ __le64 usage;
+
+ /* devid filter */
+ __le64 devid;
+
+ /* devid subset filter [pstart..pend) */
+ __le64 pstart;
+ __le64 pend;
+
+ /* btrfs virtual address space subset filter [vstart..vend) */
+ __le64 vstart;
+ __le64 vend;
+
+ /*
+ * profile to convert to, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 target;
+
+ /* BTRFS_BALANCE_ARGS_* */
+ __le64 flags;
+
+ /* BTRFS_BALANCE_ARGS_LIMIT value */
+ __le64 limit;
+
+ __le64 unused[7];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+ /* BTRFS_BALANCE_* */
+ __le64 flags;
+
+ struct btrfs_disk_balance_args data;
+ struct btrfs_disk_balance_args meta;
+ struct btrfs_disk_balance_args sys;
+
+ __le64 unused[4];
+} __attribute__ ((__packed__));
+
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
+
+struct btrfs_file_extent_item {
+ /*
+ * transaction id that created this extent
+ */
+ __le64 generation;
+ /*
+ * max number of bytes to hold this extent in ram
+ * when we split a compressed extent we can't know how big
+ * each of the resulting pieces will be. So, this is
+ * an upper limit on the size of the extent in ram instead of
+ * an exact limit.
+ */
+ __le64 ram_bytes;
+
+ /*
+ * 32 bits for the various ways we might encode the data,
+ * including compression and encryption. If any of these
+ * are set to something a given disk format doesn't understand
+ * it is treated like an incompat flag for reading and writing,
+ * but not for stat.
+ */
+ u8 compression;
+ u8 encryption;
+ __le16 other_encoding; /* spare for later use */
+
+ /* are we inline data or a real extent? */
+ u8 type;
+
+ /*
+ * disk space consumed by the extent, checksum blocks are included
+ * in these numbers
+ *
+ * At this offset in the structure, the inline extent data start.
+ */
+ __le64 disk_bytenr;
+ __le64 disk_num_bytes;
+ /*
+ * the logical offset in file blocks (no csums)
+ * this extent record is for. This allows a file extent to point
+ * into the middle of an existing extent on disk, sharing it
+ * between two snapshots (useful if some bytes in the middle of the
+ * extent have changed
+ */
+ __le64 offset;
+ /*
+ * the logical number of file blocks (no csums included). This
+ * always reflects the size uncompressed and without encoding.
+ */
+ __le64 num_bytes;
+
+} __attribute__ ((__packed__));
+
+struct btrfs_csum_item {
+ u8 csum;
+} __attribute__ ((__packed__));
+
+struct btrfs_dev_stats_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
+
+struct btrfs_dev_replace {
+ u64 replace_state; /* see #define above */
+ u64 time_started; /* seconds since 1-Jan-1970 */
+ u64 time_stopped; /* seconds since 1-Jan-1970 */
+ atomic64_t num_write_errors;
+ atomic64_t num_uncorrectable_read_errors;
+
+ u64 cursor_left;
+ u64 committed_cursor_left;
+ u64 cursor_left_last_write_of_item;
+ u64 cursor_right;
+
+ u64 cont_reading_from_srcdev_mode; /* see #define above */
+
+ int is_valid;
+ int item_needs_writeback;
+ struct btrfs_device *srcdev;
+ struct btrfs_device *tgtdev;
+
+ pid_t lock_owner;
+ atomic_t nesting_level;
+ struct mutex lock_finishing_cancel_unmount;
+ struct mutex lock_management_lock;
+ struct mutex lock;
+
+ struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 src_devid;
+ __le64 cursor_left;
+ __le64 cursor_right;
+ __le64 cont_reading_from_srcdev_mode;
+
+ __le64 replace_state;
+ __le64 time_started;
+ __le64 time_stopped;
+ __le64 num_write_errors;
+ __le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
+/* different types of block groups (and chunks) */
+#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
+#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
+ BTRFS_SPACE_INFO_GLOBAL_RSV)
+
+enum btrfs_raid_types {
+ BTRFS_RAID_RAID10,
+ BTRFS_RAID_RAID1,
+ BTRFS_RAID_DUP,
+ BTRFS_RAID_RAID0,
+ BTRFS_RAID_SINGLE,
+ BTRFS_RAID_RAID5,
+ BTRFS_RAID_RAID6,
+ BTRFS_NR_RAID_TYPES
+};
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
+ BTRFS_BLOCK_GROUP_SYSTEM | \
+ BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID1 | \
+ BTRFS_BLOCK_GROUP_RAID5 | \
+ BTRFS_BLOCK_GROUP_RAID6 | \
+ BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID10)
+#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
+ BTRFS_BLOCK_GROUP_RAID6)
+
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available. This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk). The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
+
+/*
+ * A fake block group type that is used to communicate global block reserve
+ * size to userspace via the SPACE_INFO ioctl.
+ */
+#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
+
+#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)
+
+static inline u64 chunk_to_extended(u64 flags)
+{
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
+ flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ return flags;
+}
+static inline u64 extended_to_chunk(u64 flags)
+{
+ return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+}
+
+struct btrfs_block_group_item {
+ __le64 used;
+ __le64 chunk_objectid;
+ __le64 flags;
+} __attribute__ ((__packed__));
+
+#define BTRFS_QGROUP_LEVEL_SHIFT 48
+static inline u64 btrfs_qgroup_level(u64 qgroupid)
+{
+ return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
+}
+
+/*
+ * is subvolume quota turned on?
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
+/*
+ * RESCAN is set during the initialization phase
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1)
+/*
+ * Some qgroup entries are known to be out of date,
+ * either because the configuration has changed in a way that
+ * makes a rescan necessary, or because the fs has been mounted
+ * with a non-qgroup-aware version.
+ * Turning qouta off and on again makes it inconsistent, too.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
+
+#define BTRFS_QGROUP_STATUS_VERSION 1
+
+struct btrfs_qgroup_status_item {
+ __le64 version;
+ /*
+ * the generation is updated during every commit. As older
+ * versions of btrfs are not aware of qgroups, it will be
+ * possible to detect inconsistencies by checking the
+ * generation on mount time
+ */
+ __le64 generation;
+
+ /* flag definitions see above */
+ __le64 flags;
+
+ /*
+ * only used during scanning to record the progress
+ * of the scan. It contains a logical address
+ */
+ __le64 rescan;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_info_item {
+ __le64 generation;
+ __le64 rfer;
+ __le64 rfer_cmpr;
+ __le64 excl;
+ __le64 excl_cmpr;
+} __attribute__ ((__packed__));
+
+/* flags definition for qgroup limits */
+#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
+#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
+#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
+#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
+#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
+
+struct btrfs_qgroup_limit_item {
+ /*
+ * only updated when any of the other values change
+ */
+ __le64 flags;
+ __le64 max_rfer;
+ __le64 max_excl;
+ __le64 rsv_rfer;
+ __le64 rsv_excl;
+} __attribute__ ((__packed__));
+
+/* For raid type sysfs entries */
+struct raid_kobject {
+ int raid_type;
+ struct kobject kobj;
+};
+
+struct btrfs_space_info {
+ spinlock_t lock;
+
+ u64 total_bytes; /* total bytes in the space,
+ this doesn't take mirrors into account */
+ u64 bytes_used; /* total bytes used,
+ this doesn't take mirrors into account */
+ u64 bytes_pinned; /* total bytes pinned, will be freed when the
+ transaction finishes */
+ u64 bytes_reserved; /* total bytes the allocator has reserved for
+ current allocations */
+ u64 bytes_may_use; /* number of bytes that may be used for
+ delalloc/allocations */
+ u64 bytes_readonly; /* total bytes that are read only */
+
+ unsigned int full:1; /* indicates that we cannot allocate any more
+ chunks for this space */
+ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
+
+ unsigned int flush:1; /* set if we are trying to make space */
+
+ unsigned int force_alloc; /* set if we need to force a chunk
+ alloc for this space */
+
+ u64 disk_used; /* total bytes used on disk */
+ u64 disk_total; /* total bytes on disk, takes mirrors into
+ account */
+
+ u64 flags;
+
+ /*
+ * bytes_pinned is kept in line with what is actually pinned, as in
+ * we've called update_block_group and dropped the bytes_used counter
+ * and increased the bytes_pinned counter. However this means that
+ * bytes_pinned does not reflect the bytes that will be pinned once the
+ * delayed refs are flushed, so this counter is inc'ed everytime we call
+ * btrfs_free_extent so it is a realtime count of what will be freed
+ * once the transaction is committed. It will be zero'ed everytime the
+ * transaction commits.
+ */
+ struct percpu_counter total_bytes_pinned;
+
+ struct list_head list;
+ /* Protected by the spinlock 'lock'. */
+ struct list_head ro_bgs;
+
+ struct rw_semaphore groups_sem;
+ /* for block groups in our same type */
+ struct list_head block_groups[BTRFS_NR_RAID_TYPES];
+ wait_queue_head_t wait;
+
+ struct kobject kobj;
+ struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
+};
+
+#define BTRFS_BLOCK_RSV_GLOBAL 1
+#define BTRFS_BLOCK_RSV_DELALLOC 2
+#define BTRFS_BLOCK_RSV_TRANS 3
+#define BTRFS_BLOCK_RSV_CHUNK 4
+#define BTRFS_BLOCK_RSV_DELOPS 5
+#define BTRFS_BLOCK_RSV_EMPTY 6
+#define BTRFS_BLOCK_RSV_TEMP 7
+
+struct btrfs_block_rsv {
+ u64 size;
+ u64 reserved;
+ struct btrfs_space_info *space_info;
+ spinlock_t lock;
+ unsigned short full;
+ unsigned short type;
+ unsigned short failfast;
+};
+
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes. They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+ spinlock_t lock;
+ spinlock_t refill_lock;
+ struct rb_root root;
+
+ /* largest extent in this cluster */
+ u64 max_size;
+
+ /* first extent starting offset */
+ u64 window_start;
+
+ struct btrfs_block_group_cache *block_group;
+ /*
+ * when a cluster is allocated from a block group, we put the
+ * cluster onto a list in the block group so that it can
+ * be freed before the block group is freed.
+ */
+ struct list_head block_group_list;
+};
+
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO = 0,
+ BTRFS_CACHE_STARTED = 1,
+ BTRFS_CACHE_FAST = 2,
+ BTRFS_CACHE_FINISHED = 3,
+ BTRFS_CACHE_ERROR = 4,
+};
+
+enum btrfs_disk_cache_state {
+ BTRFS_DC_WRITTEN = 0,
+ BTRFS_DC_ERROR = 1,
+ BTRFS_DC_CLEAR = 2,
+ BTRFS_DC_SETUP = 3,
+};
+
+struct btrfs_caching_control {
+ struct list_head list;
+ struct mutex mutex;
+ wait_queue_head_t wait;
+ struct btrfs_work work;
+ struct btrfs_block_group_cache *block_group;
+ u64 progress;
+ atomic_t count;
+};
+
+struct btrfs_io_ctl {
+ void *cur, *orig;
+ struct page *page;
+ struct page **pages;
+ struct btrfs_root *root;
+ struct inode *inode;
+ unsigned long size;
+ int index;
+ int num_pages;
+ int entries;
+ int bitmaps;
+ unsigned check_crcs:1;
+};
+
+struct btrfs_block_group_cache {
+ struct btrfs_key key;
+ struct btrfs_block_group_item item;
+ struct btrfs_fs_info *fs_info;
+ struct inode *inode;
+ spinlock_t lock;
+ u64 pinned;
+ u64 reserved;
+ u64 delalloc_bytes;
+ u64 bytes_super;
+ u64 flags;
+ u64 sectorsize;
+ u64 cache_generation;
+
+ /*
+ * It is just used for the delayed data space allocation because
+ * only the data space allocation and the relative metadata update
+ * can be done cross the transaction.
+ */
+ struct rw_semaphore data_rwsem;
+
+ /* for raid56, this is a full stripe, without parity */
+ unsigned long full_stripe_len;
+
+ unsigned int ro:1;
+ unsigned int iref:1;
+ unsigned int has_caching_ctl:1;
+ unsigned int removed:1;
+
+ int disk_cache_state;
+
+ /* cache tracking stuff */
+ int cached;
+ struct btrfs_caching_control *caching_ctl;
+ u64 last_byte_to_unpin;
+
+ struct btrfs_space_info *space_info;
+
+ /* free space cache stuff */
+ struct btrfs_free_space_ctl *free_space_ctl;
+
+ /* block group cache stuff */
+ struct rb_node cache_node;
+
+ /* for block groups in the same raid type */
+ struct list_head list;
+
+ /* usage count */
+ atomic_t count;
+
+ /* List of struct btrfs_free_clusters for this block group.
+ * Today it will only have one thing on it, but that may change
+ */
+ struct list_head cluster_list;
+
+ /* For delayed block group creation or deletion of empty block groups */
+ struct list_head bg_list;
+
+ /* For read-only block groups */
+ struct list_head ro_list;
+
+ atomic_t trimming;
+
+ /* For dirty block groups */
+ struct list_head dirty_list;
+ struct list_head io_list;
+
+ struct btrfs_io_ctl io_ctl;
+};
+
+/* delayed seq elem */
+struct seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+
+enum btrfs_orphan_cleanup_state {
+ ORPHAN_CLEANUP_STARTED = 1,
+ ORPHAN_CLEANUP_DONE = 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+ struct list_head hash_list;
+ wait_queue_head_t wait;
+ spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+ struct list_head stripe_cache;
+ spinlock_t cache_lock;
+ int cache_size;
+ struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
+void btrfs_init_async_reclaim_work(struct work_struct *work);
+
+/* fs_info */
+struct reloc_control;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
+struct btrfs_fs_info {
+ u8 fsid[BTRFS_FSID_SIZE];
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ struct btrfs_root *extent_root;
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
+ struct btrfs_root *fs_root;
+ struct btrfs_root *csum_root;
+ struct btrfs_root *quota_root;
+ struct btrfs_root *uuid_root;
+
+ /* the log root tree is a directory of all the other log roots */
+ struct btrfs_root *log_root_tree;
+
+ spinlock_t fs_roots_radix_lock;
+ struct radix_tree_root fs_roots_radix;
+
+ /* block group cache stuff */
+ spinlock_t block_group_cache_lock;
+ u64 first_logical_byte;
+ struct rb_root block_group_cache_tree;
+
+ /* keep track of unallocated space */
+ spinlock_t free_chunk_lock;
+ u64 free_chunk_space;
+
+ struct extent_io_tree freed_extents[2];
+ struct extent_io_tree *pinned_extents;
+
+ /* logical->physical extent mapping */
+ struct btrfs_mapping_tree mapping_tree;
+
+ /*
+ * block reservation for extent, checksum, root tree and
+ * delayed dir index item
+ */
+ struct btrfs_block_rsv global_block_rsv;
+ /* block reservation for delay allocation */
+ struct btrfs_block_rsv delalloc_block_rsv;
+ /* block reservation for metadata operations */
+ struct btrfs_block_rsv trans_block_rsv;
+ /* block reservation for chunk tree */
+ struct btrfs_block_rsv chunk_block_rsv;
+ /* block reservation for delayed operations */
+ struct btrfs_block_rsv delayed_block_rsv;
+
+ struct btrfs_block_rsv empty_block_rsv;
+
+ u64 generation;
+ u64 last_trans_committed;
+ u64 avg_delayed_ref_runtime;
+
+ /*
+ * this is updated to the current trans every time a full commit
+ * is required instead of the faster short fsync log commits
+ */
+ u64 last_trans_log_full_commit;
+ unsigned long mount_opt;
+ /*
+ * Track requests for actions that need to be done during transaction
+ * commit (like for some mount options).
+ */
+ unsigned long pending_changes;
+ unsigned long compress_type:4;
+ int commit_interval;
+ /*
+ * It is a suggestive number, the read side is safe even it gets a
+ * wrong number because we will write out the data into a regular
+ * extent. The write side(mount/remount) is under ->s_umount lock,
+ * so it is also safe.
+ */
+ u64 max_inline;
+ /*
+ * Protected by ->chunk_mutex and sb->s_umount.
+ *
+ * The reason that we use two lock to protect it is because only
+ * remount and mount operations can change it and these two operations
+ * are under sb->s_umount, but the read side (chunk allocation) can not
+ * acquire sb->s_umount or the deadlock would happen. So we use two
+ * locks to protect it. On the write side, we must acquire two locks,
+ * and on the read side, we just need acquire one of them.
+ */
+ u64 alloc_start;
+ struct btrfs_transaction *running_transaction;
+ wait_queue_head_t transaction_throttle;
+ wait_queue_head_t transaction_wait;
+ wait_queue_head_t transaction_blocked_wait;
+ wait_queue_head_t async_submit_wait;
+
+ /*
+ * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+ * when they are updated.
+ *
+ * Because we do not clear the flags for ever, so we needn't use
+ * the lock on the read side.
+ *
+ * We also needn't use the lock when we mount the fs, because
+ * there is no other task which will update the flag.
+ */
+ spinlock_t super_lock;
+ struct btrfs_super_block *super_copy;
+ struct btrfs_super_block *super_for_commit;
+ struct block_device *__bdev;
+ struct super_block *sb;
+ struct inode *btree_inode;
+ struct backing_dev_info bdi;
+ struct mutex tree_log_mutex;
+ struct mutex transaction_kthread_mutex;
+ struct mutex cleaner_mutex;
+ struct mutex chunk_mutex;
+ struct mutex volume_mutex;
+
+ /*
+ * this is taken to make sure we don't set block groups ro after
+ * the free space cache has been allocated on them
+ */
+ struct mutex ro_block_group_mutex;
+
+ /* this is used during read/modify/write to make sure
+ * no two ios are trying to mod the same stripe at the same
+ * time
+ */
+ struct btrfs_stripe_hash_table *stripe_hash_table;
+
+ /*
+ * this protects the ordered operations list only while we are
+ * processing all of the entries on it. This way we make
+ * sure the commit code doesn't find the list temporarily empty
+ * because another function happens to be doing non-waiting preflush
+ * before jumping into the main commit.
+ */
+ struct mutex ordered_operations_mutex;
+
+ /*
+ * Same as ordered_operations_mutex except this is for ordered extents
+ * and not the operations.
+ */
+ struct mutex ordered_extent_flush_mutex;
+
+ struct rw_semaphore commit_root_sem;
+
+ struct rw_semaphore cleanup_work_sem;
+
+ struct rw_semaphore subvol_sem;
+ struct srcu_struct subvol_srcu;
+
+ spinlock_t trans_lock;
+ /*
+ * the reloc mutex goes with the trans lock, it is taken
+ * during commit to protect us from the relocation code
+ */
+ struct mutex reloc_mutex;
+
+ struct list_head trans_list;
+ struct list_head dead_roots;
+ struct list_head caching_block_groups;
+
+ spinlock_t delayed_iput_lock;
+ struct list_head delayed_iputs;
+ struct rw_semaphore delayed_iput_sem;
+
+ /* this protects tree_mod_seq_list */
+ spinlock_t tree_mod_seq_lock;
+ atomic64_t tree_mod_seq;
+ struct list_head tree_mod_seq_list;
+
+ /* this protects tree_mod_log */
+ rwlock_t tree_mod_log_lock;
+ struct rb_root tree_mod_log;
+
+ atomic_t nr_async_submits;
+ atomic_t async_submit_draining;
+ atomic_t nr_async_bios;
+ atomic_t async_delalloc_pages;
+ atomic_t open_ioctl_trans;
+
+ /*
+ * this is used to protect the following list -- ordered_roots.
+ */
+ spinlock_t ordered_root_lock;
+
+ /*
+ * all fs/file tree roots in which there are data=ordered extents
+ * pending writeback are added into this list.
+ *
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
+ struct list_head ordered_roots;
+
+ struct mutex delalloc_root_mutex;
+ spinlock_t delalloc_root_lock;
+ /* all fs/file tree roots that have delalloc inodes. */
+ struct list_head delalloc_roots;
+
+ /*
+ * there is a pool of worker threads for checksumming during writes
+ * and a pool for checksumming after reads. This is because readers
+ * can run with FS locks held, and the writers may be waiting for
+ * those locks. We don't want ordering in the pending list to cause
+ * deadlocks, and so the two are serviced separately.
+ *
+ * A third pool does submit_bio to avoid deadlocking with the other
+ * two
+ */
+ struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *delalloc_workers;
+ struct btrfs_workqueue *flush_workers;
+ struct btrfs_workqueue *endio_workers;
+ struct btrfs_workqueue *endio_meta_workers;
+ struct btrfs_workqueue *endio_raid56_workers;
+ struct btrfs_workqueue *endio_repair_workers;
+ struct btrfs_workqueue *rmw_workers;
+ struct btrfs_workqueue *endio_meta_write_workers;
+ struct btrfs_workqueue *endio_write_workers;
+ struct btrfs_workqueue *endio_freespace_worker;
+ struct btrfs_workqueue *submit_workers;
+ struct btrfs_workqueue *caching_workers;
+ struct btrfs_workqueue *readahead_workers;
+
+ /*
+ * fixup workers take dirty pages that didn't properly go through
+ * the cow mechanism and make them safe to write. It happens
+ * for the sys_munmap function call path
+ */
+ struct btrfs_workqueue *fixup_workers;
+ struct btrfs_workqueue *delayed_workers;
+
+ /* the extent workers do delayed refs on the extent allocation tree */
+ struct btrfs_workqueue *extent_workers;
+ struct task_struct *transaction_kthread;
+ struct task_struct *cleaner_kthread;
+ int thread_pool_size;
+
+ struct kobject super_kobj;
+ struct kobject *space_info_kobj;
+ struct kobject *device_dir_kobj;
+ struct completion kobj_unregister;
+ int do_barriers;
+ int closing;
+ int log_root_recovering;
+ int open;
+
+ u64 total_pinned;
+
+ /* used to keep from writing metadata until there is a nice batch */
+ struct percpu_counter dirty_metadata_bytes;
+ struct percpu_counter delalloc_bytes;
+ s32 dirty_metadata_batch;
+ s32 delalloc_batch;
+
+ struct list_head dirty_cowonly_roots;
+
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * the space_info list is almost entirely read only. It only changes
+ * when we add a new raid type to the FS, and that happens
+ * very rarely. RCU is used to protect it.
+ */
+ struct list_head space_info;
+
+ struct btrfs_space_info *data_sinfo;
+
+ struct reloc_control *reloc_ctl;
+
+ /* data_alloc_cluster is only used in ssd mode */
+ struct btrfs_free_cluster data_alloc_cluster;
+
+ /* all metadata allocations go through this cluster */
+ struct btrfs_free_cluster meta_alloc_cluster;
+
+ /* auto defrag inodes go here */
+ spinlock_t defrag_inodes_lock;
+ struct rb_root defrag_inodes;
+ atomic_t defrag_running;
+
+ /* Used to protect avail_{data, metadata, system}_alloc_bits */
+ seqlock_t profiles_lock;
+ /*
+ * these three are in extended format (availability of single
+ * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+ * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
+ u64 avail_data_alloc_bits;
+ u64 avail_metadata_alloc_bits;
+ u64 avail_system_alloc_bits;
+
+ /* restriper state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_running;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
+
+ unsigned data_chunk_allocations;
+ unsigned metadata_ratio;
+
+ void *bdev_holder;
+
+ /* private scrub information */
+ struct mutex scrub_lock;
+ atomic_t scrubs_running;
+ atomic_t scrub_pause_req;
+ atomic_t scrubs_paused;
+ atomic_t scrub_cancel_req;
+ wait_queue_head_t scrub_pause_wait;
+ int scrub_workers_refcnt;
+ struct btrfs_workqueue *scrub_workers;
+ struct btrfs_workqueue *scrub_wr_completion_workers;
+ struct btrfs_workqueue *scrub_nocow_workers;
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ u32 check_integrity_print_mask;
+#endif
+ /*
+ * quota information
+ */
+ unsigned int quota_enabled:1;
+
+ /*
+ * quota_enabled only changes state after a commit. This holds the
+ * next state.
+ */
+ unsigned int pending_quota_state:1;
+
+ /* is qgroup tracking in a consistent state? */
+ u64 qgroup_flags;
+
+ /* holds configuration and tracking. Protected by qgroup_lock */
+ struct rb_root qgroup_tree;
+ struct rb_root qgroup_op_tree;
+ spinlock_t qgroup_lock;
+ spinlock_t qgroup_op_lock;
+ atomic_t qgroup_op_seq;
+
+ /*
+ * used to avoid frequently calling ulist_alloc()/ulist_free()
+ * when doing qgroup accounting, it must be protected by qgroup_lock.
+ */
+ struct ulist *qgroup_ulist;
+
+ /* protect user change for quota operations */
+ struct mutex qgroup_ioctl_lock;
+
+ /* list of dirty qgroups to be written at next commit */
+ struct list_head dirty_qgroups;
+
+ /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+ u64 qgroup_seq;
+
+ /* qgroup rescan items */
+ struct mutex qgroup_rescan_lock; /* protects the progress item */
+ struct btrfs_key qgroup_rescan_progress;
+ struct btrfs_workqueue *qgroup_rescan_workers;
+ struct completion qgroup_rescan_completion;
+ struct btrfs_work qgroup_rescan_work;
+
+ /* filesystem state */
+ unsigned long fs_state;
+
+ struct btrfs_delayed_root *delayed_root;
+
+ /* readahead tree */
+ spinlock_t reada_lock;
+ struct radix_tree_root reada_tree;
+
+ /* Extent buffer radix tree */
+ spinlock_t buffer_lock;
+ struct radix_tree_root buffer_radix;
+
+ /* next backup root to be overwritten */
+ int backup_root_index;
+
+ int num_tolerated_disk_barrier_failures;
+
+ /* device replace state */
+ struct btrfs_dev_replace dev_replace;
+
+ atomic_t mutually_exclusive_operation_running;
+
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
+
+ struct semaphore uuid_tree_rescan_sem;
+ unsigned int update_uuid_tree_gen:1;
+
+ /* Used to reclaim the metadata space in the background. */
+ struct work_struct async_reclaim_work;
+
+ spinlock_t unused_bgs_lock;
+ struct list_head unused_bgs;
+ struct mutex unused_bg_unpin_mutex;
+
+ /* For btrfs to record security options */
+ struct security_mnt_opts security_opts;
+
+ /*
+ * Chunks that can't be freed yet (under a trim/discard operation)
+ * and will be latter freed. Protected by fs_info->chunk_mutex.
+ */
+ struct list_head pinned_chunks;
+};
+
+struct btrfs_subvolume_writers {
+ struct percpu_counter counter;
+ wait_queue_head_t wait;
+};
+
+/*
+ * The state of btrfs root
+ */
+/*
+ * btrfs_record_root_in_trans is a multi-step process,
+ * and it can race with the balancing code. But the
+ * race is very small, and only the first time the root
+ * is added to each transaction. So IN_TRANS_SETUP
+ * is used to tell us when more checks are required
+ */
+#define BTRFS_ROOT_IN_TRANS_SETUP 0
+#define BTRFS_ROOT_REF_COWS 1
+#define BTRFS_ROOT_TRACK_DIRTY 2
+#define BTRFS_ROOT_IN_RADIX 3
+#define BTRFS_ROOT_DUMMY_ROOT 4
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5
+#define BTRFS_ROOT_DEFRAG_RUNNING 6
+#define BTRFS_ROOT_FORCE_COW 7
+#define BTRFS_ROOT_MULTI_LOG_TASKS 8
+#define BTRFS_ROOT_DIRTY 9
+
+/*
+ * in ram representation of the tree. extent_root is used for all allocations
+ * and for the extent tree extent_root root.
+ */
+struct btrfs_root {
+ struct extent_buffer *node;
+
+ struct extent_buffer *commit_root;
+ struct btrfs_root *log_root;
+ struct btrfs_root *reloc_root;
+
+ unsigned long state;
+ struct btrfs_root_item root_item;
+ struct btrfs_key root_key;
+ struct btrfs_fs_info *fs_info;
+ struct extent_io_tree dirty_log_pages;
+
+ struct mutex objectid_mutex;
+
+ spinlock_t accounting_lock;
+ struct btrfs_block_rsv *block_rsv;
+
+ /* free ino cache stuff */
+ struct btrfs_free_space_ctl *free_ino_ctl;
+ enum btrfs_caching_type ino_cache_state;
+ spinlock_t ino_cache_lock;
+ wait_queue_head_t ino_cache_wait;
+ struct btrfs_free_space_ctl *free_ino_pinned;
+ u64 ino_cache_progress;
+ struct inode *ino_cache_inode;
+
+ struct mutex log_mutex;
+ wait_queue_head_t log_writer_wait;
+ wait_queue_head_t log_commit_wait[2];
+ struct list_head log_ctxs[2];
+ atomic_t log_writers;
+ atomic_t log_commit[2];
+ atomic_t log_batch;
+ int log_transid;
+ /* No matter the commit succeeds or not*/
+ int log_transid_committed;
+ /* Just be updated when the commit succeeds. */
+ int last_log_commit;
+ pid_t log_start_pid;
+
+ u64 objectid;
+ u64 last_trans;
+
+ /* data allocations are done in sectorsize units */
+ u32 sectorsize;
+
+ /* node allocations are done in nodesize units */
+ u32 nodesize;
+
+ u32 stripesize;
+
+ u32 type;
+
+ u64 highest_objectid;
+
+ /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
+ u64 alloc_bytenr;
+
+ u64 defrag_trans_start;
+ struct btrfs_key defrag_progress;
+ struct btrfs_key defrag_max;
+ char *name;
+
+ /* the dirty list is only used by non-reference counted roots */
+ struct list_head dirty_list;
+
+ struct list_head root_list;
+
+ spinlock_t log_extents_lock[2];
+ struct list_head logged_list[2];
+
+ spinlock_t orphan_lock;
+ atomic_t orphan_inodes;
+ struct btrfs_block_rsv *orphan_block_rsv;
+ int orphan_cleanup_state;
+
+ spinlock_t inode_lock;
+ /* red-black tree that keeps track of in-memory inodes */
+ struct rb_root inode_tree;
+
+ /*
+ * radix tree that keeps track of delayed nodes of every inode,
+ * protected by inode_lock
+ */
+ struct radix_tree_root delayed_nodes_tree;
+ /*
+ * right now this just gets used so that a root has its own devid
+ * for stat. It may be used for more later
+ */
+ dev_t anon_dev;
+
+ spinlock_t root_item_lock;
+ atomic_t refs;
+
+ struct mutex delalloc_mutex;
+ spinlock_t delalloc_lock;
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
+ struct list_head delalloc_inodes;
+ struct list_head delalloc_root;
+ u64 nr_delalloc_inodes;
+
+ struct mutex ordered_extent_mutex;
+ /*
+ * this is used by the balancing code to wait for all the pending
+ * ordered extents
+ */
+ spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
+ struct list_head ordered_extents;
+ struct list_head ordered_root;
+ u64 nr_ordered_extents;
+
+ /*
+ * Number of currently running SEND ioctls to prevent
+ * manipulation with the read-only status via SUBVOL_SETFLAGS
+ */
+ int send_in_progress;
+ struct btrfs_subvolume_writers *subv_writers;
+ atomic_t will_be_snapshoted;
+};
+
+struct btrfs_ioctl_defrag_range_args {
+ /* start of the defrag operation */
+ __u64 start;
+
+ /* number of bytes to defrag, use (u64)-1 to say all */
+ __u64 len;
+
+ /*
+ * flags for the operation, which can include turning
+ * on compression for this one defrag
+ */
+ __u64 flags;
+
+ /*
+ * any extent bigger than this will be considered
+ * already defragged. Use 0 to take the kernel default
+ * Use 1 to say every single extent must be rewritten
+ */
+ __u32 extent_thresh;
+
+ /*
+ * which compression method to use if turning on compression
+ * for this defrag operation. If unspecified, zlib will
+ * be used
+ */
+ __u32 compress_type;
+
+ /* spare for later */
+ __u32 unused[4];
+};
+
+
+/*
+ * inode items have the data typically returned from stat and store other
+ * info about object characteristics. There is one for every file and dir in
+ * the FS
+ */
+#define BTRFS_INODE_ITEM_KEY 1
+#define BTRFS_INODE_REF_KEY 12
+#define BTRFS_INODE_EXTREF_KEY 13
+#define BTRFS_XATTR_ITEM_KEY 24
+#define BTRFS_ORPHAN_ITEM_KEY 48
+/* reserve 2-15 close to the inode for later flexibility */
+
+/*
+ * dir items are the name -> inode pointers in a directory. There is one
+ * for every name in a directory.
+ */
+#define BTRFS_DIR_LOG_ITEM_KEY 60
+#define BTRFS_DIR_LOG_INDEX_KEY 72
+#define BTRFS_DIR_ITEM_KEY 84
+#define BTRFS_DIR_INDEX_KEY 96
+/*
+ * extent data is for file data
+ */
+#define BTRFS_EXTENT_DATA_KEY 108
+
+/*
+ * extent csums are stored in a separate tree and hold csums for
+ * an entire extent on disk.
+ */
+#define BTRFS_EXTENT_CSUM_KEY 128
+
+/*
+ * root items point to tree roots. They are typically in the root
+ * tree used by the super block to find all the other trees
+ */
+#define BTRFS_ROOT_ITEM_KEY 132
+
+/*
+ * root backrefs tie subvols and snapshots to the directory entries that
+ * reference them
+ */
+#define BTRFS_ROOT_BACKREF_KEY 144
+
+/*
+ * root refs make a fast index for listing all of the snapshots and
+ * subvolumes referenced by a given root. They point directly to the
+ * directory item in the root that references the subvol
+ */
+#define BTRFS_ROOT_REF_KEY 156
+
+/*
+ * extent items are in the extent map tree. These record which blocks
+ * are used, and how many references there are to each block
+ */
+#define BTRFS_EXTENT_ITEM_KEY 168
+
+/*
+ * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know
+ * the length, so we save the level in key->offset instead of the length.
+ */
+#define BTRFS_METADATA_ITEM_KEY 169
+
+#define BTRFS_TREE_BLOCK_REF_KEY 176
+
+#define BTRFS_EXTENT_DATA_REF_KEY 178
+
+#define BTRFS_EXTENT_REF_V0_KEY 180
+
+#define BTRFS_SHARED_BLOCK_REF_KEY 182
+
+#define BTRFS_SHARED_DATA_REF_KEY 184
+
+/*
+ * block groups give us hints into the extent allocation trees. Which
+ * blocks are free etc etc
+ */
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+
+#define BTRFS_DEV_EXTENT_KEY 204
+#define BTRFS_DEV_ITEM_KEY 216
+#define BTRFS_CHUNK_ITEM_KEY 228
+
+/*
+ * Records the overall state of the qgroups.
+ * There's only one instance of this key present,
+ * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+ */
+#define BTRFS_QGROUP_STATUS_KEY 240
+/*
+ * Records the currently used space of the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_INFO_KEY 242
+/*
+ * Contains the user configured limits for the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_LIMIT_KEY 244
+/*
+ * Records the child-parent relationship of qgroups. For
+ * each relation, 2 keys are present:
+ * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+ * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+ */
+#define BTRFS_QGROUP_RELATION_KEY 246
+
+#define BTRFS_BALANCE_ITEM_KEY 248
+
+/*
+ * Persistantly stores the io stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY 249
+
+/*
+ * Persistantly stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY 250
+
+/*
+ * Stores items that allow to quickly map UUIDs to something else.
+ * These items are part of the filesystem UUID tree.
+ * The key is built like this:
+ * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
+ */
+#if BTRFS_UUID_SIZE != 16
+#error "UUID items require BTRFS_UUID_SIZE == 16!"
+#endif
+#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */
+#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to
+ * received subvols */
+
+/*
+ * string items are for debugging. They just store a short string of
+ * data in the FS
+ */
+#define BTRFS_STRING_ITEM_KEY 253
+
+/*
+ * Flags for mount options.
+ *
+ * Note: don't forget to add new options to btrfs_show_options()
+ */
+#define BTRFS_MOUNT_NODATASUM (1 << 0)
+#define BTRFS_MOUNT_NODATACOW (1 << 1)
+#define BTRFS_MOUNT_NOBARRIER (1 << 2)
+#define BTRFS_MOUNT_SSD (1 << 3)
+#define BTRFS_MOUNT_DEGRADED (1 << 4)
+#define BTRFS_MOUNT_COMPRESS (1 << 5)
+#define BTRFS_MOUNT_NOTREELOG (1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
+#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
+#define BTRFS_MOUNT_NOSSD (1 << 9)
+#define BTRFS_MOUNT_DISCARD (1 << 10)
+#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
+#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
+#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
+#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
+#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
+#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
+#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
+#define BTRFS_MOUNT_RECOVERY (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
+#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
+#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
+
+#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_DEFAULT_MAX_INLINE (8192)
+
+#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
+#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
+#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
+ BTRFS_MOUNT_##opt)
+
+#define btrfs_set_and_info(root, opt, fmt, args...) \
+{ \
+ if (!btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_set_opt(root->fs_info->mount_opt, opt); \
+}
+
+#define btrfs_clear_and_info(root, opt, fmt, args...) \
+{ \
+ if (btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_clear_opt(root->fs_info->mount_opt, opt); \
+}
+
+/*
+ * Requests for changes that need to be done during transaction commit.
+ *
+ * Internal mount options that are used for special handling of the real
+ * mount options (eg. cannot be set during remount and have to be set during
+ * transaction commit)
+ */
+
+#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0)
+#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1)
+#define BTRFS_PENDING_COMMIT (2)
+
+#define btrfs_test_pending(info, opt) \
+ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_set_pending(info, opt) \
+ set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_clear_pending(info, opt) \
+ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+
+/*
+ * Helpers for setting pending mount option changes.
+ *
+ * Expects corresponding macros
+ * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
+ */
+#define btrfs_set_pending_and_info(info, opt, fmt, args...) \
+do { \
+ if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+ btrfs_info((info), fmt, ##args); \
+ btrfs_set_pending((info), SET_##opt); \
+ btrfs_clear_pending((info), CLEAR_##opt); \
+ } \
+} while(0)
+
+#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \
+do { \
+ if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+ btrfs_info((info), fmt, ##args); \
+ btrfs_set_pending((info), CLEAR_##opt); \
+ btrfs_clear_pending((info), SET_##opt); \
+ } \
+} while(0)
+
+/*
+ * Inode flags
+ */
+#define BTRFS_INODE_NODATASUM (1 << 0)
+#define BTRFS_INODE_NODATACOW (1 << 1)
+#define BTRFS_INODE_READONLY (1 << 2)
+#define BTRFS_INODE_NOCOMPRESS (1 << 3)
+#define BTRFS_INODE_PREALLOC (1 << 4)
+#define BTRFS_INODE_SYNC (1 << 5)
+#define BTRFS_INODE_IMMUTABLE (1 << 6)
+#define BTRFS_INODE_APPEND (1 << 7)
+#define BTRFS_INODE_NODUMP (1 << 8)
+#define BTRFS_INODE_NOATIME (1 << 9)
+#define BTRFS_INODE_DIRSYNC (1 << 10)
+#define BTRFS_INODE_COMPRESS (1 << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
+
+struct btrfs_map_token {
+ struct extent_buffer *eb;
+ char *kaddr;
+ unsigned long offset;
+};
+
+static inline void btrfs_init_map_token (struct btrfs_map_token *token)
+{
+ token->kaddr = NULL;
+}
+
+/* some macros to generate set/get funcs for the struct fields. This
+ * assumes there is a lefoo_to_cpu for every type, so lets make a simple
+ * one for u8:
+ */
+#define le8_to_cpu(v) (v)
+#define cpu_to_le8(v) (v)
+#define __le8 u8
+
+#define read_eb_member(eb, ptr, type, member, result) ( \
+ read_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define write_eb_member(eb, ptr, type, member, result) ( \
+ write_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define DECLARE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, \
+ struct btrfs_map_token *token); \
+void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val, \
+ struct btrfs_map_token *token); \
+static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off) \
+{ \
+ return btrfs_get_token_##bits(eb, ptr, off, NULL); \
+} \
+static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, u##bits val) \
+{ \
+ btrfs_set_token_##bits(eb, ptr, off, val, NULL); \
+}
+
+DECLARE_BTRFS_SETGET_BITS(8)
+DECLARE_BTRFS_SETGET_BITS(16)
+DECLARE_BTRFS_SETGET_BITS(32)
+DECLARE_BTRFS_SETGET_BITS(64)
+
+#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ return btrfs_get_##bits(eb, s, offsetof(type, member)); \
+} \
+static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
+ u##bits val) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ btrfs_set_##bits(eb, s, offsetof(type, member), val); \
+} \
+static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
+ struct btrfs_map_token *token) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
+} \
+static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
+ type *s, u##bits val, \
+ struct btrfs_map_token *token) \
+{ \
+ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
+}
+
+#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(struct extent_buffer *eb) \
+{ \
+ type *p = page_address(eb->pages[0]); \
+ u##bits res = le##bits##_to_cpu(p->member); \
+ return res; \
+} \
+static inline void btrfs_set_##name(struct extent_buffer *eb, \
+ u##bits val) \
+{ \
+ type *p = page_address(eb->pages[0]); \
+ p->member = cpu_to_le##bits(val); \
+}
+
+#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(type *s) \
+{ \
+ return le##bits##_to_cpu(s->member); \
+} \
+static inline void btrfs_set_##name(type *s, u##bits val) \
+{ \
+ s->member = cpu_to_le##bits(val); \
+}
+
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
+ start_offset, 64);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
+ dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+ seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+ bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+ generation, 64);
+
+static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+ return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
+}
+
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+ return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+ stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+ num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+ sub_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
+ int nr)
+{
+ unsigned long offset = (unsigned long)c;
+ offset += offsetof(struct btrfs_chunk, stripe);
+ offset += nr * sizeof(struct btrfs_stripe);
+ return (struct btrfs_stripe *)offset;
+}
+
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+/* struct btrfs_block_group_item */
+BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+
+/* struct btrfs_inode_ref */
+BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+ parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+ name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
+/* struct btrfs_inode_item */
+BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
+BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
+BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+ sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+ transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
+ nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+ block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
+ chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+ chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+ chunk_offset, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+
+static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+{
+ unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
+ return (unsigned long)dev + ptr;
+}
+
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
+BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
+
+BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
+
+
+BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
+
+static inline void btrfs_tree_block_key(struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
+ struct btrfs_tree_block_info *item,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
+ root, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
+ objectid, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
+ offset, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
+ count, 32);
+
+BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
+ count, 32);
+
+BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
+ type, 8);
+BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
+ offset, 64);
+
+static inline u32 btrfs_extent_inline_ref_size(int type)
+{
+ if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+ type == BTRFS_SHARED_BLOCK_REF_KEY)
+ return sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_SHARED_DATA_REF_KEY)
+ return sizeof(struct btrfs_shared_data_ref) +
+ sizeof(struct btrfs_extent_inline_ref);
+ if (type == BTRFS_EXTENT_DATA_REF_KEY)
+ return sizeof(struct btrfs_extent_data_ref) +
+ offsetof(struct btrfs_extent_inline_ref, offset);
+ BUG();
+ return 0;
+}
+
+BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
+ generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
+
+/* struct btrfs_node */
+BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr,
+ blockptr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
+ generation, 64);
+
+static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline unsigned long btrfs_node_key_ptr_offset(int nr)
+{
+ return offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+}
+
+void btrfs_node_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr);
+
+static inline void btrfs_set_node_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr;
+ ptr = btrfs_node_key_ptr_offset(nr);
+ write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
+
+/* struct btrfs_item */
+BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
+
+static inline unsigned long btrfs_item_nr_offset(int nr)
+{
+ return offsetof(struct btrfs_leaf, items) +
+ sizeof(struct btrfs_item) * nr;
+}
+
+static inline struct btrfs_item *btrfs_item_nr(int nr)
+{
+ return (struct btrfs_item *)btrfs_item_nr_offset(nr);
+}
+
+static inline u32 btrfs_item_end(struct extent_buffer *eb,
+ struct btrfs_item *item)
+{
+ return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
+}
+
+static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_end(eb, btrfs_item_nr(nr));
+}
+
+static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_offset(eb, btrfs_item_nr(nr));
+}
+
+static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_size(eb, btrfs_item_nr(nr));
+}
+
+static inline void btrfs_item_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(nr);
+ read_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+static inline void btrfs_set_item_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(nr);
+ write_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+
+/*
+ * struct btrfs_root_ref
+ */
+BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+
+/* struct btrfs_dir_item */
+BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item,
+ data_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
+ name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
+ transid, 64);
+
+static inline void btrfs_dir_item_key(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+ num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+ num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+ generation, 64);
+
+static inline void btrfs_free_space_key(struct extent_buffer *eb,
+ struct btrfs_free_space_header *h,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+ struct btrfs_free_space_header *h,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+/* struct btrfs_disk_key */
+BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
+ objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
+ struct btrfs_disk_key *disk)
+{
+ cpu->offset = le64_to_cpu(disk->offset);
+ cpu->type = disk->type;
+ cpu->objectid = le64_to_cpu(disk->objectid);
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
+ struct btrfs_key *cpu)
+{
+ disk->offset = cpu_to_le64(cpu->offset);
+ disk->type = cpu->type;
+ disk->objectid = cpu_to_le64(cpu->objectid);
+}
+
+static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_node_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_item_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ struct btrfs_key *key)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_dir_item_key(eb, item, &disk_key);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+
+static inline u8 btrfs_key_type(struct btrfs_key *key)
+{
+ return key->type;
+}
+
+static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
+{
+ key->type = val;
+}
+
+/* struct btrfs_header */
+BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
+ generation, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
+ nritems, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
+
+static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ return (btrfs_header_flags(eb) & flag) == flag;
+}
+
+static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+ btrfs_set_header_flags(eb, flags | flag);
+ return (flags & flag) == flag;
+}
+
+static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+ btrfs_set_header_flags(eb, flags & ~flag);
+ return (flags & flag) == flag;
+}
+
+static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
+{
+ u64 flags = btrfs_header_flags(eb);
+ return flags >> BTRFS_BACKREF_REV_SHIFT;
+}
+
+static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
+ int rev)
+{
+ u64 flags = btrfs_header_flags(eb);
+ flags &= ~BTRFS_BACKREF_REV_MASK;
+ flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
+ btrfs_set_header_flags(eb, flags);
+}
+
+static inline unsigned long btrfs_header_fsid(void)
+{
+ return offsetof(struct btrfs_header, fsid);
+}
+
+static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+{
+ return offsetof(struct btrfs_header, chunk_tree_uuid);
+}
+
+static inline int btrfs_is_leaf(struct extent_buffer *eb)
+{
+ return btrfs_header_level(eb) == 0;
+}
+
+/* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
+ last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+ generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
+ ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
+ otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
+ stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
+ rtransid, 64);
+
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
+}
+
+static inline bool btrfs_root_dead(struct btrfs_root *root)
+{
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+ tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+ tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+ tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+ chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+ chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+ extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+ fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+ fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+ fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+ dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+ dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+ dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+ csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+ csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+ csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+ num_devices, 64);
+
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
+}
+
+/* struct btrfs_super_block */
+BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+ struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+ struct btrfs_super_block, chunk_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
+ root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+ chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
+ log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
+ log_root_transid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+ log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
+ sectorsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
+ nodesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
+ stripesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
+ root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+ num_devices, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
+ compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
+ compat_ro_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
+ incompat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+ csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+ cache_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
+BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
+ uuid_tree_generation, 64);
+
+static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+{
+ u16 t = btrfs_super_csum_type(s);
+ /*
+ * csum type is validated at mount time
+ */
+ return btrfs_csum_sizes[t];
+}
+
+static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
+{
+ return offsetof(struct btrfs_leaf, items);
+}
+
+/* struct btrfs_file_extent_item */
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
+ struct btrfs_file_extent_item, disk_bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
+ struct btrfs_file_extent_item, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
+ struct btrfs_file_extent_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
+ struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
+ struct btrfs_file_extent_item, disk_num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
+ struct btrfs_file_extent_item, compression, 8);
+
+static inline unsigned long
+btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
+{
+ return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
+{
+ return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
+}
+
+BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
+ disk_bytenr, 64);
+BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
+ disk_num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
+ offset, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
+ num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+ ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+ compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+ other_encoding, 16);
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers. If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+ struct btrfs_item *e)
+{
+ return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+ int slot,
+ struct btrfs_file_extent_item *fi)
+{
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+ /*
+ * return the space used on disk if this item isn't
+ * compressed or encoded
+ */
+ if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 &&
+ btrfs_token_file_extent_encryption(eb, fi, &token) == 0 &&
+ btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) {
+ return btrfs_file_extent_inline_item_len(eb,
+ btrfs_item_nr(slot));
+ }
+
+ /* otherwise use the ram bytes field */
+ return btrfs_token_file_extent_ram_bytes(eb, fi, &token);
+}
+
+
+/* btrfs_dev_stats_item */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index)
+{
+ u64 val;
+
+ read_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+ return val;
+}
+
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index, u64 val)
+{
+ write_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+}
+
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+ version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
+ rescan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+ rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+ excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+ struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+ rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+ struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+ excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+ struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+ flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+ max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+ max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+ rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+ rsv_excl, 64);
+
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+ replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+ time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+ time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+ num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+ 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+ cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+ cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+ struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+ struct btrfs_dev_replace_item,
+ cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+ struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+ struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+ struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+ struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+ struct btrfs_dev_replace_item,
+ num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+ struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+ struct btrfs_dev_replace_item, cursor_right, 64);
+
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+/* helper function to cast into the data area of the leaf. */
+#define btrfs_item_ptr(leaf, slot, type) \
+ ((type *)(btrfs_leaf_data(leaf) + \
+ btrfs_item_offset_nr(leaf, slot)))
+
+#define btrfs_item_ptr_offset(leaf, slot) \
+ ((unsigned long)(btrfs_leaf_data(leaf) + \
+ btrfs_item_offset_nr(leaf, slot)))
+
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+ return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
+
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+ return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
+/* extent-tree.c */
+
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
+
+static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
+ unsigned num_items)
+{
+ return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ 2 * num_items;
+}
+
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+ unsigned num_items)
+{
+ return root->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count);
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait);
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 offset, int metadata, u64 *refs, u64 *flags);
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+ struct extent_buffer *eb);
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 offset, u64 bytenr);
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info,
+ u64 bytenr);
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int get_block_group_index(struct btrfs_block_group_cache *cache);
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid,
+ struct btrfs_disk_key *key, int level,
+ u64 hint, u64 empty_size);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 root_objectid, u64 owner,
+ u64 offset, struct btrfs_key *ins);
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 root_objectid, u64 owner, u64 offset,
+ struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+ u64 min_alloc_size, u64 empty_size, u64 hint_byte,
+ struct btrfs_key *ins, int is_data, int delalloc);
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref);
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 flags,
+ int level, int is_data);
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int no_quota);
+
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+ int delalloc);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len);
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset, int no_quota);
+
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytes_used,
+ u64 type, u64 chunk_objectid, u64 chunk_offset,
+ u64 size);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 group_start,
+ struct extent_map *em);
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+ /* If we are in the transaction, we can't flush anything.*/
+ BTRFS_RESERVE_NO_FLUSH,
+ /*
+ * Flushing delalloc may cause deadlock somewhere, in this
+ * case, use FLUSH LIMIT
+ */
+ BTRFS_RESERVE_FLUSH_LIMIT,
+ BTRFS_RESERVE_FLUSH_ALL,
+};
+
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+void btrfs_orphan_release_metadata(struct inode *inode);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int nitems,
+ u64 *qgroup_reserved, bool use_global_rsv);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 qgroup_reserved);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_check(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv,
+ u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
+void btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+ u64 start, u64 end);
+int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 type);
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+/* ctree.c */
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+ int level, int *slot);
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
+int btrfs_previous_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid,
+ int type);
+int btrfs_previous_extent_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid);
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key);
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, int lowest_level,
+ u64 min_trans);
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+ struct btrfs_path *path,
+ u64 min_trans);
+enum btrfs_compare_tree_result {
+ BTRFS_COMPARE_TREE_NEW,
+ BTRFS_COMPARE_TREE_DELETED,
+ BTRFS_COMPARE_TREE_CHANGED,
+ BTRFS_COMPARE_TREE_SAME,
+};
+typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ struct btrfs_key *key,
+ enum btrfs_compare_tree_result result,
+ void *ctx);
+int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ btrfs_changed_cb_t cb, void *ctx);
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret);
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer **cow_ret, u64 new_root_objectid);
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+ struct extent_buffer *buf);
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size);
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
+ u32 new_size, int from_end);
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset);
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key);
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key);
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_path *p, int
+ ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int find_higher, int return_any);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *parent,
+ int start_slot, u64 *last_ret,
+ struct btrfs_key *progress);
+void btrfs_release_path(struct btrfs_path *p);
+struct btrfs_path *btrfs_alloc_path(void);
+void btrfs_free_path(struct btrfs_path *p);
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p,
+ struct extent_buffer *held, int held_rw);
+void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
+
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int slot, int nr);
+static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ return btrfs_del_items(trans, root, path, path->slots[0], 1);
+}
+
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr);
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, void *data, u32 data_size);
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size, int nr);
+
+static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u32 data_size)
+{
+ return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+}
+
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq);
+static inline int btrfs_next_old_item(struct btrfs_root *root,
+ struct btrfs_path *p, u64 time_seq)
+{
+ ++p->slots[0];
+ if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+ return btrfs_next_old_leaf(root, p, time_seq);
+ return 0;
+}
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+ return btrfs_next_old_item(root, p, 0);
+}
+int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ int update_ref, int for_reloc);
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent);
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * Get synced with close_ctree()
+ */
+ smp_mb();
+ return fs_info->closing;
+}
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+ return (root->fs_info->sb->s_flags & MS_RDONLY ||
+ btrfs_fs_closing(root->fs_info));
+}
+
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+ kfree(fs_info->balance_ctl);
+ kfree(fs_info->delayed_root);
+ kfree(fs_info->extent_root);
+ kfree(fs_info->tree_root);
+ kfree(fs_info->chunk_root);
+ kfree(fs_info->dev_root);
+ kfree(fs_info->csum_root);
+ kfree(fs_info->quota_root);
+ kfree(fs_info->uuid_root);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ security_free_mnt_opts(&fs_info->security_opts);
+ kfree(fs_info);
+}
+
+/* tree mod log functions from ctree.c */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+
+/* root-item.c */
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ u64 root_id, u64 ref_id);
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+ const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+ const char *name, int name_len);
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_root_item *item);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key);
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node);
+void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+
+/* uuid-tree.c */
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
+ struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid);
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
+ int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
+ u64));
+
+/* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const char *name, int name_len);
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ int name_len, struct inode *dir,
+ struct btrfs_key *location, u8 type, u64 index);
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, int name_len,
+ int mod);
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 objectid, const char *name, int name_len,
+ int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len);
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di);
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len);
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod);
+int verify_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name,
+ int name_len);
+
+/* orphan.c */
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
+
+/* inode-item.c */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod);
+
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+ u64 ref_objectid, const char *name,
+ int name_len,
+ struct btrfs_inode_extref **extref_ret);
+
+/* file-item.c */
+struct btrfs_dio_private;
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len);
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 logical_offset);
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset, u64 ram_bytes,
+ u8 compression, u8 encryption, u16 other_encoding);
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 bytenr, int mod);
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums);
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 file_start, int contig);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit);
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em);
+
+/* inode.c */
+struct btrfs_delalloc_work {
+ struct inode *inode;
+ int wait;
+ int delay_iput;
+ struct completion completion;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+ int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create);
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes);
+
+/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
+#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
+#define ClearPageChecked ClearPageFsMisc
+#define SetPageChecked SetPageFsMisc
+#define PageChecked PageFsMisc
+#endif
+
+/* This forces readahead on a given range of bytes in an inode */
+static inline void btrfs_force_ra(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *file,
+ pgoff_t offset, unsigned long req_size)
+{
+ page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+}
+
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+int btrfs_set_inode_index(struct inode *dir, u64 *index);
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct inode *parent_inode, struct inode *inode,
+ const char *name, int name_len, int add_backref, u64 index);
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int front);
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 new_size,
+ u32 min_type);
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state);
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *new_root,
+ struct btrfs_root *parent_root,
+ u64 new_dirid);
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+int btrfs_readpage(struct file *file, struct page *page);
+void btrfs_evict_inode(struct inode *inode);
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
+struct inode *btrfs_alloc_inode(struct super_block *sb);
+void btrfs_destroy_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
+int btrfs_init_cachep(void);
+void btrfs_destroy_cachep(void);
+long btrfs_ioctl_trans_end(struct file *file);
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *was_new);
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 end,
+ int create);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
+int btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
+void btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_add_delayed_iput(struct inode *inode);
+void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+ struct btrfs_trans_handle *trans, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
+extern const struct dentry_operations btrfs_dentry_operations;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
+#endif
+
+/* ioctl.c */
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+void btrfs_update_iflags(struct inode *inode);
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
+int btrfs_is_empty_uuid(u8 *uuid);
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space);
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs);
+
+
+/* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned);
+extern const struct file_operations btrfs_file_operations;
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache,
+ int replace_extent,
+ u32 extent_item_size,
+ int *key_inserted);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 start, u64 end);
+int btrfs_release_file(struct inode *inode, struct file *file);
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+ struct page **pages, size_t num_pages,
+ loff_t pos, size_t write_bytes,
+ struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+
+/* sysfs.c */
+int btrfs_init_sysfs(void);
+void btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
+
+/* xattr.c */
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+/* super.c */
+int btrfs_parse_options(struct btrfs_root *root, char *options);
+int btrfs_sync_fs(struct super_block *sb, int wait);
+
+#ifdef CONFIG_PRINTK
+__printf(2, 3)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+#else
+static inline __printf(2, 3)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+}
+#endif
+
+#define btrfs_emerg(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+
+#ifdef DEBUG
+#define btrfs_debug(fs_info, fmt, args...) \
+ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#else
+#define btrfs_debug(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#endif
+
+#ifdef CONFIG_BTRFS_ASSERT
+
+static inline void assfail(char *expr, char *file, int line)
+{
+ pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
+ expr, file, line);
+ BUG();
+}
+
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+#else
+#define ASSERT(expr) ((void)0)
+#endif
+
+#define btrfs_assert()
+__printf(5, 6)
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+
+
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *function,
+ unsigned int line, int errno);
+
+#define btrfs_set_fs_incompat(__fs_info, opt) \
+ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info, "setting %llu feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_fs_incompat(fs_info, opt) \
+ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_incompat_flags(disk_super) & flag);
+}
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
+#define btrfs_abort_transaction(trans, root, errno) \
+do { \
+ __btrfs_abort_transaction(trans, root, __func__, \
+ __LINE__, errno); \
+} while (0)
+
+#define btrfs_std_error(fs_info, errno) \
+do { \
+ if ((errno)) \
+ __btrfs_std_error((fs_info), __func__, \
+ __LINE__, (errno), NULL); \
+} while (0)
+
+#define btrfs_error(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_std_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
+} while (0)
+
+__printf(5, 6)
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...);
+
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...) \
+do { \
+ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ BUG(); \
+} while (0)
+
+/* acl.c */
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
+int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir);
+#else
+#define btrfs_get_acl NULL
+#define btrfs_set_acl NULL
+static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir)
+{
+ return 0;
+}
+#endif
+
+/* relocation.c */
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+
+/* scrub.c */
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace);
+void btrfs_scrub_pause(struct btrfs_root *root);
+void btrfs_scrub_continue(struct btrfs_root *root);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+ struct btrfs_device *dev);
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+ struct btrfs_scrub_progress *progress);
+
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ btrfs_bio_counter_sub(fs_info, 1);
+}
+
+/* reada.c */
+struct reada_control {
+ struct btrfs_root *root; /* tree to prefetch */
+ struct btrfs_key key_start;
+ struct btrfs_key key_end; /* exclusive */
+ atomic_t elems;
+ struct kref refcnt;
+ wait_queue_head_t wait;
+};
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+ struct btrfs_key *start, struct btrfs_key *end);
+int btrfs_reada_wait(void *handle);
+void btrfs_reada_detach(void *handle);
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err);
+
+static inline int is_fstree(u64 rootid)
+{
+ if (rootid == BTRFS_FS_TREE_OBJECTID ||
+ ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
+ !btrfs_qgroup_level(rootid)))
+ return 1;
+ return 0;
+}
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+ return signal_pending(current);
+}
+
+/* Sanity test specific functions */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode);
+#endif
+
+static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 1;
+#endif
+ return 0;
+}
+
+#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
new file mode 100644
index 000000000..a2ae42720
--- /dev/null
+++ b/fs/btrfs/delayed-inode.c
@@ -0,0 +1,1997 @@
+/*
+ * Copyright (C) 2011 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "delayed-inode.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "ctree.h"
+
+#define BTRFS_DELAYED_WRITEBACK 512
+#define BTRFS_DELAYED_BACKGROUND 128
+#define BTRFS_DELAYED_BATCH 16
+
+static struct kmem_cache *delayed_node_cache;
+
+int __init btrfs_delayed_inode_init(void)
+{
+ delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
+ sizeof(struct btrfs_delayed_node),
+ 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!delayed_node_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_delayed_inode_exit(void)
+{
+ if (delayed_node_cache)
+ kmem_cache_destroy(delayed_node_cache);
+}
+
+static inline void btrfs_init_delayed_node(
+ struct btrfs_delayed_node *delayed_node,
+ struct btrfs_root *root, u64 inode_id)
+{
+ delayed_node->root = root;
+ delayed_node->inode_id = inode_id;
+ atomic_set(&delayed_node->refs, 0);
+ delayed_node->count = 0;
+ delayed_node->flags = 0;
+ delayed_node->ins_root = RB_ROOT;
+ delayed_node->del_root = RB_ROOT;
+ mutex_init(&delayed_node->mutex);
+ delayed_node->index_cnt = 0;
+ INIT_LIST_HEAD(&delayed_node->n_list);
+ INIT_LIST_HEAD(&delayed_node->p_list);
+ delayed_node->bytes_reserved = 0;
+ memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
+}
+
+static inline int btrfs_is_continuous_delayed_item(
+ struct btrfs_delayed_item *item1,
+ struct btrfs_delayed_item *item2)
+{
+ if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
+ item1->key.objectid == item2->key.objectid &&
+ item1->key.type == item2->key.type &&
+ item1->key.offset + 1 == item2->key.offset)
+ return 1;
+ return 0;
+}
+
+static inline struct btrfs_delayed_root *btrfs_get_delayed_root(
+ struct btrfs_root *root)
+{
+ return root->fs_info->delayed_root;
+}
+
+static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
+{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+ struct btrfs_root *root = btrfs_inode->root;
+ u64 ino = btrfs_ino(inode);
+ struct btrfs_delayed_node *node;
+
+ node = ACCESS_ONCE(btrfs_inode->delayed_node);
+ if (node) {
+ atomic_inc(&node->refs);
+ return node;
+ }
+
+ spin_lock(&root->inode_lock);
+ node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ if (node) {
+ if (btrfs_inode->delayed_node) {
+ atomic_inc(&node->refs); /* can be accessed */
+ BUG_ON(btrfs_inode->delayed_node != node);
+ spin_unlock(&root->inode_lock);
+ return node;
+ }
+ btrfs_inode->delayed_node = node;
+ /* can be accessed and cached in the inode */
+ atomic_add(2, &node->refs);
+ spin_unlock(&root->inode_lock);
+ return node;
+ }
+ spin_unlock(&root->inode_lock);
+
+ return NULL;
+}
+
+/* Will return either the node or PTR_ERR(-ENOMEM) */
+static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
+ struct inode *inode)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+ struct btrfs_root *root = btrfs_inode->root;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+again:
+ node = btrfs_get_delayed_node(inode);
+ if (node)
+ return node;
+
+ node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
+
+ /* cached in the btrfs inode and can be accessed */
+ atomic_add(2, &node->refs);
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret) {
+ kmem_cache_free(delayed_node_cache, node);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&root->inode_lock);
+ ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+ if (ret == -EEXIST) {
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
+ radix_tree_preload_end();
+ goto again;
+ }
+ btrfs_inode->delayed_node = node;
+ spin_unlock(&root->inode_lock);
+ radix_tree_preload_end();
+
+ return node;
+}
+
+/*
+ * Call it when holding delayed_node->mutex
+ *
+ * If mod = 1, add this node into the prepared list.
+ */
+static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
+ struct btrfs_delayed_node *node,
+ int mod)
+{
+ spin_lock(&root->lock);
+ if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ if (!list_empty(&node->p_list))
+ list_move_tail(&node->p_list, &root->prepare_list);
+ else if (mod)
+ list_add_tail(&node->p_list, &root->prepare_list);
+ } else {
+ list_add_tail(&node->n_list, &root->node_list);
+ list_add_tail(&node->p_list, &root->prepare_list);
+ atomic_inc(&node->refs); /* inserted into list */
+ root->nodes++;
+ set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
+ }
+ spin_unlock(&root->lock);
+}
+
+/* Call it when holding delayed_node->mutex */
+static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
+ struct btrfs_delayed_node *node)
+{
+ spin_lock(&root->lock);
+ if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ root->nodes--;
+ atomic_dec(&node->refs); /* not in the list */
+ list_del_init(&node->n_list);
+ if (!list_empty(&node->p_list))
+ list_del_init(&node->p_list);
+ clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
+ }
+ spin_unlock(&root->lock);
+}
+
+static struct btrfs_delayed_node *btrfs_first_delayed_node(
+ struct btrfs_delayed_root *delayed_root)
+{
+ struct list_head *p;
+ struct btrfs_delayed_node *node = NULL;
+
+ spin_lock(&delayed_root->lock);
+ if (list_empty(&delayed_root->node_list))
+ goto out;
+
+ p = delayed_root->node_list.next;
+ node = list_entry(p, struct btrfs_delayed_node, n_list);
+ atomic_inc(&node->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return node;
+}
+
+static struct btrfs_delayed_node *btrfs_next_delayed_node(
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct list_head *p;
+ struct btrfs_delayed_node *next = NULL;
+
+ delayed_root = node->root->fs_info->delayed_root;
+ spin_lock(&delayed_root->lock);
+ if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ /* not in the list */
+ if (list_empty(&delayed_root->node_list))
+ goto out;
+ p = delayed_root->node_list.next;
+ } else if (list_is_last(&node->n_list, &delayed_root->node_list))
+ goto out;
+ else
+ p = node->n_list.next;
+
+ next = list_entry(p, struct btrfs_delayed_node, n_list);
+ atomic_inc(&next->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return next;
+}
+
+static void __btrfs_release_delayed_node(
+ struct btrfs_delayed_node *delayed_node,
+ int mod)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ if (!delayed_node)
+ return;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+
+ mutex_lock(&delayed_node->mutex);
+ if (delayed_node->count)
+ btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
+ else
+ btrfs_dequeue_delayed_node(delayed_root, delayed_node);
+ mutex_unlock(&delayed_node->mutex);
+
+ if (atomic_dec_and_test(&delayed_node->refs)) {
+ bool free = false;
+ struct btrfs_root *root = delayed_node->root;
+ spin_lock(&root->inode_lock);
+ if (atomic_read(&delayed_node->refs) == 0) {
+ radix_tree_delete(&root->delayed_nodes_tree,
+ delayed_node->inode_id);
+ free = true;
+ }
+ spin_unlock(&root->inode_lock);
+ if (free)
+ kmem_cache_free(delayed_node_cache, delayed_node);
+ }
+}
+
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+{
+ __btrfs_release_delayed_node(node, 0);
+}
+
+static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
+ struct btrfs_delayed_root *delayed_root)
+{
+ struct list_head *p;
+ struct btrfs_delayed_node *node = NULL;
+
+ spin_lock(&delayed_root->lock);
+ if (list_empty(&delayed_root->prepare_list))
+ goto out;
+
+ p = delayed_root->prepare_list.next;
+ list_del_init(p);
+ node = list_entry(p, struct btrfs_delayed_node, p_list);
+ atomic_inc(&node->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return node;
+}
+
+static inline void btrfs_release_prepared_delayed_node(
+ struct btrfs_delayed_node *node)
+{
+ __btrfs_release_delayed_node(node, 1);
+}
+
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+{
+ struct btrfs_delayed_item *item;
+ item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
+ if (item) {
+ item->data_len = data_len;
+ item->ins_or_del = 0;
+ item->bytes_reserved = 0;
+ item->delayed_node = NULL;
+ atomic_set(&item->refs, 1);
+ }
+ return item;
+}
+
+/*
+ * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * @delayed_node: pointer to the delayed node
+ * @key: the key to look up
+ * @prev: used to store the prev item if the right item isn't found
+ * @next: used to store the next item if the right item isn't found
+ *
+ * Note: if we don't find the right item, we will return the prev item and
+ * the next item.
+ */
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
+ struct rb_root *root,
+ struct btrfs_key *key,
+ struct btrfs_delayed_item **prev,
+ struct btrfs_delayed_item **next)
+{
+ struct rb_node *node, *prev_node = NULL;
+ struct btrfs_delayed_item *delayed_item = NULL;
+ int ret = 0;
+
+ node = root->rb_node;
+
+ while (node) {
+ delayed_item = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ prev_node = node;
+ ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
+ if (ret < 0)
+ node = node->rb_right;
+ else if (ret > 0)
+ node = node->rb_left;
+ else
+ return delayed_item;
+ }
+
+ if (prev) {
+ if (!prev_node)
+ *prev = NULL;
+ else if (ret < 0)
+ *prev = delayed_item;
+ else if ((node = rb_prev(prev_node)) != NULL) {
+ *prev = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ } else
+ *prev = NULL;
+ }
+
+ if (next) {
+ if (!prev_node)
+ *next = NULL;
+ else if (ret > 0)
+ *next = delayed_item;
+ else if ((node = rb_next(prev_node)) != NULL) {
+ *next = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ } else
+ *next = NULL;
+ }
+ return NULL;
+}
+
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
+ struct btrfs_delayed_node *delayed_node,
+ struct btrfs_key *key)
+{
+ struct btrfs_delayed_item *item;
+
+ item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+ NULL, NULL);
+ return item;
+}
+
+static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
+ struct btrfs_delayed_item *ins,
+ int action)
+{
+ struct rb_node **p, *node;
+ struct rb_node *parent_node = NULL;
+ struct rb_root *root;
+ struct btrfs_delayed_item *item;
+ int cmp;
+
+ if (action == BTRFS_DELAYED_INSERTION_ITEM)
+ root = &delayed_node->ins_root;
+ else if (action == BTRFS_DELAYED_DELETION_ITEM)
+ root = &delayed_node->del_root;
+ else
+ BUG();
+ p = &root->rb_node;
+ node = &ins->rb_node;
+
+ while (*p) {
+ parent_node = *p;
+ item = rb_entry(parent_node, struct btrfs_delayed_item,
+ rb_node);
+
+ cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
+ if (cmp < 0)
+ p = &(*p)->rb_right;
+ else if (cmp > 0)
+ p = &(*p)->rb_left;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ ins->delayed_node = delayed_node;
+ ins->ins_or_del = action;
+
+ if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
+ action == BTRFS_DELAYED_INSERTION_ITEM &&
+ ins->key.offset >= delayed_node->index_cnt)
+ delayed_node->index_cnt = ins->key.offset + 1;
+
+ delayed_node->count++;
+ atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
+ return 0;
+}
+
+static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
+ struct btrfs_delayed_item *item)
+{
+ return __btrfs_add_delayed_item(node, item,
+ BTRFS_DELAYED_INSERTION_ITEM);
+}
+
+static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
+ struct btrfs_delayed_item *item)
+{
+ return __btrfs_add_delayed_item(node, item,
+ BTRFS_DELAYED_DELETION_ITEM);
+}
+
+static void finish_one_item(struct btrfs_delayed_root *delayed_root)
+{
+ int seq = atomic_inc_return(&delayed_root->items_seq);
+ if ((atomic_dec_return(&delayed_root->items) <
+ BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
+ waitqueue_active(&delayed_root->wait))
+ wake_up(&delayed_root->wait);
+}
+
+static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
+{
+ struct rb_root *root;
+ struct btrfs_delayed_root *delayed_root;
+
+ delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
+
+ BUG_ON(!delayed_root);
+ BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
+ delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
+
+ if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ root = &delayed_item->delayed_node->ins_root;
+ else
+ root = &delayed_item->delayed_node->del_root;
+
+ rb_erase(&delayed_item->rb_node, root);
+ delayed_item->delayed_node->count--;
+
+ finish_one_item(delayed_root);
+}
+
+static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
+{
+ if (item) {
+ __btrfs_remove_delayed_item(item);
+ if (atomic_dec_and_test(&item->refs))
+ kfree(item);
+ }
+}
+
+static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
+ struct btrfs_delayed_node *delayed_node)
+{
+ struct rb_node *p;
+ struct btrfs_delayed_item *item = NULL;
+
+ p = rb_first(&delayed_node->ins_root);
+ if (p)
+ item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+ return item;
+}
+
+static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
+ struct btrfs_delayed_node *delayed_node)
+{
+ struct rb_node *p;
+ struct btrfs_delayed_item *item = NULL;
+
+ p = rb_first(&delayed_node->del_root);
+ if (p)
+ item = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+ return item;
+}
+
+static struct btrfs_delayed_item *__btrfs_next_delayed_item(
+ struct btrfs_delayed_item *item)
+{
+ struct rb_node *p;
+ struct btrfs_delayed_item *next = NULL;
+
+ p = rb_next(&item->rb_node);
+ if (p)
+ next = rb_entry(p, struct btrfs_delayed_item, rb_node);
+
+ return next;
+}
+
+static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_block_rsv *src_rsv;
+ struct btrfs_block_rsv *dst_rsv;
+ u64 num_bytes;
+ int ret;
+
+ if (!trans->bytes_reserved)
+ return 0;
+
+ src_rsv = trans->block_rsv;
+ dst_rsv = &root->fs_info->delayed_block_rsv;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid,
+ num_bytes, 1);
+ item->bytes_reserved = num_bytes;
+ }
+
+ return ret;
+}
+
+static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_block_rsv *rsv;
+
+ if (!item->bytes_reserved)
+ return;
+
+ rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid, item->bytes_reserved,
+ 0);
+ btrfs_block_rsv_release(root, rsv,
+ item->bytes_reserved);
+}
+
+static int btrfs_delayed_inode_reserve_metadata(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_block_rsv *src_rsv;
+ struct btrfs_block_rsv *dst_rsv;
+ u64 num_bytes;
+ int ret;
+ bool release = false;
+
+ src_rsv = trans->block_rsv;
+ dst_rsv = &root->fs_info->delayed_block_rsv;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+ /*
+ * btrfs_dirty_inode will update the inode under btrfs_join_transaction
+ * which doesn't reserve space for speed. This is a problem since we
+ * still need to reserve space for this update, so try to reserve the
+ * space.
+ *
+ * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
+ * we're accounted for.
+ */
+ if (!src_rsv || (!trans->bytes_reserved &&
+ src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
+ /*
+ * Since we're under a transaction reserve_metadata_bytes could
+ * try to commit the transaction which will make it return
+ * EAGAIN to make us stop the transaction we have, so return
+ * ENOSPC instead so that btrfs_dirty_inode knows what to do.
+ */
+ if (ret == -EAGAIN)
+ ret = -ENOSPC;
+ if (!ret) {
+ node->bytes_reserved = num_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "delayed_inode",
+ btrfs_ino(inode),
+ num_bytes, 1);
+ }
+ return ret;
+ } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
+ spin_unlock(&BTRFS_I(inode)->lock);
+ release = true;
+ goto migrate;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ /* Ok we didn't have space pre-reserved. This shouldn't happen
+ * too often but it can happen if we do delalloc to an existing
+ * inode which gets dirtied because of the time update, and then
+ * isn't touched again until after the transaction commits and
+ * then we try to write out the data. First try to be nice and
+ * reserve something strictly for us. If not be a pain and try
+ * to steal from the delalloc block rsv.
+ */
+ ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ goto out;
+
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ if (!WARN_ON(ret))
+ goto out;
+
+ /*
+ * Ok this is a problem, let's just steal from the global rsv
+ * since this really shouldn't happen that often.
+ */
+ ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+ dst_rsv, num_bytes);
+ goto out;
+ }
+
+migrate:
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+
+out:
+ /*
+ * Migrate only takes a reservation, it doesn't touch the size of the
+ * block_rsv. This is to simplify people who don't normally have things
+ * migrated from their block rsv. If they go to release their
+ * reservation, that will decrease the size as well, so if migrate
+ * reduced size we'd end up with a negative size. But for the
+ * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+ * but we could in fact do this reserve/migrate dance several times
+ * between the time we did the original reservation and we'd clean it
+ * up. So to take care of this, release the space for the meta
+ * reservation here. I think it may be time for a documentation page on
+ * how block rsvs. work.
+ */
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ btrfs_ino(inode), num_bytes, 1);
+ node->bytes_reserved = num_bytes;
+ }
+
+ if (release) {
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), num_bytes, 0);
+ btrfs_block_rsv_release(root, src_rsv, num_bytes);
+ }
+
+ return ret;
+}
+
+static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_block_rsv *rsv;
+
+ if (!node->bytes_reserved)
+ return;
+
+ rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ node->inode_id, node->bytes_reserved, 0);
+ btrfs_block_rsv_release(root, rsv,
+ node->bytes_reserved);
+ node->bytes_reserved = 0;
+}
+
+/*
+ * This helper will insert some continuous items into the same leaf according
+ * to the free space of the leaf.
+ */
+static int btrfs_batch_insert_items(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_delayed_item *curr, *next;
+ int free_space;
+ int total_data_size = 0, total_size = 0;
+ struct extent_buffer *leaf;
+ char *data_ptr;
+ struct btrfs_key *keys;
+ u32 *data_size;
+ struct list_head head;
+ int slot;
+ int nitems;
+ int i;
+ int ret = 0;
+
+ BUG_ON(!path->nodes[0]);
+
+ leaf = path->nodes[0];
+ free_space = btrfs_leaf_free_space(root, leaf);
+ INIT_LIST_HEAD(&head);
+
+ next = item;
+ nitems = 0;
+
+ /*
+ * count the number of the continuous items that we can insert in batch
+ */
+ while (total_size + next->data_len + sizeof(struct btrfs_item) <=
+ free_space) {
+ total_data_size += next->data_len;
+ total_size += next->data_len + sizeof(struct btrfs_item);
+ list_add_tail(&next->tree_list, &head);
+ nitems++;
+
+ curr = next;
+ next = __btrfs_next_delayed_item(curr);
+ if (!next)
+ break;
+
+ if (!btrfs_is_continuous_delayed_item(curr, next))
+ break;
+ }
+
+ if (!nitems) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * we need allocate some memory space, but it might cause the task
+ * to sleep, so we set all locked nodes in the path to blocking locks
+ * first.
+ */
+ btrfs_set_path_blocking(path);
+
+ keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
+ if (!keys) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
+ if (!data_size) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* get keys of all the delayed items */
+ i = 0;
+ list_for_each_entry(next, &head, tree_list) {
+ keys[i] = next->key;
+ data_size[i] = next->data_len;
+ i++;
+ }
+
+ /* reset all the locked nodes in the patch to spinning locks. */
+ btrfs_clear_path_blocking(path, NULL, 0);
+
+ /* insert the keys of the items */
+ setup_items_for_insert(root, path, keys, data_size,
+ total_data_size, total_size, nitems);
+
+ /* insert the dir index items */
+ slot = path->slots[0];
+ list_for_each_entry_safe(curr, next, &head, tree_list) {
+ data_ptr = btrfs_item_ptr(leaf, slot, char);
+ write_extent_buffer(leaf, &curr->data,
+ (unsigned long)data_ptr,
+ curr->data_len);
+ slot++;
+
+ btrfs_delayed_item_release_metadata(root, curr);
+
+ list_del(&curr->tree_list);
+ btrfs_release_delayed_item(curr);
+ }
+
+error:
+ kfree(data_size);
+ kfree(keys);
+out:
+ return ret;
+}
+
+/*
+ * This helper can just do simple insertion that needn't extend item for new
+ * data, such as directory name index insertion, inode insertion.
+ */
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *delayed_item)
+{
+ struct extent_buffer *leaf;
+ char *ptr;
+ int ret;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
+ delayed_item->data_len);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+
+ leaf = path->nodes[0];
+
+ ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+
+ write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
+ delayed_item->data_len);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_delayed_item_release_metadata(root, delayed_item);
+ return 0;
+}
+
+/*
+ * we insert an item first, then if there are some continuous items, we try
+ * to insert those items into the same leaf.
+ */
+static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_item *curr, *prev;
+ int ret = 0;
+
+do_again:
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_insertion_item(node);
+ if (!curr)
+ goto insert_end;
+
+ ret = btrfs_insert_delayed_item(trans, root, path, curr);
+ if (ret < 0) {
+ btrfs_release_path(path);
+ goto insert_end;
+ }
+
+ prev = curr;
+ curr = __btrfs_next_delayed_item(prev);
+ if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
+ /* insert the continuous items into the same leaf */
+ path->slots[0]++;
+ btrfs_batch_insert_items(root, path, curr);
+ }
+ btrfs_release_delayed_item(prev);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ goto do_again;
+
+insert_end:
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_delayed_item *curr, *next;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct list_head head;
+ int nitems, i, last_item;
+ int ret = 0;
+
+ BUG_ON(!path->nodes[0]);
+
+ leaf = path->nodes[0];
+
+ i = path->slots[0];
+ last_item = btrfs_header_nritems(leaf) - 1;
+ if (i > last_item)
+ return -ENOENT; /* FIXME: Is errno suitable? */
+
+ next = item;
+ INIT_LIST_HEAD(&head);
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ nitems = 0;
+ /*
+ * count the number of the dir index items that we can delete in batch
+ */
+ while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
+ list_add_tail(&next->tree_list, &head);
+ nitems++;
+
+ curr = next;
+ next = __btrfs_next_delayed_item(curr);
+ if (!next)
+ break;
+
+ if (!btrfs_is_continuous_delayed_item(curr, next))
+ break;
+
+ i++;
+ if (i > last_item)
+ break;
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ }
+
+ if (!nitems)
+ return 0;
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
+ if (ret)
+ goto out;
+
+ list_for_each_entry_safe(curr, next, &head, tree_list) {
+ btrfs_delayed_item_release_metadata(root, curr);
+ list_del(&curr->tree_list);
+ btrfs_release_delayed_item(curr);
+ }
+
+out:
+ return ret;
+}
+
+static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_item *curr, *prev;
+ int ret = 0;
+
+do_again:
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_deletion_item(node);
+ if (!curr)
+ goto delete_fail;
+
+ ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
+ if (ret < 0)
+ goto delete_fail;
+ else if (ret > 0) {
+ /*
+ * can't find the item which the node points to, so this node
+ * is invalid, just drop it.
+ */
+ prev = curr;
+ curr = __btrfs_next_delayed_item(prev);
+ btrfs_release_delayed_item(prev);
+ ret = 0;
+ btrfs_release_path(path);
+ if (curr) {
+ mutex_unlock(&node->mutex);
+ goto do_again;
+ } else
+ goto delete_fail;
+ }
+
+ btrfs_batch_delete_items(trans, root, path, curr);
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ goto do_again;
+
+delete_fail:
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ if (delayed_node &&
+ test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ BUG_ON(!delayed_node->root);
+ clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+ }
+}
+
+static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ ASSERT(delayed_node->root);
+ clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+}
+
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_key key;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ int mod;
+ int ret;
+
+ key.objectid = node->inode_id;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+ mod = -1;
+ else
+ mod = 1;
+
+ ret = btrfs_lookup_inode(trans, root, path, &key, mod);
+ if (ret > 0) {
+ btrfs_release_path(path);
+ return -ENOENT;
+ } else if (ret < 0) {
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+ write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
+ sizeof(struct btrfs_inode_item));
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+ goto no_iref;
+
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(leaf))
+ goto search;
+again:
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != node->inode_id)
+ goto out;
+
+ if (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ goto out;
+
+ /*
+ * Delayed iref deletion is for the inode who has only one link,
+ * so there is only one iref. The case that several irefs are
+ * in the same item doesn't exist.
+ */
+ btrfs_del_item(trans, root, path);
+out:
+ btrfs_release_delayed_iref(node);
+no_iref:
+ btrfs_release_path(path);
+err_out:
+ btrfs_delayed_inode_release_metadata(root, node);
+ btrfs_release_delayed_inode(node);
+
+ return ret;
+
+search:
+ btrfs_release_path(path);
+
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = -1;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto err_out;
+ ASSERT(ret);
+
+ ret = 0;
+ leaf = path->nodes[0];
+ path->slots[0]--;
+ goto again;
+}
+
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int ret;
+
+ mutex_lock(&node->mutex);
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
+ mutex_unlock(&node->mutex);
+ return 0;
+ }
+
+ ret = __btrfs_update_delayed_inode(trans, root, path, node);
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ int ret;
+
+ ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+ if (ret)
+ return ret;
+
+ ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+ return ret;
+}
+
+/*
+ * Called when committing the transaction.
+ * Returns 0 on success.
+ * Returns < 0 on error and returns with an aborted transaction with any
+ * outstanding delayed items cleaned up.
+ */
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
+ int ret = 0;
+ bool count = (nr > 0);
+
+ if (trans->aborted)
+ return -EIO;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ curr_node = btrfs_first_delayed_node(delayed_root);
+ while (curr_node && (!count || (count && nr--))) {
+ ret = __btrfs_commit_inode_delayed_items(trans, path,
+ curr_node);
+ if (ret) {
+ btrfs_release_delayed_node(curr_node);
+ curr_node = NULL;
+ btrfs_abort_transaction(trans, root, ret);
+ break;
+ }
+
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ btrfs_release_delayed_node(prev_node);
+ }
+
+ if (curr_node)
+ btrfs_release_delayed_node(curr_node);
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+
+ return ret;
+}
+
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_run_delayed_items(trans, root, -1);
+}
+
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr)
+{
+ return __btrfs_run_delayed_items(trans, root, nr);
+}
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ if (!delayed_node)
+ return 0;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!delayed_node->count) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ btrfs_release_delayed_node(delayed_node);
+ return -ENOMEM;
+ }
+ path->leave_spinning = 1;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+ ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+
+ btrfs_release_delayed_node(delayed_node);
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+
+ return ret;
+}
+
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ if (!delayed_node)
+ return 0;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+ trans = btrfs_join_transaction(delayed_node->root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto trans_out;
+ }
+ path->leave_spinning = 1;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
+ ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+ path, delayed_node);
+ else
+ ret = 0;
+ mutex_unlock(&delayed_node->mutex);
+
+ btrfs_free_path(path);
+ trans->block_rsv = block_rsv;
+trans_out:
+ btrfs_end_transaction(trans, delayed_node->root);
+ btrfs_btree_balance_dirty(delayed_node->root);
+out:
+ btrfs_release_delayed_node(delayed_node);
+
+ return ret;
+}
+
+void btrfs_remove_delayed_node(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node);
+ if (!delayed_node)
+ return;
+
+ BTRFS_I(inode)->delayed_node = NULL;
+ btrfs_release_delayed_node(delayed_node);
+}
+
+struct btrfs_async_delayed_work {
+ struct btrfs_delayed_root *delayed_root;
+ int nr;
+ struct btrfs_work work;
+};
+
+static void btrfs_async_run_delayed_root(struct btrfs_work *work)
+{
+ struct btrfs_async_delayed_work *async_work;
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_delayed_node *delayed_node = NULL;
+ struct btrfs_root *root;
+ struct btrfs_block_rsv *block_rsv;
+ int total_done = 0;
+
+ async_work = container_of(work, struct btrfs_async_delayed_work, work);
+ delayed_root = async_work->delayed_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+
+again:
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
+ goto free_path;
+
+ delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+ if (!delayed_node)
+ goto free_path;
+
+ path->leave_spinning = 1;
+ root = delayed_node->root;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ goto release_path;
+
+ block_rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->delayed_block_rsv;
+
+ __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+
+ trans->block_rsv = block_rsv;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty_nodelay(root);
+
+release_path:
+ btrfs_release_path(path);
+ total_done++;
+
+ btrfs_release_prepared_delayed_node(delayed_node);
+ if (async_work->nr == 0 || total_done < async_work->nr)
+ goto again;
+
+free_path:
+ btrfs_free_path(path);
+out:
+ wake_up(&delayed_root->wait);
+ kfree(async_work);
+}
+
+
+static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
+ struct btrfs_fs_info *fs_info, int nr)
+{
+ struct btrfs_async_delayed_work *async_work;
+
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ return 0;
+
+ async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
+ if (!async_work)
+ return -ENOMEM;
+
+ async_work->delayed_root = delayed_root;
+ btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
+ btrfs_async_run_delayed_root, NULL, NULL);
+ async_work->nr = nr;
+
+ btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
+ return 0;
+}
+
+void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+ delayed_root = btrfs_get_delayed_root(root);
+ WARN_ON(btrfs_first_delayed_node(delayed_root));
+}
+
+static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
+{
+ int val = atomic_read(&delayed_root->items_seq);
+
+ if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
+ return 1;
+
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ return 1;
+
+ return 0;
+}
+
+void btrfs_balance_delayed_items(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ return;
+
+ if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
+ int seq;
+ int ret;
+
+ seq = atomic_read(&delayed_root->items_seq);
+
+ ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
+ if (ret)
+ return;
+
+ wait_event_interruptible(delayed_root->wait,
+ could_end_wait(delayed_root, seq));
+ return;
+ }
+
+ btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
+}
+
+/* Will return 0 or -ENOMEM */
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ int name_len, struct inode *dir,
+ struct btrfs_disk_key *disk_key, u8 type,
+ u64 index)
+{
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_delayed_item *delayed_item;
+ struct btrfs_dir_item *dir_item;
+ int ret;
+
+ delayed_node = btrfs_get_or_create_delayed_node(dir);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+ if (!delayed_item) {
+ ret = -ENOMEM;
+ goto release_node;
+ }
+
+ delayed_item->key.objectid = btrfs_ino(dir);
+ delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
+ delayed_item->key.offset = index;
+
+ dir_item = (struct btrfs_dir_item *)delayed_item->data;
+ dir_item->location = *disk_key;
+ btrfs_set_stack_dir_transid(dir_item, trans->transid);
+ btrfs_set_stack_dir_data_len(dir_item, 0);
+ btrfs_set_stack_dir_name_len(dir_item, name_len);
+ btrfs_set_stack_dir_type(dir_item, type);
+ memcpy((char *)(dir_item + 1), name, name_len);
+
+ ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+ /*
+ * we have reserved enough space when we start a new transaction,
+ * so reserving metadata failure is impossible
+ */
+ BUG_ON(ret);
+
+
+ mutex_lock(&delayed_node->mutex);
+ ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+ if (unlikely(ret)) {
+ btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) "
+ "into the insertion tree of the delayed node"
+ "(root id: %llu, inode id: %llu, errno: %d)",
+ name_len, name, delayed_node->root->objectid,
+ delayed_node->inode_id, ret);
+ BUG();
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+release_node:
+ btrfs_release_delayed_node(delayed_node);
+ return ret;
+}
+
+static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
+ struct btrfs_delayed_node *node,
+ struct btrfs_key *key)
+{
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_lookup_delayed_insertion_item(node, key);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ return 1;
+ }
+
+ btrfs_delayed_item_release_metadata(root, item);
+ btrfs_release_delayed_item(item);
+ mutex_unlock(&node->mutex);
+ return 0;
+}
+
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *dir,
+ u64 index)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ struct btrfs_key item_key;
+ int ret;
+
+ node = btrfs_get_or_create_delayed_node(dir);
+ if (IS_ERR(node))
+ return PTR_ERR(node);
+
+ item_key.objectid = btrfs_ino(dir);
+ item_key.type = BTRFS_DIR_INDEX_KEY;
+ item_key.offset = index;
+
+ ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
+ if (!ret)
+ goto end;
+
+ item = btrfs_alloc_delayed_item(0);
+ if (!item) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ item->key = item_key;
+
+ ret = btrfs_delayed_item_reserve_metadata(trans, root, item);
+ /*
+ * we have reserved enough space when we start a new transaction,
+ * so reserving metadata failure is impossible.
+ */
+ BUG_ON(ret);
+
+ mutex_lock(&node->mutex);
+ ret = __btrfs_add_delayed_deletion_item(node, item);
+ if (unlikely(ret)) {
+ btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) "
+ "into the deletion tree of the delayed node"
+ "(root id: %llu, inode id: %llu, errno: %d)",
+ index, node->root->objectid, node->inode_id,
+ ret);
+ BUG();
+ }
+ mutex_unlock(&node->mutex);
+end:
+ btrfs_release_delayed_node(node);
+ return ret;
+}
+
+int btrfs_inode_delayed_dir_index_count(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+
+ if (!delayed_node)
+ return -ENOENT;
+
+ /*
+ * Since we have held i_mutex of this directory, it is impossible that
+ * a new directory index is added into the delayed node and index_cnt
+ * is updated now. So we needn't lock the delayed node.
+ */
+ if (!delayed_node->index_cnt) {
+ btrfs_release_delayed_node(delayed_node);
+ return -EINVAL;
+ }
+
+ BTRFS_I(inode)->index_cnt = delayed_node->index_cnt;
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_delayed_item *item;
+
+ delayed_node = btrfs_get_delayed_node(inode);
+ if (!delayed_node)
+ return;
+
+ mutex_lock(&delayed_node->mutex);
+ item = __btrfs_first_delayed_insertion_item(delayed_node);
+ while (item) {
+ atomic_inc(&item->refs);
+ list_add_tail(&item->readdir_list, ins_list);
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(delayed_node);
+ while (item) {
+ atomic_inc(&item->refs);
+ list_add_tail(&item->readdir_list, del_list);
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&delayed_node->mutex);
+ /*
+ * This delayed node is still cached in the btrfs inode, so refs
+ * must be > 1 now, and we needn't check it is going to be freed
+ * or not.
+ *
+ * Besides that, this function is used to read dir, we do not
+ * insert/delete delayed items in this period. So we also needn't
+ * requeue or dequeue this delayed node.
+ */
+ atomic_dec(&delayed_node->refs);
+}
+
+void btrfs_put_delayed_items(struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_item *curr, *next;
+
+ list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ list_del(&curr->readdir_list);
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+ }
+
+ list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+ list_del(&curr->readdir_list);
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+ }
+}
+
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+ u64 index)
+{
+ struct btrfs_delayed_item *curr, *next;
+ int ret;
+
+ if (list_empty(del_list))
+ return 0;
+
+ list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+ if (curr->key.offset > index)
+ break;
+
+ list_del(&curr->readdir_list);
+ ret = (curr->key.offset == index);
+
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+
+ if (ret)
+ return 1;
+ else
+ continue;
+ }
+ return 0;
+}
+
+/*
+ * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
+ *
+ */
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ struct list_head *ins_list)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_delayed_item *curr, *next;
+ struct btrfs_key location;
+ char *name;
+ int name_len;
+ int over = 0;
+ unsigned char d_type;
+
+ if (list_empty(ins_list))
+ return 0;
+
+ /*
+ * Changing the data of the delayed item is impossible. So
+ * we needn't lock them. And we have held i_mutex of the
+ * directory, nobody can delete any directory indexes now.
+ */
+ list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ list_del(&curr->readdir_list);
+
+ if (curr->key.offset < ctx->pos) {
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+ continue;
+ }
+
+ ctx->pos = curr->key.offset;
+
+ di = (struct btrfs_dir_item *)curr->data;
+ name = (char *)(di + 1);
+ name_len = btrfs_stack_dir_name_len(di);
+
+ d_type = btrfs_filetype_table[di->type];
+ btrfs_disk_key_to_cpu(&location, &di->location);
+
+ over = !dir_emit(ctx, name, name_len,
+ location.objectid, d_type);
+
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+
+ if (over)
+ return 1;
+ }
+ return 0;
+}
+
+static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_inode_item *inode_item,
+ struct inode *inode)
+{
+ btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
+ btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
+ btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
+ btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
+ btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
+ btrfs_set_stack_inode_generation(inode_item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
+ btrfs_set_stack_inode_transid(inode_item, trans->transid);
+ btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
+ btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+ btrfs_set_stack_inode_block_group(inode_item, 0);
+
+ btrfs_set_stack_timespec_sec(&inode_item->atime,
+ inode->i_atime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->atime,
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(&inode_item->mtime,
+ inode->i_mtime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->mtime,
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(&inode_item->ctime,
+ inode->i_ctime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->ctime,
+ inode->i_ctime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec);
+}
+
+int btrfs_fill_inode(struct inode *inode, u32 *rdev)
+{
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_inode_item *inode_item;
+
+ delayed_node = btrfs_get_delayed_node(inode);
+ if (!delayed_node)
+ return -ENOENT;
+
+ mutex_lock(&delayed_node->mutex);
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return -ENOENT;
+ }
+
+ inode_item = &delayed_node->inode_item;
+
+ i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
+ i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
+ btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
+ inode->i_mode = btrfs_stack_inode_mode(inode_item);
+ set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
+ inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
+ BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
+ BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
+
+ inode->i_version = btrfs_stack_inode_sequence(inode_item);
+ inode->i_rdev = 0;
+ *rdev = btrfs_stack_inode_rdev(inode_item);
+ BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+
+ inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+
+ inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
+
+ inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
+
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_stack_timespec_sec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_stack_timespec_nsec(&inode_item->otime);
+
+ inode->i_generation = BTRFS_I(inode)->generation;
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+ int ret = 0;
+
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ goto release_node;
+ }
+
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+ delayed_node);
+ if (ret)
+ goto release_node;
+
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
+ delayed_node->count++;
+ atomic_inc(&root->fs_info->delayed_root->items);
+release_node:
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return ret;
+}
+
+int btrfs_delayed_delete_inode_ref(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ /*
+ * we don't do delayed inode updates during log recovery because it
+ * leads to enospc problems. This means we also can't do
+ * delayed inode refs
+ */
+ if (BTRFS_I(inode)->root->fs_info->log_root_recovering)
+ return -EAGAIN;
+
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ /*
+ * We don't reserve space for inode ref deletion is because:
+ * - We ONLY do async inode ref deletion for the inode who has only
+ * one link(i_nlink == 1), it means there is only one inode ref.
+ * And in most case, the inode ref and the inode item are in the
+ * same leaf, and we will deal with them at the same time.
+ * Since we are sure we will reserve the space for the inode item,
+ * it is unnecessary to reserve space for inode ref deletion.
+ * - If the inode ref and the inode item are not in the same leaf,
+ * We also needn't worry about enospc problem, because we reserve
+ * much more space for the inode update than it needs.
+ * - At the worst, we can steal some space from the global reservation.
+ * It is very rare.
+ */
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
+ goto release_node;
+
+ set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
+ delayed_node->count++;
+ atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items);
+release_node:
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
+static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_root *root = delayed_node->root;
+ struct btrfs_delayed_item *curr_item, *prev_item;
+
+ mutex_lock(&delayed_node->mutex);
+ curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
+ while (curr_item) {
+ btrfs_delayed_item_release_metadata(root, curr_item);
+ prev_item = curr_item;
+ curr_item = __btrfs_next_delayed_item(prev_item);
+ btrfs_release_delayed_item(prev_item);
+ }
+
+ curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
+ while (curr_item) {
+ btrfs_delayed_item_release_metadata(root, curr_item);
+ prev_item = curr_item;
+ curr_item = __btrfs_next_delayed_item(prev_item);
+ btrfs_release_delayed_item(prev_item);
+ }
+
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
+ btrfs_release_delayed_iref(delayed_node);
+
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
+ btrfs_delayed_inode_release_metadata(root, delayed_node);
+ btrfs_release_delayed_inode(delayed_node);
+ }
+ mutex_unlock(&delayed_node->mutex);
+}
+
+void btrfs_kill_delayed_inode_items(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ delayed_node = btrfs_get_delayed_node(inode);
+ if (!delayed_node)
+ return;
+
+ __btrfs_kill_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node);
+}
+
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
+{
+ u64 inode_id = 0;
+ struct btrfs_delayed_node *delayed_nodes[8];
+ int i, n;
+
+ while (1) {
+ spin_lock(&root->inode_lock);
+ n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+ (void **)delayed_nodes, inode_id,
+ ARRAY_SIZE(delayed_nodes));
+ if (!n) {
+ spin_unlock(&root->inode_lock);
+ break;
+ }
+
+ inode_id = delayed_nodes[n - 1]->inode_id + 1;
+
+ for (i = 0; i < n; i++)
+ atomic_inc(&delayed_nodes[i]->refs);
+ spin_unlock(&root->inode_lock);
+
+ for (i = 0; i < n; i++) {
+ __btrfs_kill_delayed_node(delayed_nodes[i]);
+ btrfs_release_delayed_node(delayed_nodes[i]);
+ }
+ }
+}
+
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_node *curr_node, *prev_node;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ curr_node = btrfs_first_delayed_node(delayed_root);
+ while (curr_node) {
+ __btrfs_kill_delayed_node(curr_node);
+
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ btrfs_release_delayed_node(prev_node);
+ }
+}
+
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
new file mode 100644
index 000000000..f70119f25
--- /dev/null
+++ b/fs/btrfs/delayed-inode.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2011 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __DELAYED_TREE_OPERATION_H
+#define __DELAYED_TREE_OPERATION_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/atomic.h>
+
+#include "ctree.h"
+
+/* types of the delayed item */
+#define BTRFS_DELAYED_INSERTION_ITEM 1
+#define BTRFS_DELAYED_DELETION_ITEM 2
+
+struct btrfs_delayed_root {
+ spinlock_t lock;
+ struct list_head node_list;
+ /*
+ * Used for delayed nodes which is waiting to be dealt with by the
+ * worker. If the delayed node is inserted into the work queue, we
+ * drop it from this list.
+ */
+ struct list_head prepare_list;
+ atomic_t items; /* for delayed items */
+ atomic_t items_seq; /* for delayed items */
+ int nodes; /* for delayed nodes */
+ wait_queue_head_t wait;
+};
+
+#define BTRFS_DELAYED_NODE_IN_LIST 0
+#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
+#define BTRFS_DELAYED_NODE_DEL_IREF 2
+
+struct btrfs_delayed_node {
+ u64 inode_id;
+ u64 bytes_reserved;
+ struct btrfs_root *root;
+ /* Used to add the node into the delayed root's node list. */
+ struct list_head n_list;
+ /*
+ * Used to add the node into the prepare list, the nodes in this list
+ * is waiting to be dealt with by the async worker.
+ */
+ struct list_head p_list;
+ struct rb_root ins_root;
+ struct rb_root del_root;
+ struct mutex mutex;
+ struct btrfs_inode_item inode_item;
+ atomic_t refs;
+ u64 index_cnt;
+ unsigned long flags;
+ int count;
+};
+
+struct btrfs_delayed_item {
+ struct rb_node rb_node;
+ struct btrfs_key key;
+ struct list_head tree_list; /* used for batch insert/delete items */
+ struct list_head readdir_list; /* used for readdir items */
+ u64 bytes_reserved;
+ struct btrfs_delayed_node *delayed_node;
+ atomic_t refs;
+ int ins_or_del;
+ u32 data_len;
+ char data[0];
+};
+
+static inline void btrfs_init_delayed_root(
+ struct btrfs_delayed_root *delayed_root)
+{
+ atomic_set(&delayed_root->items, 0);
+ atomic_set(&delayed_root->items_seq, 0);
+ delayed_root->nodes = 0;
+ spin_lock_init(&delayed_root->lock);
+ init_waitqueue_head(&delayed_root->wait);
+ INIT_LIST_HEAD(&delayed_root->node_list);
+ INIT_LIST_HEAD(&delayed_root->prepare_list);
+}
+
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ int name_len, struct inode *dir,
+ struct btrfs_disk_key *disk_key, u8 type,
+ u64 index);
+
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *dir,
+ u64 index);
+
+int btrfs_inode_delayed_dir_index_count(struct inode *inode);
+
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int nr);
+
+void btrfs_balance_delayed_items(struct btrfs_root *root);
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+/* Used for evicting the inode. */
+void btrfs_remove_delayed_node(struct inode *inode);
+void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
+
+
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode);
+int btrfs_fill_inode(struct inode *inode, u32 *rdev);
+int btrfs_delayed_delete_inode_ref(struct inode *inode);
+
+/* Used for drop dead root */
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
+
+/* Used for clean the transaction */
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
+
+/* Used for readdir() */
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_put_delayed_items(struct list_head *ins_list,
+ struct list_head *del_list);
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+ u64 index);
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ struct list_head *ins_list);
+
+/* for init */
+int __init btrfs_delayed_inode_init(void);
+void btrfs_delayed_inode_exit(void);
+
+/* for debugging */
+void btrfs_assert_delayed_root_empty(struct btrfs_root *root);
+
+#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000..8f8ed7d20
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
+/*
+ * delayed back reference update tracking. For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing. This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ */
+
+/*
+ * compare two delayed tree backrefs with same bytenr and type
+ */
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
+ struct btrfs_delayed_tree_ref *ref1, int type)
+{
+ if (type == BTRFS_TREE_BLOCK_REF_KEY) {
+ if (ref1->root < ref2->root)
+ return -1;
+ if (ref1->root > ref2->root)
+ return 1;
+ } else {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * compare two delayed data backrefs with same bytenr and type
+ */
+static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
+ struct btrfs_delayed_data_ref *ref1)
+{
+ if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ if (ref1->root < ref2->root)
+ return -1;
+ if (ref1->root > ref2->root)
+ return 1;
+ if (ref1->objectid < ref2->objectid)
+ return -1;
+ if (ref1->objectid > ref2->objectid)
+ return 1;
+ if (ref1->offset < ref2->offset)
+ return -1;
+ if (ref1->offset > ref2->offset)
+ return 1;
+ } else {
+ if (ref1->parent < ref2->parent)
+ return -1;
+ if (ref1->parent > ref2->parent)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent,
+ * type of the delayed backrefs and content of delayed backrefs.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref2,
+ struct btrfs_delayed_ref_node *ref1,
+ bool compare_seq)
+{
+ if (ref1->bytenr < ref2->bytenr)
+ return -1;
+ if (ref1->bytenr > ref2->bytenr)
+ return 1;
+ if (ref1->is_head && ref2->is_head)
+ return 0;
+ if (ref2->is_head)
+ return -1;
+ if (ref1->is_head)
+ return 1;
+ if (ref1->type < ref2->type)
+ return -1;
+ if (ref1->type > ref2->type)
+ return 1;
+ if (ref1->no_quota > ref2->no_quota)
+ return 1;
+ if (ref1->no_quota < ref2->no_quota)
+ return -1;
+ /* merging of sequenced refs is not allowed */
+ if (compare_seq) {
+ if (ref1->seq < ref2->seq)
+ return -1;
+ if (ref1->seq > ref2->seq)
+ return 1;
+ }
+ if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
+ btrfs_delayed_node_to_tree_ref(ref1),
+ ref1->type);
+ } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
+ return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
+ btrfs_delayed_node_to_data_ref(ref1));
+ }
+ BUG();
+ return 0;
+}
+
+/*
+ * insert a new ref into the rbtree. This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_node *entry;
+ struct btrfs_delayed_ref_node *ins;
+ int cmp;
+
+ ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+ rb_node);
+
+ cmp = comp_entry(entry, ins, 1);
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/* insert a new ref to head ref rbtree */
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_head *entry;
+ struct btrfs_delayed_ref_head *ins;
+ u64 bytenr;
+
+ ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
+ bytenr = ins->node.bytenr;
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
+ href_node);
+
+ if (bytenr < entry->node.bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->node.bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * find an head entry based on bytenr. This returns the delayed ref
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
+ */
+static struct btrfs_delayed_ref_head *
+find_ref_head(struct rb_root *root, u64 bytenr,
+ int return_bigger)
+{
+ struct rb_node *n;
+ struct btrfs_delayed_ref_head *entry;
+
+ n = root->rb_node;
+ entry = NULL;
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+ if (bytenr < entry->node.bytenr)
+ n = n->rb_left;
+ else if (bytenr > entry->node.bytenr)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ if (entry && return_bigger) {
+ if (bytenr > entry->node.bytenr) {
+ n = rb_next(&entry->href_node);
+ if (!n)
+ n = rb_first(root);
+ entry = rb_entry(n, struct btrfs_delayed_ref_head,
+ href_node);
+ return entry;
+ }
+ return entry;
+ }
+ return NULL;
+}
+
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ assert_spin_locked(&delayed_refs->lock);
+ if (mutex_trylock(&head->mutex))
+ return 0;
+
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ mutex_lock(&head->mutex);
+ spin_lock(&delayed_refs->lock);
+ if (!head->node.in_tree) {
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ return -EAGAIN;
+ }
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+}
+
+static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *ref)
+{
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head = btrfs_delayed_node_to_head(ref);
+ rb_erase(&head->href_node, &delayed_refs->href_root);
+ } else {
+ assert_spin_locked(&head->lock);
+ rb_erase(&ref->rb_node, &head->ref_root);
+ }
+ ref->in_tree = 0;
+ btrfs_put_delayed_ref(ref);
+ atomic_dec(&delayed_refs->num_entries);
+ if (trans->delayed_ref_updates)
+ trans->delayed_ref_updates--;
+}
+
+static int merge_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *ref, u64 seq)
+{
+ struct rb_node *node;
+ int mod = 0;
+ int done = 0;
+
+ node = rb_next(&ref->rb_node);
+ while (!done && node) {
+ struct btrfs_delayed_ref_node *next;
+
+ next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
+ if (seq && next->seq >= seq)
+ break;
+ if (comp_entry(ref, next, 0))
+ continue;
+
+ if (ref->action == next->action) {
+ mod = next->ref_mod;
+ } else {
+ if (ref->ref_mod < next->ref_mod) {
+ struct btrfs_delayed_ref_node *tmp;
+
+ tmp = ref;
+ ref = next;
+ next = tmp;
+ done = 1;
+ }
+ mod = -next->ref_mod;
+ }
+
+ drop_delayed_ref(trans, delayed_refs, head, next);
+ ref->ref_mod += mod;
+ if (ref->ref_mod == 0) {
+ drop_delayed_ref(trans, delayed_refs, head, ref);
+ done = 1;
+ } else {
+ /*
+ * You can't have multiples of the same ref on a tree
+ * block.
+ */
+ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+ }
+ return done;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct rb_node *node;
+ u64 seq = 0;
+
+ assert_spin_locked(&head->lock);
+ /*
+ * We don't have too much refs to merge in the case of delayed data
+ * refs.
+ */
+ if (head->is_data)
+ return;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ seq = elem->seq;
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ node = rb_first(&head->ref_root);
+ while (node) {
+ struct btrfs_delayed_ref_node *ref;
+
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ /* We can't merge refs that are outside of our seq count */
+ if (seq && ref->seq >= seq)
+ break;
+ if (merge_ref(trans, delayed_refs, head, ref, seq))
+ node = rb_first(&head->ref_root);
+ else
+ node = rb_next(&ref->rb_node);
+ }
+}
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq)
+{
+ struct seq_list *elem;
+ int ret = 0;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ if (seq >= elem->seq) {
+ pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
+ (u32)(seq >> 32), (u32)seq,
+ (u32)(elem->seq >> 32), (u32)elem->seq,
+ delayed_refs);
+ ret = 1;
+ }
+ }
+
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
+}
+
+struct btrfs_delayed_ref_head *
+btrfs_select_ref_head(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *head;
+ u64 start;
+ bool loop = false;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+again:
+ start = delayed_refs->run_delayed_start;
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
+ if (!head && !loop) {
+ delayed_refs->run_delayed_start = 0;
+ start = 0;
+ loop = true;
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
+ if (!head)
+ return NULL;
+ } else if (!head && loop) {
+ return NULL;
+ }
+
+ while (head->processing) {
+ struct rb_node *node;
+
+ node = rb_next(&head->href_node);
+ if (!node) {
+ if (loop)
+ return NULL;
+ delayed_refs->run_delayed_start = 0;
+ start = 0;
+ loop = true;
+ goto again;
+ }
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ }
+
+ head->processing = 1;
+ WARN_ON(delayed_refs->num_heads_ready == 0);
+ delayed_refs->num_heads_ready--;
+ delayed_refs->run_delayed_start = head->node.bytenr +
+ head->node.num_bytes;
+ return head;
+}
+
+/*
+ * helper function to update an extent delayed ref in the
+ * rbtree. existing and update must both have the same
+ * bytenr and parent
+ *
+ * This may free existing if the update cancels out whatever
+ * operation it was doing.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ if (update->action != existing->action) {
+ /*
+ * this is effectively undoing either an add or a
+ * drop. We decrement the ref_mod, and if it goes
+ * down to zero we just delete the entry without
+ * every changing the extent allocation tree.
+ */
+ existing->ref_mod--;
+ if (existing->ref_mod == 0)
+ drop_delayed_ref(trans, delayed_refs, head, existing);
+ else
+ WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ } else {
+ WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ /*
+ * the action on the existing ref matches
+ * the action on the ref we're trying to add.
+ * Bump the ref_mod by one so the backref that
+ * is eventually added/removed has the correct
+ * reference count
+ */
+ existing->ref_mod += update->ref_mod;
+ }
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ struct btrfs_delayed_ref_head *existing_ref;
+ struct btrfs_delayed_ref_head *ref;
+ int old_ref_mod;
+
+ existing_ref = btrfs_delayed_node_to_head(existing);
+ ref = btrfs_delayed_node_to_head(update);
+ BUG_ON(existing_ref->is_data != ref->is_data);
+
+ spin_lock(&existing_ref->lock);
+ if (ref->must_insert_reserved) {
+ /* if the extent was freed and then
+ * reallocated before the delayed ref
+ * entries were processed, we can end up
+ * with an existing head ref without
+ * the must_insert_reserved flag set.
+ * Set it again here
+ */
+ existing_ref->must_insert_reserved = ref->must_insert_reserved;
+
+ /*
+ * update the num_bytes so we make sure the accounting
+ * is done correctly
+ */
+ existing->num_bytes = update->num_bytes;
+
+ }
+
+ if (ref->extent_op) {
+ if (!existing_ref->extent_op) {
+ existing_ref->extent_op = ref->extent_op;
+ } else {
+ if (ref->extent_op->update_key) {
+ memcpy(&existing_ref->extent_op->key,
+ &ref->extent_op->key,
+ sizeof(ref->extent_op->key));
+ existing_ref->extent_op->update_key = 1;
+ }
+ if (ref->extent_op->update_flags) {
+ existing_ref->extent_op->flags_to_set |=
+ ref->extent_op->flags_to_set;
+ existing_ref->extent_op->update_flags = 1;
+ }
+ btrfs_free_delayed_extent_op(ref->extent_op);
+ }
+ }
+ /*
+ * update the reference mod on the head to reflect this new operation,
+ * only need the lock for this case cause we could be processing it
+ * currently, for refs we just added we know we're a-ok.
+ */
+ old_ref_mod = existing_ref->total_ref_mod;
+ existing->ref_mod += update->ref_mod;
+ existing_ref->total_ref_mod += update->ref_mod;
+
+ /*
+ * If we are going to from a positive ref mod to a negative or vice
+ * versa we need to make sure to adjust pending_csums accordingly.
+ */
+ if (existing_ref->is_data) {
+ if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0)
+ delayed_refs->pending_csums -= existing->num_bytes;
+ if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0)
+ delayed_refs->pending_csums += existing->num_bytes;
+ }
+ spin_unlock(&existing_ref->lock);
+}
+
+/*
+ * helper function to actually insert a head node into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count.
+ */
+static noinline struct btrfs_delayed_ref_head *
+add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *ref, u64 bytenr,
+ u64 num_bytes, int action, int is_data)
+{
+ struct btrfs_delayed_ref_head *existing;
+ struct btrfs_delayed_ref_head *head_ref = NULL;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int count_mod = 1;
+ int must_insert_reserved = 0;
+
+ /*
+ * the head node stores the sum of all the mods, so dropping a ref
+ * should drop the sum in the head node by one.
+ */
+ if (action == BTRFS_UPDATE_DELAYED_HEAD)
+ count_mod = 0;
+ else if (action == BTRFS_DROP_DELAYED_REF)
+ count_mod = -1;
+
+ /*
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update
+ * the reserved accounting when the extent is finally added, or
+ * if a later modification deletes the delayed ref without ever
+ * inserting the extent into the extent allocation tree.
+ * ref->must_insert_reserved is the flag used to record
+ * that accounting mods are required.
+ *
+ * Once we record must_insert_reserved, switch the action to
+ * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+ */
+ if (action == BTRFS_ADD_DELAYED_EXTENT)
+ must_insert_reserved = 1;
+ else
+ must_insert_reserved = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* first set the basic ref node struct up */
+ atomic_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->num_bytes = num_bytes;
+ ref->ref_mod = count_mod;
+ ref->type = 0;
+ ref->action = 0;
+ ref->is_head = 1;
+ ref->in_tree = 1;
+ ref->seq = 0;
+
+ head_ref = btrfs_delayed_node_to_head(ref);
+ head_ref->must_insert_reserved = must_insert_reserved;
+ head_ref->is_data = is_data;
+ head_ref->ref_root = RB_ROOT;
+ head_ref->processing = 0;
+ head_ref->total_ref_mod = count_mod;
+
+ spin_lock_init(&head_ref->lock);
+ mutex_init(&head_ref->mutex);
+
+ trace_add_delayed_ref_head(ref, head_ref, action);
+
+ existing = htree_insert(&delayed_refs->href_root,
+ &head_ref->href_node);
+ if (existing) {
+ update_existing_head_ref(delayed_refs, &existing->node, ref);
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+ head_ref = existing;
+ } else {
+ if (is_data && count_mod < 0)
+ delayed_refs->pending_csums += num_bytes;
+ delayed_refs->num_heads++;
+ delayed_refs->num_heads_ready++;
+ atomic_inc(&delayed_refs->num_entries);
+ trans->delayed_ref_updates++;
+ }
+ return head_ref;
+}
+
+/*
+ * helper to insert a delayed tree ref into the rbtree.
+ */
+static noinline void
+add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_delayed_ref_node *ref, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root, int level,
+ int action, int no_quota)
+{
+ struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_tree_ref *full_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
+
+ if (action == BTRFS_ADD_DELAYED_EXTENT)
+ action = BTRFS_ADD_DELAYED_REF;
+
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* first set the basic ref node struct up */
+ atomic_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->num_bytes = num_bytes;
+ ref->ref_mod = 1;
+ ref->action = action;
+ ref->is_head = 0;
+ ref->in_tree = 1;
+ ref->no_quota = no_quota;
+ ref->seq = seq;
+
+ full_ref = btrfs_delayed_node_to_tree_ref(ref);
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
+ ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ ref->type = BTRFS_TREE_BLOCK_REF_KEY;
+ full_ref->level = level;
+
+ trace_add_delayed_tree_ref(ref, full_ref, action);
+
+ spin_lock(&head_ref->lock);
+ existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
+ if (existing) {
+ update_existing_ref(trans, delayed_refs, head_ref, existing,
+ ref);
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
+ } else {
+ atomic_inc(&delayed_refs->num_entries);
+ trans->delayed_ref_updates++;
+ }
+ spin_unlock(&head_ref->lock);
+}
+
+/*
+ * helper to insert a delayed data ref into the rbtree.
+ */
+static noinline void
+add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_delayed_ref_node *ref, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
+ u64 offset, int action, int no_quota)
+{
+ struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_data_ref *full_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
+
+ if (action == BTRFS_ADD_DELAYED_EXTENT)
+ action = BTRFS_ADD_DELAYED_REF;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
+
+ /* first set the basic ref node struct up */
+ atomic_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->num_bytes = num_bytes;
+ ref->ref_mod = 1;
+ ref->action = action;
+ ref->is_head = 0;
+ ref->in_tree = 1;
+ ref->no_quota = no_quota;
+ ref->seq = seq;
+
+ full_ref = btrfs_delayed_node_to_data_ref(ref);
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
+ ref->type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ ref->type = BTRFS_EXTENT_DATA_REF_KEY;
+
+ full_ref->objectid = owner;
+ full_ref->offset = offset;
+
+ trace_add_delayed_data_ref(ref, full_ref, action);
+
+ spin_lock(&head_ref->lock);
+ existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
+ if (existing) {
+ update_existing_ref(trans, delayed_refs, head_ref, existing,
+ ref);
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
+ } else {
+ atomic_inc(&delayed_refs->num_entries);
+ trans->delayed_ref_updates++;
+ }
+ spin_unlock(&head_ref->lock);
+}
+
+/*
+ * add a delayed tree ref. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 ref_root, int level, int action,
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota)
+{
+ struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
+
+ BUG_ON(extent_op && extent_op->is_data);
+ ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+ if (!head_ref) {
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ return -ENOMEM;
+ }
+
+ head_ref->extent_op = extent_op;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ bytenr, num_bytes, action, 0);
+
+ add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
+ num_bytes, parent, ref_root, level, action,
+ no_quota);
+ spin_unlock(&delayed_refs->lock);
+
+ return 0;
+}
+
+/*
+ * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
+ */
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ u64 parent, u64 ref_root,
+ u64 owner, u64 offset, int action,
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota)
+{
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
+
+ BUG_ON(extent_op && !extent_op->is_data);
+ ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+ if (!head_ref) {
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ return -ENOMEM;
+ }
+
+ head_ref->extent_op = extent_op;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ bytenr, num_bytes, action, 1);
+
+ add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
+ num_bytes, parent, ref_root, owner, offset,
+ action, no_quota);
+ spin_unlock(&delayed_refs->lock);
+
+ return 0;
+}
+
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+ if (!head_ref)
+ return -ENOMEM;
+
+ head_ref->extent_op = extent_op;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ extent_op->is_data);
+
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+/*
+ * this does a simple search for the head node for a given extent.
+ * It must be called with the delayed ref spinlock held, and it returns
+ * the head node if any where found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ return find_ref_head(&delayed_refs->href_root, bytenr, 0);
+}
+
+void btrfs_delayed_ref_exit(void)
+{
+ if (btrfs_delayed_ref_head_cachep)
+ kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
+ if (btrfs_delayed_tree_ref_cachep)
+ kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
+ if (btrfs_delayed_data_ref_cachep)
+ kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+ if (btrfs_delayed_extent_op_cachep)
+ kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
+}
+
+int btrfs_delayed_ref_init(void)
+{
+ btrfs_delayed_ref_head_cachep = kmem_cache_create(
+ "btrfs_delayed_ref_head",
+ sizeof(struct btrfs_delayed_ref_head), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_ref_head_cachep)
+ goto fail;
+
+ btrfs_delayed_tree_ref_cachep = kmem_cache_create(
+ "btrfs_delayed_tree_ref",
+ sizeof(struct btrfs_delayed_tree_ref), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_tree_ref_cachep)
+ goto fail;
+
+ btrfs_delayed_data_ref_cachep = kmem_cache_create(
+ "btrfs_delayed_data_ref",
+ sizeof(struct btrfs_delayed_data_ref), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_data_ref_cachep)
+ goto fail;
+
+ btrfs_delayed_extent_op_cachep = kmem_cache_create(
+ "btrfs_delayed_extent_op",
+ sizeof(struct btrfs_delayed_extent_op), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_delayed_extent_op_cachep)
+ goto fail;
+
+ return 0;
+fail:
+ btrfs_delayed_ref_exit();
+ return -ENOMEM;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000..5eb089239
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+
+/* these are the possible values of struct btrfs_delayed_ref_node->action */
+#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+
+struct btrfs_delayed_ref_node {
+ struct rb_node rb_node;
+
+ /* the starting bytenr of the extent */
+ u64 bytenr;
+
+ /* the size of the extent */
+ u64 num_bytes;
+
+ /* seq number to keep track of insertion order */
+ u64 seq;
+
+ /* ref count on this data structure */
+ atomic_t refs;
+
+ /*
+ * how many refs is this entry adding or deleting. For
+ * head refs, this may be a negative number because it is keeping
+ * track of the total mods done to the reference count.
+ * For individual refs, this will always be a positive number
+ *
+ * It may be more than one, since it is possible for a single
+ * parent to have more than one ref on an extent
+ */
+ int ref_mod;
+
+ unsigned int action:8;
+ unsigned int type:8;
+ unsigned int no_quota:1;
+ /* is this node still in the rbtree? */
+ unsigned int is_head:1;
+ unsigned int in_tree:1;
+};
+
+struct btrfs_delayed_extent_op {
+ struct btrfs_disk_key key;
+ u64 flags_to_set;
+ int level;
+ unsigned int update_key:1;
+ unsigned int update_flags:1;
+ unsigned int is_data:1;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent. They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+ struct btrfs_delayed_ref_node node;
+
+ /*
+ * the mutex is held while running the refs, and it is also
+ * held when checking the sum of reference modifications.
+ */
+ struct mutex mutex;
+
+ spinlock_t lock;
+ struct rb_root ref_root;
+
+ struct rb_node href_node;
+
+ struct btrfs_delayed_extent_op *extent_op;
+
+ /*
+ * This is used to track the final ref_mod from all the refs associated
+ * with this head ref, this is not adjusted as delayed refs are run,
+ * this is meant to track if we need to do the csum accounting or not.
+ */
+ int total_ref_mod;
+
+ /*
+ * when a new extent is allocated, it is just reserved in memory
+ * The actual extent isn't inserted into the extent allocation tree
+ * until the delayed ref is processed. must_insert_reserved is
+ * used to flag a delayed ref so the accounting can be updated
+ * when a full insert is done.
+ *
+ * It is possible the extent will be freed before it is ever
+ * inserted into the extent allocation tree. In this case
+ * we need to update the in ram accounting to properly reflect
+ * the free has happened.
+ */
+ unsigned int must_insert_reserved:1;
+ unsigned int is_data:1;
+ unsigned int processing:1;
+};
+
+struct btrfs_delayed_tree_ref {
+ struct btrfs_delayed_ref_node node;
+ u64 root;
+ u64 parent;
+ int level;
+};
+
+struct btrfs_delayed_data_ref {
+ struct btrfs_delayed_ref_node node;
+ u64 root;
+ u64 parent;
+ u64 objectid;
+ u64 offset;
+};
+
+struct btrfs_delayed_ref_root {
+ /* head ref rbtree */
+ struct rb_root href_root;
+
+ /* this spin lock protects the rbtree and the entries inside */
+ spinlock_t lock;
+
+ /* how many delayed ref updates we've queued, used by the
+ * throttling code
+ */
+ atomic_t num_entries;
+
+ /* total number of head nodes in tree */
+ unsigned long num_heads;
+
+ /* total number of head nodes ready for processing */
+ unsigned long num_heads_ready;
+
+ u64 pending_csums;
+
+ /*
+ * set when the tree is flushing before a transaction commit,
+ * used by the throttling code to decide if new updates need
+ * to be run right away
+ */
+ int flushing;
+
+ u64 run_delayed_start;
+};
+
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int btrfs_delayed_ref_init(void);
+void btrfs_delayed_ref_exit(void);
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+ return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+ if (op)
+ kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+ WARN_ON(atomic_read(&ref->refs) == 0);
+ if (atomic_dec_and_test(&ref->refs)) {
+ WARN_ON(ref->in_tree);
+ switch (ref->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ case BTRFS_SHARED_DATA_REF_KEY:
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ break;
+ case 0:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+ break;
+ default:
+ BUG();
+ }
+ }
+}
+
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 ref_root, int level, int action,
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ u64 parent, u64 ref_root,
+ u64 owner, u64 offset, int action,
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_extent_op *extent_op);
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
+
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+ mutex_unlock(&head->mutex);
+}
+
+
+struct btrfs_delayed_ref_head *
+btrfs_select_ref_head(struct btrfs_trans_handle *trans);
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq);
+
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ */
+static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+ return node->is_head;
+}
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_tree_ref *
+btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_tree_ref, node);
+}
+
+static inline struct btrfs_delayed_data_ref *
+btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_data_ref, node);
+}
+
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(!btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_ref_head, node);
+}
+#endif
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000..0573848c7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,937 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <asm/div64.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "sysfs.h"
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret);
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev);
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+ char *srcdev_name,
+ struct btrfs_device **device);
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
+static int btrfs_dev_replace_kthread(void *data);
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
+
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct extent_buffer *eb;
+ int slot;
+ int ret = 0;
+ struct btrfs_path *path = NULL;
+ int item_size;
+ struct btrfs_dev_replace_item *ptr;
+ u64 src_devid;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+ if (ret) {
+no_valid_dev_replace_entry_found:
+ ret = 0;
+ dev_replace->replace_state =
+ BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
+ dev_replace->cont_reading_from_srcdev_mode =
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
+ dev_replace->replace_state = 0;
+ dev_replace->time_started = 0;
+ dev_replace->time_stopped = 0;
+ atomic64_set(&dev_replace->num_write_errors, 0);
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ dev_replace->is_valid = 0;
+ dev_replace->item_needs_writeback = 0;
+ goto out;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+ if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+ btrfs_warn(fs_info,
+ "dev_replace entry found has unexpected size, ignore entry");
+ goto no_valid_dev_replace_entry_found;
+ }
+
+ src_devid = btrfs_dev_replace_src_devid(eb, ptr);
+ dev_replace->cont_reading_from_srcdev_mode =
+ btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
+ dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
+ dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
+ dev_replace->time_stopped =
+ btrfs_dev_replace_time_stopped(eb, ptr);
+ atomic64_set(&dev_replace->num_write_errors,
+ btrfs_dev_replace_num_write_errors(eb, ptr));
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors,
+ btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
+ dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
+ dev_replace->committed_cursor_left = dev_replace->cursor_left;
+ dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
+ dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
+ dev_replace->is_valid = 1;
+
+ dev_replace->item_needs_writeback = 0;
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
+ NULL, NULL);
+ dev_replace->tgtdev = btrfs_find_device(fs_info,
+ BTRFS_DEV_REPLACE_DEVID,
+ NULL, NULL);
+ /*
+ * allow 'btrfs dev replace_cancel' if src/tgt device is
+ * missing
+ */
+ if (!dev_replace->srcdev &&
+ !btrfs_test_opt(dev_root, DEGRADED)) {
+ ret = -EIO;
+ btrfs_warn(fs_info,
+ "cannot mount because device replace operation is ongoing and");
+ btrfs_warn(fs_info,
+ "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
+ src_devid);
+ }
+ if (!dev_replace->tgtdev &&
+ !btrfs_test_opt(dev_root, DEGRADED)) {
+ ret = -EIO;
+ btrfs_warn(fs_info,
+ "cannot mount because device replace operation is ongoing and");
+ btrfs_warn(fs_info,
+ "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
+ BTRFS_DEV_REPLACE_DEVID);
+ }
+ if (dev_replace->tgtdev) {
+ if (dev_replace->srcdev) {
+ dev_replace->tgtdev->total_bytes =
+ dev_replace->srcdev->total_bytes;
+ dev_replace->tgtdev->disk_total_bytes =
+ dev_replace->srcdev->disk_total_bytes;
+ dev_replace->tgtdev->commit_total_bytes =
+ dev_replace->srcdev->commit_total_bytes;
+ dev_replace->tgtdev->bytes_used =
+ dev_replace->srcdev->bytes_used;
+ dev_replace->tgtdev->commit_bytes_used =
+ dev_replace->srcdev->commit_bytes_used;
+ }
+ dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+ btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
+ dev_replace->tgtdev);
+ }
+ break;
+ }
+
+out:
+ if (path)
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes changed device replace state to
+ * disk.
+ */
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_replace_item *ptr;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ if (!dev_replace->is_valid ||
+ !dev_replace->item_needs_writeback) {
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_REPLACE_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "error %d while searching for dev_replace item!",
+ ret);
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /*
+ * need to delete old one and insert a new one.
+ * Since no attempt is made to recover any old state, if the
+ * dev_replace state is 'running', the data on the target
+ * drive is lost.
+ * It would be possible to recover the state: just make sure
+ * that the beginning of the item is never changed and always
+ * contains all the essential information. Then read this
+ * minimal set of information and use it as a base for the
+ * new state.
+ */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ btrfs_warn(fs_info, "delete too small dev_replace item failed %d!",
+ ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ btrfs_warn(fs_info, "insert dev_replace item failed %d!",
+ ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_dev_replace_item);
+
+ btrfs_dev_replace_lock(dev_replace);
+ if (dev_replace->srcdev)
+ btrfs_set_dev_replace_src_devid(eb, ptr,
+ dev_replace->srcdev->devid);
+ else
+ btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
+ btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
+ dev_replace->cont_reading_from_srcdev_mode);
+ btrfs_set_dev_replace_replace_state(eb, ptr,
+ dev_replace->replace_state);
+ btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
+ btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
+ btrfs_set_dev_replace_num_write_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_write_errors));
+ btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors));
+ dev_replace->cursor_left_last_write_of_item =
+ dev_replace->cursor_left;
+ btrfs_set_dev_replace_cursor_left(eb, ptr,
+ dev_replace->cursor_left_last_write_of_item);
+ btrfs_set_dev_replace_cursor_right(eb, ptr,
+ dev_replace->cursor_right);
+ dev_replace->item_needs_writeback = 0;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ dev_replace->committed_cursor_left =
+ dev_replace->cursor_left_last_write_of_item;
+}
+
+int btrfs_dev_replace_start(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int ret;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_device *src_device = NULL;
+
+ switch (args->start.cont_reading_from_srcdev_mode) {
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+ case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+ args->start.tgtdev_name[0] == '\0')
+ return -EINVAL;
+
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
+ }
+
+ /* the disk copy procedure reuses the scrub code */
+ mutex_lock(&fs_info->volume_mutex);
+ ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
+ args->start.srcdev_name,
+ &src_device);
+ if (ret) {
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
+ }
+
+ ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ src_device, &tgt_device);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (ret)
+ return ret;
+
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+ goto leave;
+ }
+
+ dev_replace->cont_reading_from_srcdev_mode =
+ args->start.cont_reading_from_srcdev_mode;
+ WARN_ON(!src_device);
+ dev_replace->srcdev = src_device;
+ WARN_ON(!tgt_device);
+ dev_replace->tgtdev = tgt_device;
+
+ printk_in_rcu(KERN_INFO
+ "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+
+ /*
+ * from now on, the writes to the srcdev are all duplicated to
+ * go to the tgtdev as well (refer to btrfs_map_block()).
+ */
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ dev_replace->time_started = get_seconds();
+ dev_replace->cursor_left = 0;
+ dev_replace->committed_cursor_left = 0;
+ dev_replace->cursor_left_last_write_of_item = 0;
+ dev_replace->cursor_right = 0;
+ dev_replace->is_valid = 1;
+ dev_replace->item_needs_writeback = 1;
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_wait_ordered_roots(root->fs_info, -1);
+
+ /* force writing the updated state information to disk */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_dev_replace_lock(dev_replace);
+ goto leave;
+ }
+
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+
+ /* the disk copy procedure reuses the scrub code */
+ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
+ btrfs_device_get_total_bytes(src_device),
+ &dev_replace->scrub_progress, 0, 1);
+
+ ret = btrfs_dev_replace_finishing(root->fs_info, ret);
+ /* don't warn if EINPROGRESS, someone else might be running scrub */
+ if (ret == -EINPROGRESS) {
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+ ret = 0;
+ } else {
+ WARN_ON(ret);
+ }
+
+ return ret;
+
+leave:
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ btrfs_dev_replace_unlock(dev_replace);
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ return ret;
+}
+
+/*
+ * blocked until all flighting bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+ set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ wait_event(fs_info->replace_wait, !percpu_counter_sum(
+ &fs_info->bio_counter));
+}
+
+/*
+ * we have removed target device, it is safe to allow new bios request.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+ int scrub_ret)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device;
+ struct btrfs_device *src_device;
+ struct btrfs_root *root = fs_info->tree_root;
+ u8 uuid_tmp[BTRFS_UUID_SIZE];
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ /* don't allow cancel or unmount to disturb the finishing procedure */
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+
+ btrfs_dev_replace_lock(dev_replace);
+ /* was the operation canceled, or is it finished? */
+ if (dev_replace->replace_state !=
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return 0;
+ }
+
+ tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
+ btrfs_dev_replace_unlock(dev_replace);
+
+ /*
+ * flush all outstanding I/O and inode extent mappings before the
+ * copy operation is declared as being finished
+ */
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
+ if (ret) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return ret;
+ }
+ btrfs_wait_ordered_roots(root->fs_info, -1);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+
+ mutex_lock(&uuid_mutex);
+ /* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&root->fs_info->chunk_mutex);
+ btrfs_dev_replace_lock(dev_replace);
+ dev_replace->replace_state =
+ scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+ : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ dev_replace->time_stopped = get_seconds();
+ dev_replace->item_needs_writeback = 1;
+
+ /* replace old device with new one in mapping tree */
+ if (!scrub_ret) {
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+ } else {
+ printk_in_rcu(KERN_ERR
+ "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name), scrub_ret);
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return scrub_ret;
+ }
+
+ printk_in_rcu(KERN_INFO
+ "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
+ tgt_device->is_tgtdev_for_dev_replace = 0;
+ tgt_device->devid = src_device->devid;
+ src_device->devid = BTRFS_DEV_REPLACE_DEVID;
+ memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
+ memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
+ memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
+ btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
+ btrfs_device_set_disk_total_bytes(tgt_device,
+ src_device->disk_total_bytes);
+ btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
+ ASSERT(list_empty(&src_device->resized_list));
+ tgt_device->commit_total_bytes = src_device->commit_total_bytes;
+ tgt_device->commit_bytes_used = src_device->bytes_used;
+ if (fs_info->sb->s_bdev == src_device->bdev)
+ fs_info->sb->s_bdev = tgt_device->bdev;
+ if (fs_info->fs_devices->latest_bdev == src_device->bdev)
+ fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+ list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ fs_info->fs_devices->rw_devices++;
+
+ btrfs_dev_replace_unlock(dev_replace);
+
+ btrfs_rm_dev_replace_blocked(fs_info);
+
+ btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
+
+ btrfs_rm_dev_replace_unblocked(fs_info);
+
+ /*
+ * this is again a consistent state where no dev_replace procedure
+ * is running, the target device is part of the filesystem, the
+ * source device is not part of the filesystem anymore and its 1st
+ * superblock is scratched out so that it is no longer marked to
+ * belong to this filesystem.
+ */
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
+
+ /* replace the sysfs entry */
+ btrfs_kobj_rm_device(fs_info, src_device);
+ btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+
+ /* write back the superblocks */
+ trans = btrfs_start_transaction(root, 0);
+ if (!IS_ERR(trans))
+ btrfs_commit_transaction(trans, root);
+
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+ return 0;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+ char *srcdev_name,
+ struct btrfs_device **device)
+{
+ int ret;
+
+ if (srcdevid) {
+ ret = 0;
+ *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
+ NULL);
+ if (!*device)
+ ret = -ENOENT;
+ } else {
+ ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
+ device);
+ }
+ return ret;
+}
+
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *srcdev;
+
+ btrfs_dev_replace_lock(dev_replace);
+ /* even if !dev_replace_is_valid, the values are good enough for
+ * the replace_status ioctl */
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ args->status.replace_state = dev_replace->replace_state;
+ args->status.time_started = dev_replace->time_started;
+ args->status.time_stopped = dev_replace->time_stopped;
+ args->status.num_write_errors =
+ atomic64_read(&dev_replace->num_write_errors);
+ args->status.num_uncorrectable_read_errors =
+ atomic64_read(&dev_replace->num_uncorrectable_read_errors);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ args->status.progress_1000 = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ args->status.progress_1000 = 1000;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ srcdev = dev_replace->srcdev;
+ args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+ div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
+ break;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+}
+
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args)
+{
+ args->result = __btrfs_dev_replace_cancel(fs_info);
+ return 0;
+}
+
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *tgt_device = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = fs_info->tree_root;
+ u64 result;
+ int ret;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+ btrfs_dev_replace_unlock(dev_replace);
+ goto leave;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+ tgt_device = dev_replace->tgtdev;
+ dev_replace->tgtdev = NULL;
+ dev_replace->srcdev = NULL;
+ break;
+ }
+ dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+ dev_replace->time_stopped = get_seconds();
+ dev_replace->item_needs_writeback = 1;
+ btrfs_dev_replace_unlock(dev_replace);
+ btrfs_scrub_cancel(fs_info);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ WARN_ON(ret);
+ if (tgt_device)
+ btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+
+leave:
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return result;
+}
+
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+ dev_replace->time_stopped = get_seconds();
+ dev_replace->item_needs_writeback = 1;
+ btrfs_info(fs_info, "suspending dev_replace for unmount");
+ break;
+ }
+
+ btrfs_dev_replace_unlock(dev_replace);
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+}
+
+/* resume dev_replace procedure that was interrupted by unmount */
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ btrfs_dev_replace_lock(dev_replace);
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+ break;
+ }
+ if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
+ btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
+ btrfs_info(fs_info,
+ "you may cancel the operation after 'mount -o degraded'");
+ btrfs_dev_replace_unlock(dev_replace);
+ return 0;
+ }
+ btrfs_dev_replace_unlock(dev_replace);
+
+ WARN_ON(atomic_xchg(
+ &fs_info->mutually_exclusive_operation_running, 1));
+ task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
+ return PTR_ERR_OR_ZERO(task);
+}
+
+static int btrfs_dev_replace_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_ioctl_dev_replace_args *status_args;
+ u64 progress;
+
+ status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
+ if (status_args) {
+ btrfs_dev_replace_status(fs_info, status_args);
+ progress = status_args->status.progress_1000;
+ kfree(status_args);
+ progress = div_u64(progress, 10);
+ printk_in_rcu(KERN_INFO
+ "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+ dev_replace->srcdev->missing ? "<missing disk>" :
+ rcu_str_deref(dev_replace->srcdev->name),
+ dev_replace->srcdev->devid,
+ dev_replace->tgtdev ?
+ rcu_str_deref(dev_replace->tgtdev->name) :
+ "<missing target disk>",
+ (unsigned int)progress);
+ }
+ btrfs_dev_replace_continue_on_mount(fs_info);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+
+ return 0;
+}
+
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int ret;
+
+ ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
+ dev_replace->committed_cursor_left,
+ btrfs_device_get_total_bytes(dev_replace->srcdev),
+ &dev_replace->scrub_progress, 0, 1);
+ ret = btrfs_dev_replace_finishing(fs_info, ret);
+ WARN_ON(ret);
+ return 0;
+}
+
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+{
+ if (!dev_replace->is_valid)
+ return 0;
+
+ switch (dev_replace->replace_state) {
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+ return 0;
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+ case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ /*
+ * return true even if tgtdev is missing (this is
+ * something that can happen if the dev_replace
+ * procedure is suspended by an umount and then
+ * the tgtdev is missing (or "btrfs dev scan") was
+ * not called and the the filesystem is remounted
+ * in degraded state. This does not stop the
+ * dev_replace procedure. It needs to be canceled
+ * manually if the cancelation is wanted.
+ */
+ break;
+ }
+ return 1;
+}
+
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+{
+ /* the beginning is just an optimization for the typical case */
+ if (atomic_read(&dev_replace->nesting_level) == 0) {
+acquire_lock:
+ /* this is not a nested case where the same thread
+ * is trying to acqurire the same lock twice */
+ mutex_lock(&dev_replace->lock);
+ mutex_lock(&dev_replace->lock_management_lock);
+ dev_replace->lock_owner = current->pid;
+ atomic_inc(&dev_replace->nesting_level);
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
+ mutex_lock(&dev_replace->lock_management_lock);
+ if (atomic_read(&dev_replace->nesting_level) > 0 &&
+ dev_replace->lock_owner == current->pid) {
+ WARN_ON(!mutex_is_locked(&dev_replace->lock));
+ atomic_inc(&dev_replace->nesting_level);
+ mutex_unlock(&dev_replace->lock_management_lock);
+ return;
+ }
+
+ mutex_unlock(&dev_replace->lock_management_lock);
+ goto acquire_lock;
+}
+
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+{
+ WARN_ON(!mutex_is_locked(&dev_replace->lock));
+ mutex_lock(&dev_replace->lock_management_lock);
+ WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
+ WARN_ON(dev_replace->lock_owner != current->pid);
+ atomic_dec(&dev_replace->nesting_level);
+ if (atomic_read(&dev_replace->nesting_level) == 0) {
+ dev_replace->lock_owner = 0;
+ mutex_unlock(&dev_replace->lock_management_lock);
+ mutex_unlock(&dev_replace->lock);
+ } else {
+ mutex_unlock(&dev_replace->lock_management_lock);
+ }
+}
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
+{
+ percpu_counter_sub(&fs_info->bio_counter, amount);
+
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+ while (1) {
+ percpu_counter_inc(&fs_info->bio_counter);
+ if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state)))
+ break;
+
+ btrfs_bio_counter_dec(fs_info);
+ wait_event(fs_info->replace_wait,
+ !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state));
+ }
+}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000..20035cbbf
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) STRATO AG 2012. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__)
+#define __BTRFS_DEV_REPLACE__
+
+struct btrfs_ioctl_dev_replace_args;
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_start(struct btrfs_root *root,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{
+ atomic64_inc(stat_value);
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000..1752625fb
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+
+/*
+ * insert a name into a directory, doing overflow properly if there is a hash
+ * collision. data_size indicates how big the item inserted should be. On
+ * success a struct btrfs_dir_item pointer is returned, otherwise it is
+ * an ERR_PTR.
+ *
+ * The name is not copied into the dir item, you have to do that yourself.
+ */
+static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
+ *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key,
+ u32 data_size,
+ const char *name,
+ int name_len)
+{
+ int ret;
+ char *ptr;
+ struct btrfs_item *item;
+ struct extent_buffer *leaf;
+
+ ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+ if (ret == -EEXIST) {
+ struct btrfs_dir_item *di;
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di)
+ return ERR_PTR(-EEXIST);
+ btrfs_extend_item(root, path, data_size);
+ } else if (ret < 0)
+ return ERR_PTR(ret);
+ WARN_ON(ret > 0);
+ leaf = path->nodes[0];
+ item = btrfs_item_nr(path->slots[0]);
+ ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+ BUG_ON(data_size > btrfs_item_size(leaf, item));
+ ptr += btrfs_item_size(leaf, item) - data_size;
+ return (struct btrfs_dir_item *)ptr;
+}
+
+/*
+ * xattrs work a lot like directories, this inserts an xattr item
+ * into the tree
+ */
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name, u16 name_len,
+ const void *data, u16 data_len)
+{
+ int ret = 0;
+ struct btrfs_dir_item *dir_item;
+ unsigned long name_ptr, data_ptr;
+ struct btrfs_key key, location;
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *leaf;
+ u32 data_size;
+
+ BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
+
+ key.objectid = objectid;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = btrfs_name_hash(name, name_len);
+
+ data_size = sizeof(*dir_item) + name_len + data_len;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name, name_len);
+ if (IS_ERR(dir_item))
+ return PTR_ERR(dir_item);
+ memset(&location, 0, sizeof(location));
+
+ leaf = path->nodes[0];
+ btrfs_cpu_key_to_disk(&disk_key, &location);
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
+ btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ btrfs_set_dir_data_len(leaf, dir_item, data_len);
+ name_ptr = (unsigned long)(dir_item + 1);
+ data_ptr = (unsigned long)((char *)name_ptr + name_len);
+
+ write_extent_buffer(leaf, name, name_ptr, name_len);
+ write_extent_buffer(leaf, data, data_ptr, data_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ return ret;
+}
+
+/*
+ * insert a directory item in the tree, doing all the magic for
+ * both indexes. 'dir' indicates which objectid to insert it into,
+ * 'location' is the key to stuff into the directory item, 'type' is the
+ * type of the inode we're pointing to, and 'index' is the sequence number
+ * to use for the second index (if one is created).
+ * Will return 0 or -ENOMEM
+ */
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, const char *name, int name_len,
+ struct inode *dir, struct btrfs_key *location,
+ u8 type, u64 index)
+{
+ int ret = 0;
+ int ret2 = 0;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *dir_item;
+ struct extent_buffer *leaf;
+ unsigned long name_ptr;
+ struct btrfs_key key;
+ struct btrfs_disk_key disk_key;
+ u32 data_size;
+
+ key.objectid = btrfs_ino(dir);
+ key.type = BTRFS_DIR_ITEM_KEY;
+ key.offset = btrfs_name_hash(name, name_len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ btrfs_cpu_key_to_disk(&disk_key, location);
+
+ data_size = sizeof(*dir_item) + name_len;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name, name_len);
+ if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ if (ret == -EEXIST)
+ goto second_insert;
+ goto out_free;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, type);
+ btrfs_set_dir_data_len(leaf, dir_item, 0);
+ btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ name_ptr = (unsigned long)(dir_item + 1);
+
+ write_extent_buffer(leaf, name, name_ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+
+second_insert:
+ /* FIXME, use some real flag for selecting the extra index */
+ if (root == root->fs_info->tree_root) {
+ ret = 0;
+ goto out_free;
+ }
+ btrfs_release_path(path);
+
+ ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir,
+ &disk_key, type, index);
+out_free:
+ btrfs_free_path(path);
+ if (ret)
+ return ret;
+ if (ret2)
+ return ret2;
+ return 0;
+}
+
+/*
+ * lookup a directory item based on name. 'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ */
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, int name_len,
+ int mod)
+{
+ int ret;
+ struct btrfs_key key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_ITEM_KEY;
+
+ key.offset = btrfs_name_hash(name, name_len);
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+ const char *name, int name_len)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ int data_size;
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_path *path;
+
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_ITEM_KEY;
+ key.offset = btrfs_name_hash(name, name_len);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+ /* return back any errors */
+ if (ret < 0)
+ goto out;
+
+ /* nothing found, we're safe */
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+
+ /* we found an item, look for our name in the item */
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di) {
+ /* our exact name was found */
+ ret = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * see if there is room in the item to insert this
+ * name
+ */
+ data_size = sizeof(*di) + name_len;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (data_size + btrfs_item_size_nr(leaf, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
+ ret = -EOVERFLOW;
+ } else {
+ /* plenty of insertion room */
+ ret = 0;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * lookup a directory item based on index. 'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ *
+ * The name is used to make sure the index really points to the name you were
+ * looking for.
+ */
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 objectid, const char *name, int name_len,
+ int mod)
+{
+ int ret;
+ struct btrfs_key key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = objectid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u32 nritems;
+ int ret;
+
+ key.objectid = dirid;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ while (1) {
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+ break;
+
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di)
+ return di;
+
+ path->slots[0]++;
+ }
+ return NULL;
+}
+
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod)
+{
+ int ret;
+ struct btrfs_key key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ key.objectid = dir;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = btrfs_name_hash(name, name_len);
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+/*
+ * helper function to look at the directory item pointed to by 'path'
+ * this walks through all the entries in a dir item and finds one
+ * for a specific name.
+ */
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len)
+{
+ struct btrfs_dir_item *dir_item;
+ unsigned long name_ptr;
+ u32 total_len;
+ u32 cur = 0;
+ u32 this_len;
+ struct extent_buffer *leaf;
+
+ leaf = path->nodes[0];
+ dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+ if (verify_dir_item(root, leaf, dir_item))
+ return NULL;
+
+ total_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ while (cur < total_len) {
+ this_len = sizeof(*dir_item) +
+ btrfs_dir_name_len(leaf, dir_item) +
+ btrfs_dir_data_len(leaf, dir_item);
+ name_ptr = (unsigned long)(dir_item + 1);
+
+ if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
+ memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
+ return dir_item;
+
+ cur += this_len;
+ dir_item = (struct btrfs_dir_item *)((char *)dir_item +
+ this_len);
+ }
+ return NULL;
+}
+
+/*
+ * given a pointer into a directory item, delete it. This
+ * handles items that have more than one entry in them.
+ */
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di)
+{
+
+ struct extent_buffer *leaf;
+ u32 sub_item_len;
+ u32 item_len;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
+ btrfs_dir_data_len(leaf, di);
+ item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (sub_item_len == item_len) {
+ ret = btrfs_del_item(trans, root, path);
+ } else {
+ /* MARKER */
+ unsigned long ptr = (unsigned long)di;
+ unsigned long start;
+
+ start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+ item_len - (ptr + sub_item_len - start));
+ btrfs_truncate_item(root, path, item_len - sub_item_len, 1);
+ }
+ return ret;
+}
+
+int verify_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_dir_item *dir_item)
+{
+ u16 namelen = BTRFS_NAME_LEN;
+ u8 type = btrfs_dir_type(leaf, dir_item);
+
+ if (type >= BTRFS_FT_MAX) {
+ btrfs_crit(root->fs_info, "invalid dir item type: %d",
+ (int)type);
+ return 1;
+ }
+
+ if (type == BTRFS_FT_XATTR)
+ namelen = XATTR_NAME_MAX;
+
+ if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
+ btrfs_crit(root->fs_info, "invalid dir item name len: %u",
+ (unsigned)btrfs_dir_data_len(leaf, dir_item));
+ return 1;
+ }
+
+ /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
+ if ((btrfs_dir_data_len(leaf, dir_item) +
+ btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
+ btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u",
+ (unsigned)btrfs_dir_name_len(leaf, dir_item),
+ (unsigned)btrfs_dir_data_len(leaf, dir_item));
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000..2ef9a4b72
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,4338 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/radix-tree.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/slab.h>
+#include <linux/migrate.h>
+#include <linux/ratelimit.h>
+#include <linux/uuid.h>
+#include <linux/semaphore.h>
+#include <asm/unaligned.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "tree-log.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "raid56.h"
+#include "sysfs.h"
+#include "qgroup.h"
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+
+static const struct extent_io_ops btree_extent_io_ops;
+static void end_workqueue_fn(struct btrfs_work *work);
+static void free_fs_root(struct btrfs_root *root);
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only);
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
+static void btrfs_error_commit_super(struct btrfs_root *root);
+
+/*
+ * btrfs_end_io_wq structs are used to do processing in task context when an IO
+ * is complete. This is used during reads to verify checksums, and it is used
+ * by writes to insert metadata for new file extents after IO is complete.
+ */
+struct btrfs_end_io_wq {
+ struct bio *bio;
+ bio_end_io_t *end_io;
+ void *private;
+ struct btrfs_fs_info *info;
+ int error;
+ enum btrfs_wq_endio_type metadata;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+static struct kmem_cache *btrfs_end_io_wq_cache;
+
+int __init btrfs_end_io_wq_init(void)
+{
+ btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
+ sizeof(struct btrfs_end_io_wq),
+ 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_end_io_wq_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_end_io_wq_exit(void)
+{
+ if (btrfs_end_io_wq_cache)
+ kmem_cache_destroy(btrfs_end_io_wq_cache);
+}
+
+/*
+ * async submit bios are used to offload expensive checksumming
+ * onto the worker threads. They checksum file and metadata bios
+ * just before they are sent down the IO stack.
+ */
+struct async_submit_bio {
+ struct inode *inode;
+ struct bio *bio;
+ struct list_head list;
+ extent_submit_bio_hook_t *submit_bio_start;
+ extent_submit_bio_hook_t *submit_bio_done;
+ int rw;
+ int mirror_num;
+ unsigned long bio_flags;
+ /*
+ * bio_offset is optional, can be used if the pages in the bio
+ * can't tell us where in the file the bio should go
+ */
+ u64 bio_offset;
+ struct btrfs_work work;
+ int error;
+};
+
+/*
+ * Lockdep class keys for extent_buffer->lock's in this root. For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
+ *
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets. As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->objectid. This ensures that all special purpose roots
+ * have separate keysets.
+ *
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock. As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
+ *
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked. It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
+ *
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+# error
+# endif
+
+static struct btrfs_lockdep_keyset {
+ u64 id; /* root objectid */
+ const char *name_stem; /* lock name stem */
+ char names[BTRFS_MAX_LEVEL + 1][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
+} btrfs_lockdep_keysets[] = {
+ { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
+ { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
+ { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
+ { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
+ { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
+ { .id = 0, .name_stem = "tree" },
+};
+
+void __init btrfs_init_lockdep(void)
+{
+ int i, j;
+
+ /* initialize lockdep class names */
+ for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
+ struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
+
+ for (j = 0; j < ARRAY_SIZE(ks->names); j++)
+ snprintf(ks->names[j], sizeof(ks->names[j]),
+ "btrfs-%s-%02d", ks->name_stem, j);
+ }
+}
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
+ int level)
+{
+ struct btrfs_lockdep_keyset *ks;
+
+ BUG_ON(level >= ARRAY_SIZE(ks->keys));
+
+ /* find the matching keyset, id 0 is the default entry */
+ for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+ if (ks->id == objectid)
+ break;
+
+ lockdep_set_class_and_name(&eb->lock,
+ &ks->keys[level], ks->names[level]);
+}
+
+#endif
+
+/*
+ * extents on the btree inode are pretty simple, there's one extent
+ * that covers the entire device
+ */
+static struct extent_map *btree_get_extent(struct inode *inode,
+ struct page *page, size_t pg_offset, u64 start, u64 len,
+ int create)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em) {
+ em->bdev =
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ read_unlock(&em_tree->lock);
+ goto out;
+ }
+ read_unlock(&em_tree->lock);
+
+ em = alloc_extent_map();
+ if (!em) {
+ em = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ em->start = 0;
+ em->len = (u64)-1;
+ em->block_len = (u64)-1;
+ em->block_start = 0;
+ em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ if (ret == -EEXIST) {
+ free_extent_map(em);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em)
+ em = ERR_PTR(-EIO);
+ } else if (ret) {
+ free_extent_map(em);
+ em = ERR_PTR(ret);
+ }
+ write_unlock(&em_tree->lock);
+
+out:
+ return em;
+}
+
+u32 btrfs_csum_data(char *data, u32 seed, size_t len)
+{
+ return btrfs_crc32c(seed, data, len);
+}
+
+void btrfs_csum_final(u32 crc, char *result)
+{
+ put_unaligned_le32(~crc, result);
+}
+
+/*
+ * compute the csum for a btree block, and either verify it or write it
+ * into the csum field of the block.
+ */
+static int csum_tree_block(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *buf,
+ int verify)
+{
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ char *result = NULL;
+ unsigned long len;
+ unsigned long cur_len;
+ unsigned long offset = BTRFS_CSUM_SIZE;
+ char *kaddr;
+ unsigned long map_start;
+ unsigned long map_len;
+ int err;
+ u32 crc = ~(u32)0;
+ unsigned long inline_result;
+
+ len = buf->len - offset;
+ while (len > 0) {
+ err = map_private_extent_buffer(buf, offset, 32,
+ &kaddr, &map_start, &map_len);
+ if (err)
+ return 1;
+ cur_len = min(len, map_len - (offset - map_start));
+ crc = btrfs_csum_data(kaddr + offset - map_start,
+ crc, cur_len);
+ len -= cur_len;
+ offset += cur_len;
+ }
+ if (csum_size > sizeof(inline_result)) {
+ result = kzalloc(csum_size, GFP_NOFS);
+ if (!result)
+ return 1;
+ } else {
+ result = (char *)&inline_result;
+ }
+
+ btrfs_csum_final(crc, result);
+
+ if (verify) {
+ if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
+ u32 val;
+ u32 found = 0;
+ memcpy(&found, result, csum_size);
+
+ read_extent_buffer(buf, &val, 0, csum_size);
+ printk_ratelimited(KERN_WARNING
+ "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
+ "level %d\n",
+ fs_info->sb->s_id, buf->start,
+ val, found, btrfs_header_level(buf));
+ if (result != (char *)&inline_result)
+ kfree(result);
+ return 1;
+ }
+ } else {
+ write_extent_buffer(buf, result, 0, csum_size);
+ }
+ if (result != (char *)&inline_result)
+ kfree(result);
+ return 0;
+}
+
+/*
+ * we can't consider a given block up to date unless the transid of the
+ * block matches the transid in the parent node's pointer. This is how we
+ * detect blocks that either didn't get written at all or got written
+ * in the wrong place.
+ */
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+ struct extent_buffer *eb, u64 parent_transid,
+ int atomic)
+{
+ struct extent_state *cached_state = NULL;
+ int ret;
+ bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
+
+ if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
+ return 0;
+
+ if (atomic)
+ return -EAGAIN;
+
+ if (need_lock) {
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ }
+
+ lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+ 0, &cached_state);
+ if (extent_buffer_uptodate(eb) &&
+ btrfs_header_generation(eb) == parent_transid) {
+ ret = 0;
+ goto out;
+ }
+ printk_ratelimited(KERN_ERR
+ "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+ eb->fs_info->sb->s_id, eb->start,
+ parent_transid, btrfs_header_generation(eb));
+ ret = 1;
+
+ /*
+ * Things reading via commit roots that don't have normal protection,
+ * like send, can have a really old block in cache that may point at a
+ * block that has been free'd and re-allocated. So don't clear uptodate
+ * if we find an eb that is under IO (dirty/writeback) because we could
+ * end up reading in the stale data and then writing it back out and
+ * making everybody very sad.
+ */
+ if (!extent_buffer_under_io(eb))
+ clear_extent_buffer_uptodate(eb);
+out:
+ unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state, GFP_NOFS);
+ if (need_lock)
+ btrfs_tree_read_unlock_blocking(eb);
+ return ret;
+}
+
+/*
+ * Return 0 if the superblock checksum type matches the checksum value of that
+ * algorithm. Pass the raw disk superblock data.
+ */
+static int btrfs_check_super_csum(char *raw_disk_sb)
+{
+ struct btrfs_super_block *disk_sb =
+ (struct btrfs_super_block *)raw_disk_sb;
+ u16 csum_type = btrfs_super_csum_type(disk_sb);
+ int ret = 0;
+
+ if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
+ u32 crc = ~(u32)0;
+ const int csum_size = sizeof(crc);
+ char result[csum_size];
+
+ /*
+ * The super_block structure does not span the whole
+ * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
+ * is filled with zeros and is included in the checkum.
+ */
+ crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
+ crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, result);
+
+ if (memcmp(raw_disk_sb, result, csum_size))
+ ret = 1;
+ }
+
+ if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
+ printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n",
+ csum_type);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/*
+ * helper to read a given tree block, doing retries as required when
+ * the checksums don't match and we have alternate mirrors to try.
+ */
+static int btree_read_extent_buffer_pages(struct btrfs_root *root,
+ struct extent_buffer *eb,
+ u64 start, u64 parent_transid)
+{
+ struct extent_io_tree *io_tree;
+ int failed = 0;
+ int ret;
+ int num_copies = 0;
+ int mirror_num = 0;
+ int failed_mirror = 0;
+
+ clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+ while (1) {
+ ret = read_extent_buffer_pages(io_tree, eb, start,
+ WAIT_COMPLETE,
+ btree_get_extent, mirror_num);
+ if (!ret) {
+ if (!verify_parent_transid(io_tree, eb,
+ parent_transid, 0))
+ break;
+ else
+ ret = -EIO;
+ }
+
+ /*
+ * This buffer's crc is fine, but its contents are corrupted, so
+ * there is no reason to read the other copies, they won't be
+ * any less wrong.
+ */
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+ break;
+
+ num_copies = btrfs_num_copies(root->fs_info,
+ eb->start, eb->len);
+ if (num_copies == 1)
+ break;
+
+ if (!failed_mirror) {
+ failed = 1;
+ failed_mirror = eb->read_mirror;
+ }
+
+ mirror_num++;
+ if (mirror_num == failed_mirror)
+ mirror_num++;
+
+ if (mirror_num > num_copies)
+ break;
+ }
+
+ if (failed && !ret && failed_mirror)
+ repair_eb_io_failure(root, eb, failed_mirror);
+
+ return ret;
+}
+
+/*
+ * checksum a dirty tree block before IO. This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block
+ */
+
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
+{
+ u64 start = page_offset(page);
+ u64 found_start;
+ struct extent_buffer *eb;
+
+ eb = (struct extent_buffer *)page->private;
+ if (page != eb->pages[0])
+ return 0;
+ found_start = btrfs_header_bytenr(eb);
+ if (WARN_ON(found_start != start || !PageUptodate(page)))
+ return 0;
+ csum_tree_block(fs_info, eb, 0);
+ return 0;
+}
+
+static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u8 fsid[BTRFS_UUID_SIZE];
+ int ret = 1;
+
+ read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
+ while (fs_devices) {
+ if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+ ret = 0;
+ break;
+ }
+ fs_devices = fs_devices->seed;
+ }
+ return ret;
+}
+
+#define CORRUPT(reason, eb, root, slot) \
+ btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \
+ "root=%llu, slot=%d", reason, \
+ btrfs_header_bytenr(eb), root->objectid, slot)
+
+static noinline int check_leaf(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_key leaf_key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ if (nritems == 0)
+ return 0;
+
+ /* Check the 0 item */
+ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("invalid item offset size pair", leaf, root, 0);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure each items keys are in the correct order and their
+ * offsets make sense. We only have to loop through nritems-1 because
+ * we check the current slot against the next slot, which verifies the
+ * next slot's offset+size makes sense and that the current's slot
+ * offset is correct.
+ */
+ for (slot = 0; slot < nritems - 1; slot++) {
+ btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+ btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+ /* Make sure the keys are in the right order */
+ if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+ CORRUPT("bad key order", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (btrfs_item_offset_nr(leaf, slot) !=
+ btrfs_item_end_nr(leaf, slot + 1)) {
+ CORRUPT("slot offset bad", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just incase all the items are consistent to eachother, but
+ * all point outside of the leaf.
+ */
+ if (btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("slot end outside of leaf", leaf, root, slot);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+ u64 phy_offset, struct page *page,
+ u64 start, u64 end, int mirror)
+{
+ u64 found_start;
+ int found_level;
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ int ret = 0;
+ int reads_done;
+
+ if (!page->private)
+ goto out;
+
+ eb = (struct extent_buffer *)page->private;
+
+ /* the pending IO might have been the only thing that kept this buffer
+ * in memory. Make sure we have a ref for all this other checks
+ */
+ extent_buffer_get(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ found_start = btrfs_header_bytenr(eb);
+ if (found_start != eb->start) {
+ printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
+ "%llu %llu\n",
+ eb->fs_info->sb->s_id, found_start, eb->start);
+ ret = -EIO;
+ goto err;
+ }
+ if (check_tree_block_fsid(root->fs_info, eb)) {
+ printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
+ eb->fs_info->sb->s_id, eb->start);
+ ret = -EIO;
+ goto err;
+ }
+ found_level = btrfs_header_level(eb);
+ if (found_level >= BTRFS_MAX_LEVEL) {
+ btrfs_err(root->fs_info, "bad tree block level %d",
+ (int)btrfs_header_level(eb));
+ ret = -EIO;
+ goto err;
+ }
+
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
+ eb, found_level);
+
+ ret = csum_tree_block(root->fs_info, eb, 1);
+ if (ret) {
+ ret = -EIO;
+ goto err;
+ }
+
+ /*
+ * If this is a leaf block and it is corrupt, set the corrupt bit so
+ * that we don't try and read the other copies of this block, just
+ * return -EIO.
+ */
+ if (found_level == 0 && check_leaf(root, eb)) {
+ set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ ret = -EIO;
+ }
+
+ if (!ret)
+ set_extent_buffer_uptodate(eb);
+err:
+ if (reads_done &&
+ test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+ btree_readahead_hook(root, eb, eb->start, ret);
+
+ if (ret) {
+ /*
+ * our io error hook is going to dec the io pages
+ * again, we have to make sure it has something
+ * to decrement
+ */
+ atomic_inc(&eb->io_pages);
+ clear_extent_buffer_uptodate(eb);
+ }
+ free_extent_buffer(eb);
+out:
+ return ret;
+}
+
+static int btree_io_failed_hook(struct page *page, int failed_mirror)
+{
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+ eb = (struct extent_buffer *)page->private;
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = failed_mirror;
+ atomic_dec(&eb->io_pages);
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+ btree_readahead_hook(root, eb, eb->start, -EIO);
+ return -EIO; /* we fixed nothing */
+}
+
+static void end_workqueue_bio(struct bio *bio, int err)
+{
+ struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_workqueue *wq;
+ btrfs_work_func_t func;
+
+ fs_info = end_io_wq->info;
+ end_io_wq->error = err;
+
+ if (bio->bi_rw & REQ_WRITE) {
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
+ wq = fs_info->endio_meta_write_workers;
+ func = btrfs_endio_meta_write_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
+ wq = fs_info->endio_freespace_worker;
+ func = btrfs_freespace_write_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ wq = fs_info->endio_raid56_workers;
+ func = btrfs_endio_raid56_helper;
+ } else {
+ wq = fs_info->endio_write_workers;
+ func = btrfs_endio_write_helper;
+ }
+ } else {
+ if (unlikely(end_io_wq->metadata ==
+ BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+ wq = fs_info->endio_repair_workers;
+ func = btrfs_endio_repair_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ wq = fs_info->endio_raid56_workers;
+ func = btrfs_endio_raid56_helper;
+ } else if (end_io_wq->metadata) {
+ wq = fs_info->endio_meta_workers;
+ func = btrfs_endio_meta_helper;
+ } else {
+ wq = fs_info->endio_workers;
+ func = btrfs_endio_helper;
+ }
+ }
+
+ btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
+ btrfs_queue_work(wq, &end_io_wq->work);
+}
+
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+ enum btrfs_wq_endio_type metadata)
+{
+ struct btrfs_end_io_wq *end_io_wq;
+
+ end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
+ if (!end_io_wq)
+ return -ENOMEM;
+
+ end_io_wq->private = bio->bi_private;
+ end_io_wq->end_io = bio->bi_end_io;
+ end_io_wq->info = info;
+ end_io_wq->error = 0;
+ end_io_wq->bio = bio;
+ end_io_wq->metadata = metadata;
+
+ bio->bi_private = end_io_wq;
+ bio->bi_end_io = end_workqueue_bio;
+ return 0;
+}
+
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
+{
+ unsigned long limit = min_t(unsigned long,
+ info->thread_pool_size,
+ info->fs_devices->open_devices);
+ return 256 * limit;
+}
+
+static void run_one_async_start(struct btrfs_work *work)
+{
+ struct async_submit_bio *async;
+ int ret;
+
+ async = container_of(work, struct async_submit_bio, work);
+ ret = async->submit_bio_start(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
+ if (ret)
+ async->error = ret;
+}
+
+static void run_one_async_done(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct async_submit_bio *async;
+ int limit;
+
+ async = container_of(work, struct async_submit_bio, work);
+ fs_info = BTRFS_I(async->inode)->root->fs_info;
+
+ limit = btrfs_async_submit_limit(fs_info);
+ limit = limit * 2 / 3;
+
+ if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
+ waitqueue_active(&fs_info->async_submit_wait))
+ wake_up(&fs_info->async_submit_wait);
+
+ /* If an error occured we just want to clean up the bio and move on */
+ if (async->error) {
+ bio_endio(async->bio, async->error);
+ return;
+ }
+
+ async->submit_bio_done(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
+}
+
+static void run_one_async_free(struct btrfs_work *work)
+{
+ struct async_submit_bio *async;
+
+ async = container_of(work, struct async_submit_bio, work);
+ kfree(async);
+}
+
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+ int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
+ u64 bio_offset,
+ extent_submit_bio_hook_t *submit_bio_start,
+ extent_submit_bio_hook_t *submit_bio_done)
+{
+ struct async_submit_bio *async;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return -ENOMEM;
+
+ async->inode = inode;
+ async->rw = rw;
+ async->bio = bio;
+ async->mirror_num = mirror_num;
+ async->submit_bio_start = submit_bio_start;
+ async->submit_bio_done = submit_bio_done;
+
+ btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
+ run_one_async_done, run_one_async_free);
+
+ async->bio_flags = bio_flags;
+ async->bio_offset = bio_offset;
+
+ async->error = 0;
+
+ atomic_inc(&fs_info->nr_async_submits);
+
+ if (rw & REQ_SYNC)
+ btrfs_set_work_high_priority(&async->work);
+
+ btrfs_queue_work(fs_info->workers, &async->work);
+
+ while (atomic_read(&fs_info->async_submit_draining) &&
+ atomic_read(&fs_info->nr_async_submits)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0));
+ }
+
+ return 0;
+}
+
+static int btree_csum_one_bio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ struct btrfs_root *root;
+ int i, ret = 0;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ root = BTRFS_I(bvec->bv_page->mapping->host)->root;
+ ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __btree_submit_bio_start(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
+ u64 bio_offset)
+{
+ /*
+ * when we're called for a write, we're already in the async
+ * submission context. Just jump into btrfs_map_bio
+ */
+ return btree_csum_one_bio(bio);
+}
+
+static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
+{
+ int ret;
+
+ /*
+ * when we're called for a write, we're already in the async
+ * submission context. Just jump into btrfs_map_bio
+ */
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+ if (ret)
+ bio_endio(bio, ret);
+ return ret;
+}
+
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+ if (bio_flags & EXTENT_BIO_TREE_LOG)
+ return 0;
+#ifdef CONFIG_X86
+ if (cpu_has_xmm4_2)
+ return 0;
+#endif
+ return 1;
+}
+
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
+{
+ int async = check_async_write(inode, bio_flags);
+ int ret;
+
+ if (!(rw & REQ_WRITE)) {
+ /*
+ * called for a read, do the setup so that checksum validation
+ * can happen in the async kernel threads
+ */
+ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+ bio, BTRFS_WQ_ENDIO_METADATA);
+ if (ret)
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ } else if (!async) {
+ ret = btree_csum_one_bio(bio);
+ if (ret)
+ goto out_w_error;
+ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ } else {
+ /*
+ * kthread helpers are used to submit writes so that
+ * checksumming can happen in parallel across all CPUs
+ */
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num, 0,
+ bio_offset,
+ __btree_submit_bio_start,
+ __btree_submit_bio_done);
+ }
+
+ if (ret) {
+out_w_error:
+ bio_endio(bio, ret);
+ }
+ return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+static int btree_migratepage(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ /*
+ * we can't safely write a btree page from here,
+ * we haven't done the locking hook
+ */
+ if (PageDirty(page))
+ return -EAGAIN;
+ /*
+ * Buffers may be managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL))
+ return -EAGAIN;
+ return migrate_page(mapping, newpage, page, mode);
+}
+#endif
+
+
+static int btree_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct btrfs_fs_info *fs_info;
+ int ret;
+
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+
+ if (wbc->for_kupdate)
+ return 0;
+
+ fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ /* this is a bit racy, but that's ok */
+ ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH);
+ if (ret < 0)
+ return 0;
+ }
+ return btree_write_cache_pages(mapping, wbc);
+}
+
+static int btree_readpage(struct file *file, struct page *page)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_read_full_page(tree, page, btree_get_extent, 0);
+}
+
+static int btree_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ if (PageWriteback(page) || PageDirty(page))
+ return 0;
+
+ return try_release_extent_buffer(page);
+}
+
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ extent_invalidatepage(tree, page, offset);
+ btree_releasepage(page, GFP_NOFS);
+ if (PagePrivate(page)) {
+ btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
+ "page private not zero on page %llu",
+ (unsigned long long)page_offset(page));
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+}
+
+static int btree_set_page_dirty(struct page *page)
+{
+#ifdef DEBUG
+ struct extent_buffer *eb;
+
+ BUG_ON(!PagePrivate(page));
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+#endif
+ return __set_page_dirty_nobuffers(page);
+}
+
+static const struct address_space_operations btree_aops = {
+ .readpage = btree_readpage,
+ .writepages = btree_writepages,
+ .releasepage = btree_releasepage,
+ .invalidatepage = btree_invalidatepage,
+#ifdef CONFIG_MIGRATION
+ .migratepage = btree_migratepage,
+#endif
+ .set_page_dirty = btree_set_page_dirty,
+};
+
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+
+ buf = btrfs_find_create_tree_block(root, bytenr);
+ if (!buf)
+ return;
+ read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
+ buf, 0, WAIT_NONE, btree_get_extent, 0);
+ free_extent_buffer(buf);
+}
+
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
+ int mirror_num, struct extent_buffer **eb)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+ int ret;
+
+ buf = btrfs_find_create_tree_block(root, bytenr);
+ if (!buf)
+ return 0;
+
+ set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+ ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+ btree_get_extent, mirror_num);
+ if (ret) {
+ free_extent_buffer(buf);
+ return ret;
+ }
+
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+ free_extent_buffer(buf);
+ return -EIO;
+ } else if (extent_buffer_uptodate(buf)) {
+ *eb = buf;
+ } else {
+ free_extent_buffer(buf);
+ }
+ return 0;
+}
+
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr)
+{
+ return find_extent_buffer(fs_info, bytenr);
+}
+
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+ u64 bytenr)
+{
+ if (btrfs_test_is_dummy_root(root))
+ return alloc_test_extent_buffer(root->fs_info, bytenr);
+ return alloc_extent_buffer(root->fs_info, bytenr);
+}
+
+
+int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+ return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
+ buf->start + buf->len - 1);
+}
+
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+ return filemap_fdatawait_range(buf->pages[0]->mapping,
+ buf->start, buf->start + buf->len - 1);
+}
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+ u64 parent_transid)
+{
+ struct extent_buffer *buf = NULL;
+ int ret;
+
+ buf = btrfs_find_create_tree_block(root, bytenr);
+ if (!buf)
+ return NULL;
+
+ ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+ if (ret) {
+ free_extent_buffer(buf);
+ return NULL;
+ }
+ return buf;
+
+}
+
+void clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct extent_buffer *buf)
+{
+ if (btrfs_header_generation(buf) ==
+ fs_info->running_transaction->transid) {
+ btrfs_assert_tree_locked(buf);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+ __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+ -buf->len,
+ fs_info->dirty_metadata_batch);
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
+ clear_extent_buffer_dirty(buf);
+ }
+ }
+}
+
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+ struct btrfs_subvolume_writers *writers;
+ int ret;
+
+ writers = kmalloc(sizeof(*writers), GFP_NOFS);
+ if (!writers)
+ return ERR_PTR(-ENOMEM);
+
+ ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
+ if (ret < 0) {
+ kfree(writers);
+ return ERR_PTR(ret);
+ }
+
+ init_waitqueue_head(&writers->wait);
+ return writers;
+}
+
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+ percpu_counter_destroy(&writers->counter);
+ kfree(writers);
+}
+
+static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+ struct btrfs_root *root, struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ root->node = NULL;
+ root->commit_root = NULL;
+ root->sectorsize = sectorsize;
+ root->nodesize = nodesize;
+ root->stripesize = stripesize;
+ root->state = 0;
+ root->orphan_cleanup_state = 0;
+
+ root->objectid = objectid;
+ root->last_trans = 0;
+ root->highest_objectid = 0;
+ root->nr_delalloc_inodes = 0;
+ root->nr_ordered_extents = 0;
+ root->name = NULL;
+ root->inode_tree = RB_ROOT;
+ INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+ root->block_rsv = NULL;
+ root->orphan_block_rsv = NULL;
+
+ INIT_LIST_HEAD(&root->dirty_list);
+ INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD(&root->delalloc_inodes);
+ INIT_LIST_HEAD(&root->delalloc_root);
+ INIT_LIST_HEAD(&root->ordered_extents);
+ INIT_LIST_HEAD(&root->ordered_root);
+ INIT_LIST_HEAD(&root->logged_list[0]);
+ INIT_LIST_HEAD(&root->logged_list[1]);
+ spin_lock_init(&root->orphan_lock);
+ spin_lock_init(&root->inode_lock);
+ spin_lock_init(&root->delalloc_lock);
+ spin_lock_init(&root->ordered_extent_lock);
+ spin_lock_init(&root->accounting_lock);
+ spin_lock_init(&root->log_extents_lock[0]);
+ spin_lock_init(&root->log_extents_lock[1]);
+ mutex_init(&root->objectid_mutex);
+ mutex_init(&root->log_mutex);
+ mutex_init(&root->ordered_extent_mutex);
+ mutex_init(&root->delalloc_mutex);
+ init_waitqueue_head(&root->log_writer_wait);
+ init_waitqueue_head(&root->log_commit_wait[0]);
+ init_waitqueue_head(&root->log_commit_wait[1]);
+ INIT_LIST_HEAD(&root->log_ctxs[0]);
+ INIT_LIST_HEAD(&root->log_ctxs[1]);
+ atomic_set(&root->log_commit[0], 0);
+ atomic_set(&root->log_commit[1], 0);
+ atomic_set(&root->log_writers, 0);
+ atomic_set(&root->log_batch, 0);
+ atomic_set(&root->orphan_inodes, 0);
+ atomic_set(&root->refs, 1);
+ atomic_set(&root->will_be_snapshoted, 0);
+ root->log_transid = 0;
+ root->log_transid_committed = -1;
+ root->last_log_commit = 0;
+ if (fs_info)
+ extent_io_tree_init(&root->dirty_log_pages,
+ fs_info->btree_inode->i_mapping);
+
+ memset(&root->root_key, 0, sizeof(root->root_key));
+ memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+ if (fs_info)
+ root->defrag_trans_start = fs_info->generation;
+ else
+ root->defrag_trans_start = 0;
+ root->root_key.objectid = objectid;
+ root->anon_dev = 0;
+
+ spin_lock_init(&root->root_item_lock);
+}
+
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (root)
+ root->fs_info = fs_info;
+ return root;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/* Should only be used by the testing infrastructure */
+struct btrfs_root *btrfs_alloc_dummy_root(void)
+{
+ struct btrfs_root *root;
+
+ root = btrfs_alloc_root(NULL);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+ __setup_root(4096, 4096, 4096, root, NULL, 1);
+ set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
+ root->alloc_bytenr = 0;
+
+ return root;
+}
+#endif
+
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int ret = 0;
+ uuid_le uuid;
+
+ root = btrfs_alloc_root(fs_info);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, objectid);
+ root->root_key.objectid = objectid;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ if (IS_ERR(leaf)) {
+ ret = PTR_ERR(leaf);
+ leaf = NULL;
+ goto fail;
+ }
+
+ memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(leaf, leaf->start);
+ btrfs_set_header_generation(leaf, trans->transid);
+ btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(leaf, objectid);
+ root->node = leaf;
+
+ write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
+ BTRFS_FSID_SIZE);
+ write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
+ btrfs_header_chunk_tree_uuid(leaf),
+ BTRFS_UUID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ root->commit_root = btrfs_root_node(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+
+ root->root_item.flags = 0;
+ root->root_item.byte_limit = 0;
+ btrfs_set_root_bytenr(&root->root_item, leaf->start);
+ btrfs_set_root_generation(&root->root_item, trans->transid);
+ btrfs_set_root_level(&root->root_item, 0);
+ btrfs_set_root_refs(&root->root_item, 1);
+ btrfs_set_root_used(&root->root_item, leaf->len);
+ btrfs_set_root_last_snapshot(&root->root_item, 0);
+ btrfs_set_root_dirid(&root->root_item, 0);
+ uuid_le_gen(&uuid);
+ memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
+ root->root_item.drop_level = 0;
+
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+ if (ret)
+ goto fail;
+
+ btrfs_tree_unlock(leaf);
+
+ return root;
+
+fail:
+ if (leaf) {
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(root->commit_root);
+ free_extent_buffer(leaf);
+ }
+ kfree(root);
+
+ return ERR_PTR(ret);
+}
+
+static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct extent_buffer *leaf;
+
+ root = btrfs_alloc_root(fs_info);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info,
+ BTRFS_TREE_LOG_OBJECTID);
+
+ root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+
+ /*
+ * DON'T set REF_COWS for log trees
+ *
+ * log trees do not get reference counted because they go away
+ * before a real commit is actually done. They do store pointers
+ * to file data extents, and those reference counts still get
+ * updated (along with back refs to the log tree).
+ */
+
+ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+ NULL, 0, 0, 0);
+ if (IS_ERR(leaf)) {
+ kfree(root);
+ return ERR_CAST(leaf);
+ }
+
+ memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(leaf, leaf->start);
+ btrfs_set_header_generation(leaf, trans->transid);
+ btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
+ root->node = leaf;
+
+ write_extent_buffer(root->node, root->fs_info->fsid,
+ btrfs_header_fsid(), BTRFS_FSID_SIZE);
+ btrfs_mark_buffer_dirty(root->node);
+ btrfs_tree_unlock(root->node);
+ return root;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *log_root;
+
+ log_root = alloc_log_tree(trans, fs_info);
+ if (IS_ERR(log_root))
+ return PTR_ERR(log_root);
+ WARN_ON(fs_info->log_root_tree);
+ fs_info->log_root_tree = log_root;
+ return 0;
+}
+
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *log_root;
+ struct btrfs_inode_item *inode_item;
+
+ log_root = alloc_log_tree(trans, root->fs_info);
+ if (IS_ERR(log_root))
+ return PTR_ERR(log_root);
+
+ log_root->last_trans = trans->transid;
+ log_root->root_key.offset = root->root_key.objectid;
+
+ inode_item = &log_root->root_item.inode;
+ btrfs_set_stack_inode_generation(inode_item, 1);
+ btrfs_set_stack_inode_size(inode_item, 3);
+ btrfs_set_stack_inode_nlink(inode_item, 1);
+ btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
+ btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
+
+ btrfs_set_root_node(&log_root->root_item, log_root->node);
+
+ WARN_ON(root->log_root);
+ root->log_root = log_root;
+ root->log_transid = 0;
+ root->log_transid_committed = -1;
+ root->last_log_commit = 0;
+ return 0;
+}
+
+static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
+{
+ struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_path *path;
+ u64 generation;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ root = btrfs_alloc_root(fs_info);
+ if (!root) {
+ ret = -ENOMEM;
+ goto alloc_fail;
+ }
+
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, key->objectid);
+
+ ret = btrfs_find_root(tree_root, key, path,
+ &root->root_item, &root->root_key);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto find_fail;
+ }
+
+ generation = btrfs_root_generation(&root->root_item);
+ root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ generation);
+ if (!root->node) {
+ ret = -ENOMEM;
+ goto find_fail;
+ } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ ret = -EIO;
+ goto read_fail;
+ }
+ root->commit_root = btrfs_root_node(root);
+out:
+ btrfs_free_path(path);
+ return root;
+
+read_fail:
+ free_extent_buffer(root->node);
+find_fail:
+ kfree(root);
+alloc_fail:
+ root = ERR_PTR(ret);
+ goto out;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+
+ root = btrfs_read_tree_root(tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ set_bit(BTRFS_ROOT_REF_COWS, &root->state);
+ btrfs_check_and_init_root_item(&root->root_item);
+ }
+
+ return root;
+}
+
+int btrfs_init_fs_root(struct btrfs_root *root)
+{
+ int ret;
+ struct btrfs_subvolume_writers *writers;
+
+ root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+ root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+ GFP_NOFS);
+ if (!root->free_ino_pinned || !root->free_ino_ctl) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ writers = btrfs_alloc_subvolume_writers();
+ if (IS_ERR(writers)) {
+ ret = PTR_ERR(writers);
+ goto fail;
+ }
+ root->subv_writers = writers;
+
+ btrfs_init_free_ino_ctl(root);
+ spin_lock_init(&root->ino_cache_lock);
+ init_waitqueue_head(&root->ino_cache_wait);
+
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto free_writers;
+ return 0;
+
+free_writers:
+ btrfs_free_subvolume_writers(root->subv_writers);
+fail:
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
+ return ret;
+}
+
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
+{
+ struct btrfs_root *root;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_id);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ return root;
+}
+
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ return ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
+ if (ret == 0)
+ set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
+
+ return ret;
+}
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location,
+ bool check_ref)
+{
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return fs_info->tree_root;
+ if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return fs_info->extent_root;
+ if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return fs_info->chunk_root;
+ if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+ return fs_info->dev_root;
+ if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return fs_info->csum_root;
+ if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return fs_info->quota_root ? fs_info->quota_root :
+ ERR_PTR(-ENOENT);
+ if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
+ return fs_info->uuid_root ? fs_info->uuid_root :
+ ERR_PTR(-ENOENT);
+again:
+ root = btrfs_lookup_fs_root(fs_info, location->objectid);
+ if (root) {
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0)
+ return ERR_PTR(-ENOENT);
+ return root;
+ }
+
+ root = btrfs_read_fs_root(fs_info->tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ ret = btrfs_init_fs_root(root);
+ if (ret)
+ goto fail;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = location->objectid;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ btrfs_free_path(path);
+ if (ret < 0)
+ goto fail;
+ if (ret == 0)
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
+
+ ret = btrfs_insert_fs_root(fs_info, root);
+ if (ret) {
+ if (ret == -EEXIST) {
+ free_fs_root(root);
+ goto again;
+ }
+ goto fail;
+ }
+ return root;
+fail:
+ free_fs_root(root);
+ return ERR_PTR(ret);
+}
+
+static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+{
+ struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
+ int ret = 0;
+ struct btrfs_device *device;
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+ bdi = blk_get_backing_dev_info(device->bdev);
+ if (bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
+{
+ int err;
+
+ err = bdi_setup_and_register(bdi, "btrfs");
+ if (err)
+ return err;
+
+ bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
+ bdi->congested_fn = btrfs_congested_fn;
+ bdi->congested_data = info;
+ return 0;
+}
+
+/*
+ * called by the kthread helper functions to finally call the bio end_io
+ * functions. This is where read checksum verification actually happens
+ */
+static void end_workqueue_fn(struct btrfs_work *work)
+{
+ struct bio *bio;
+ struct btrfs_end_io_wq *end_io_wq;
+ int error;
+
+ end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
+ bio = end_io_wq->bio;
+
+ error = end_io_wq->error;
+ bio->bi_private = end_io_wq->private;
+ bio->bi_end_io = end_io_wq->end_io;
+ kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
+ bio_endio_nodec(bio, error);
+}
+
+static int cleaner_kthread(void *arg)
+{
+ struct btrfs_root *root = arg;
+ int again;
+
+ do {
+ again = 0;
+
+ /* Make the cleaner go to sleep early. */
+ if (btrfs_need_cleaner_sleep(root))
+ goto sleep;
+
+ if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+ goto sleep;
+
+ /*
+ * Avoid the problem that we change the status of the fs
+ * during the above check and trylock.
+ */
+ if (btrfs_need_cleaner_sleep(root)) {
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ goto sleep;
+ }
+
+ btrfs_run_delayed_iputs(root);
+ btrfs_delete_unused_bgs(root->fs_info);
+ again = btrfs_clean_one_deleted_snapshot(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ /*
+ * The defragger has dealt with the R/O remount and umount,
+ * needn't do anything special here.
+ */
+ btrfs_run_defrag_inodes(root->fs_info);
+sleep:
+ if (!try_to_freeze() && !again) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop())
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+static int transaction_kthread(void *arg)
+{
+ struct btrfs_root *root = arg;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_transaction *cur;
+ u64 transid;
+ unsigned long now;
+ unsigned long delay;
+ bool cannot_commit;
+
+ do {
+ cannot_commit = false;
+ delay = HZ * root->fs_info->commit_interval;
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+ spin_lock(&root->fs_info->trans_lock);
+ cur = root->fs_info->running_transaction;
+ if (!cur) {
+ spin_unlock(&root->fs_info->trans_lock);
+ goto sleep;
+ }
+
+ now = get_seconds();
+ if (cur->state < TRANS_STATE_BLOCKED &&
+ (now < cur->start_time ||
+ now - cur->start_time < root->fs_info->commit_interval)) {
+ spin_unlock(&root->fs_info->trans_lock);
+ delay = HZ * 5;
+ goto sleep;
+ }
+ transid = cur->transid;
+ spin_unlock(&root->fs_info->trans_lock);
+
+ /* If the file system is aborted, this will always fail. */
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ cannot_commit = true;
+ goto sleep;
+ }
+ if (transid == trans->transid) {
+ btrfs_commit_transaction(trans, root);
+ } else {
+ btrfs_end_transaction(trans, root);
+ }
+sleep:
+ wake_up_process(root->fs_info->cleaner_kthread);
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+
+ if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
+ &root->fs_info->fs_state)))
+ btrfs_cleanup_transaction(root);
+ if (!try_to_freeze()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop() &&
+ (!btrfs_transaction_blocked(root->fs_info) ||
+ cannot_commit))
+ schedule_timeout(delay);
+ __set_current_state(TASK_RUNNING);
+ }
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+/*
+ * this will find the highest generation in the array of
+ * root backups. The index of the highest array is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest root in the array with the generation
+ * in the super block. If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+ u64 cur;
+ int newest_index = -1;
+ struct btrfs_root_backup *root_backup;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ root_backup = info->super_copy->super_roots + i;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = i;
+ }
+
+ /* check to see if we actually wrapped around */
+ if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+ root_backup = info->super_copy->super_roots;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = 0;
+ }
+ return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array. This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+ u64 newest_gen)
+{
+ int newest_index = -1;
+
+ newest_index = find_newest_super_backup(info, newest_gen);
+ /* if there was garbage in there, just move along */
+ if (newest_index == -1) {
+ info->backup_root_index = 0;
+ } else {
+ info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+ }
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+ int next_backup;
+ struct btrfs_root_backup *root_backup;
+ int last_backup;
+
+ next_backup = info->backup_root_index;
+ last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+
+ /*
+ * just overwrite the last backup if we're at the same generation
+ * this happens only at umount
+ */
+ root_backup = info->super_for_commit->super_roots + last_backup;
+ if (btrfs_backup_tree_root_gen(root_backup) ==
+ btrfs_header_generation(info->tree_root->node))
+ next_backup = last_backup;
+
+ root_backup = info->super_for_commit->super_roots + next_backup;
+
+ /*
+ * make sure all of our padding and empty slots get zero filled
+ * regardless of which ones we use today
+ */
+ memset(root_backup, 0, sizeof(*root_backup));
+
+ info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+ btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+ btrfs_set_backup_tree_root_gen(root_backup,
+ btrfs_header_generation(info->tree_root->node));
+
+ btrfs_set_backup_tree_root_level(root_backup,
+ btrfs_header_level(info->tree_root->node));
+
+ btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+ btrfs_set_backup_chunk_root_gen(root_backup,
+ btrfs_header_generation(info->chunk_root->node));
+ btrfs_set_backup_chunk_root_level(root_backup,
+ btrfs_header_level(info->chunk_root->node));
+
+ btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(info->extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(info->extent_root->node));
+
+ /*
+ * we might commit during log recovery, which happens before we set
+ * the fs_root. Make sure it is valid before we fill it in.
+ */
+ if (info->fs_root && info->fs_root->node) {
+ btrfs_set_backup_fs_root(root_backup,
+ info->fs_root->node->start);
+ btrfs_set_backup_fs_root_gen(root_backup,
+ btrfs_header_generation(info->fs_root->node));
+ btrfs_set_backup_fs_root_level(root_backup,
+ btrfs_header_level(info->fs_root->node));
+ }
+
+ btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+ btrfs_set_backup_dev_root_gen(root_backup,
+ btrfs_header_generation(info->dev_root->node));
+ btrfs_set_backup_dev_root_level(root_backup,
+ btrfs_header_level(info->dev_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(info->csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(info->csum_root->node));
+
+ btrfs_set_backup_total_bytes(root_backup,
+ btrfs_super_total_bytes(info->super_copy));
+ btrfs_set_backup_bytes_used(root_backup,
+ btrfs_super_bytes_used(info->super_copy));
+ btrfs_set_backup_num_devices(root_backup,
+ btrfs_super_num_devices(info->super_copy));
+
+ /*
+ * if we don't copy this out to the super_copy, it won't get remembered
+ * for the next commit
+ */
+ memcpy(&info->super_copy->super_roots,
+ &info->super_for_commit->super_roots,
+ sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block. It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+ struct btrfs_super_block *super,
+ int *num_backups_tried, int *backup_index)
+{
+ struct btrfs_root_backup *root_backup;
+ int newest = *backup_index;
+
+ if (*num_backups_tried == 0) {
+ u64 gen = btrfs_super_generation(super);
+
+ newest = find_newest_super_backup(info, gen);
+ if (newest == -1)
+ return -1;
+
+ *backup_index = newest;
+ *num_backups_tried = 1;
+ } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+ /* we've tried all the backups, all done */
+ return -1;
+ } else {
+ /* jump to the next oldest backup */
+ newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+ *backup_index = newest;
+ *num_backups_tried += 1;
+ }
+ root_backup = super->super_roots + newest;
+
+ btrfs_set_super_generation(super,
+ btrfs_backup_tree_root_gen(root_backup));
+ btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+ btrfs_set_super_root_level(super,
+ btrfs_backup_tree_root_level(root_backup));
+ btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+ /*
+ * fixme: the total bytes and num_devices need to match or we should
+ * need a fsck
+ */
+ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+ btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+ return 0;
+}
+
+/* helper to cleanup workers */
+static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
+{
+ btrfs_destroy_workqueue(fs_info->fixup_workers);
+ btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->workers);
+ btrfs_destroy_workqueue(fs_info->endio_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+ btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->endio_repair_workers);
+ btrfs_destroy_workqueue(fs_info->rmw_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+ btrfs_destroy_workqueue(fs_info->submit_workers);
+ btrfs_destroy_workqueue(fs_info->delayed_workers);
+ btrfs_destroy_workqueue(fs_info->caching_workers);
+ btrfs_destroy_workqueue(fs_info->readahead_workers);
+ btrfs_destroy_workqueue(fs_info->flush_workers);
+ btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
+ btrfs_destroy_workqueue(fs_info->extent_workers);
+}
+
+static void free_root_extent_buffers(struct btrfs_root *root)
+{
+ if (root) {
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ root->node = NULL;
+ root->commit_root = NULL;
+ }
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+ free_root_extent_buffers(info->tree_root);
+
+ free_root_extent_buffers(info->dev_root);
+ free_root_extent_buffers(info->extent_root);
+ free_root_extent_buffers(info->csum_root);
+ free_root_extent_buffers(info->quota_root);
+ free_root_extent_buffers(info->uuid_root);
+ if (chunk_root)
+ free_root_extent_buffers(info->chunk_root);
+}
+
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *gang[8];
+ int i;
+
+ while (!list_empty(&fs_info->dead_roots)) {
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
+
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
+ btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+ } else {
+ free_extent_buffer(gang[0]->node);
+ free_extent_buffer(gang[0]->commit_root);
+ btrfs_put_fs_root(gang[0]);
+ }
+ }
+
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++)
+ btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+ }
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ btrfs_free_log_root_tree(NULL, fs_info);
+ btrfs_destroy_pinned_extent(fs_info->tree_root,
+ fs_info->pinned_extents);
+ }
+}
+
+static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
+{
+ mutex_init(&fs_info->scrub_lock);
+ atomic_set(&fs_info->scrubs_running, 0);
+ atomic_set(&fs_info->scrub_pause_req, 0);
+ atomic_set(&fs_info->scrubs_paused, 0);
+ atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->scrub_pause_wait);
+ fs_info->scrub_workers_refcnt = 0;
+}
+
+static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_running, 0);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
+}
+
+static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *tree_root)
+{
+ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ set_nlink(fs_info->btree_inode, 1);
+ /*
+ * we set the i_size on the btree inode to the max possible int.
+ * the real end of the address space is determined by all of
+ * the devices in the system
+ */
+ fs_info->btree_inode->i_size = OFFSET_MAX;
+ fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+
+ RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
+ extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
+ fs_info->btree_inode->i_mapping);
+ BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
+ extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+
+ BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+
+ BTRFS_I(fs_info->btree_inode)->root = tree_root;
+ memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+ sizeof(struct btrfs_key));
+ set_bit(BTRFS_INODE_DUMMY,
+ &BTRFS_I(fs_info->btree_inode)->runtime_flags);
+ btrfs_insert_inode_hash(fs_info->btree_inode);
+}
+
+static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
+{
+ fs_info->dev_replace.lock_owner = 0;
+ atomic_set(&fs_info->dev_replace.nesting_level, 0);
+ mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+ mutex_init(&fs_info->dev_replace.lock_management_lock);
+ mutex_init(&fs_info->dev_replace.lock);
+ init_waitqueue_head(&fs_info->replace_wait);
+}
+
+static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->qgroup_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_op_tree = RB_ROOT;
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ fs_info->qgroup_seq = 1;
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ fs_info->qgroup_ulist = NULL;
+ mutex_init(&fs_info->qgroup_rescan_lock);
+}
+
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int max_active = fs_info->thread_pool_size;
+ unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+
+ fs_info->workers =
+ btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+ max_active, 16);
+
+ fs_info->delalloc_workers =
+ btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+
+ fs_info->flush_workers =
+ btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+
+ fs_info->caching_workers =
+ btrfs_alloc_workqueue("cache", flags, max_active, 0);
+
+ /*
+ * a higher idle thresh on the submit workers makes it much more
+ * likely that bios will be send down in a sane order to the
+ * devices
+ */
+ fs_info->submit_workers =
+ btrfs_alloc_workqueue("submit", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 64);
+
+ fs_info->fixup_workers =
+ btrfs_alloc_workqueue("fixup", flags, 1, 0);
+
+ /*
+ * endios are largely parallel and should have a very
+ * low idle thresh
+ */
+ fs_info->endio_workers =
+ btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ fs_info->endio_meta_workers =
+ btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ fs_info->endio_meta_write_workers =
+ btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ fs_info->endio_raid56_workers =
+ btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->endio_repair_workers =
+ btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
+ fs_info->rmw_workers =
+ btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ fs_info->endio_write_workers =
+ btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ fs_info->endio_freespace_worker =
+ btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ fs_info->delayed_workers =
+ btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ fs_info->readahead_workers =
+ btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ fs_info->qgroup_rescan_workers =
+ btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+ fs_info->extent_workers =
+ btrfs_alloc_workqueue("extent-refs", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 8);
+
+ if (!(fs_info->workers && fs_info->delalloc_workers &&
+ fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->endio_workers && fs_info->endio_meta_workers &&
+ fs_info->endio_meta_write_workers &&
+ fs_info->endio_repair_workers &&
+ fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+ fs_info->caching_workers && fs_info->readahead_workers &&
+ fs_info->fixup_workers && fs_info->delayed_workers &&
+ fs_info->extent_workers &&
+ fs_info->qgroup_rescan_workers)) {
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *log_tree_root;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 bytenr = btrfs_super_log_root(disk_super);
+
+ if (fs_devices->rw_devices == 0) {
+ printk(KERN_WARNING "BTRFS: log replay required "
+ "on RO media\n");
+ return -EIO;
+ }
+
+ log_tree_root = btrfs_alloc_root(fs_info);
+ if (!log_tree_root)
+ return -ENOMEM;
+
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, log_tree_root, fs_info,
+ BTRFS_TREE_LOG_OBJECTID);
+
+ log_tree_root->node = read_tree_block(tree_root, bytenr,
+ fs_info->generation + 1);
+ if (!log_tree_root->node ||
+ !extent_buffer_uptodate(log_tree_root->node)) {
+ printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ return -EIO;
+ }
+ /* returns with log_tree_root freed on success */
+ ret = btrfs_recover_log_trees(log_tree_root);
+ if (ret) {
+ btrfs_error(tree_root->fs_info, ret,
+ "Failed to recover log tree");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ return ret;
+ }
+
+ if (fs_info->sb->s_flags & MS_RDONLY) {
+ ret = btrfs_commit_super(tree_root);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *tree_root)
+{
+ struct btrfs_root *root;
+ struct btrfs_key location;
+ int ret;
+
+ location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = 0;
+
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->extent_root = root;
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->dev_root = root;
+ btrfs_init_devices_late(fs_info);
+
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->csum_root = root;
+
+ location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (!IS_ERR(root)) {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->quota_enabled = 1;
+ fs_info->pending_quota_state = 1;
+ fs_info->quota_root = root;
+ }
+
+ location.objectid = BTRFS_UUID_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT)
+ return ret;
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->uuid_root = root;
+ }
+
+ return 0;
+}
+
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options)
+{
+ u32 sectorsize;
+ u32 nodesize;
+ u32 stripesize;
+ u64 generation;
+ u64 features;
+ struct btrfs_key location;
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ int ret;
+ int err = -EINVAL;
+ int num_backups_tried = 0;
+ int backup_index = 0;
+ int max_active;
+
+ tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+ chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+ if (!tree_root || !chunk_root) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ ret = init_srcu_struct(&fs_info->subvol_srcu);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+
+ ret = setup_bdi(fs_info, &fs_info->bdi);
+ if (ret) {
+ err = ret;
+ goto fail_srcu;
+ }
+
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+ if (ret) {
+ err = ret;
+ goto fail_bdi;
+ }
+ fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+ (1 + ilog2(nr_cpu_ids));
+
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
+ if (ret) {
+ err = ret;
+ goto fail_dirty_metadata_bytes;
+ }
+
+ ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
+ if (ret) {
+ err = ret;
+ goto fail_delalloc_bytes;
+ }
+
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail_bio_counter;
+ }
+
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ INIT_LIST_HEAD(&fs_info->trans_list);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->delayed_iputs);
+ INIT_LIST_HEAD(&fs_info->delalloc_roots);
+ INIT_LIST_HEAD(&fs_info->caching_block_groups);
+ spin_lock_init(&fs_info->delalloc_root_lock);
+ spin_lock_init(&fs_info->trans_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->delayed_iput_lock);
+ spin_lock_init(&fs_info->defrag_inodes_lock);
+ spin_lock_init(&fs_info->free_chunk_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
+ spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->unused_bgs_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
+ mutex_init(&fs_info->unused_bg_unpin_mutex);
+ mutex_init(&fs_info->reloc_mutex);
+ mutex_init(&fs_info->delalloc_root_mutex);
+ seqlock_init(&fs_info->profiles_lock);
+ init_rwsem(&fs_info->delayed_iput_sem);
+
+ init_completion(&fs_info->kobj_unregister);
+ INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
+ INIT_LIST_HEAD(&fs_info->space_info);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_LIST_HEAD(&fs_info->unused_bgs);
+ btrfs_mapping_init(&fs_info->mapping_tree);
+ btrfs_init_block_rsv(&fs_info->global_block_rsv,
+ BTRFS_BLOCK_RSV_GLOBAL);
+ btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+ BTRFS_BLOCK_RSV_DELALLOC);
+ btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+ btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+ BTRFS_BLOCK_RSV_DELOPS);
+ atomic_set(&fs_info->nr_async_submits, 0);
+ atomic_set(&fs_info->async_delalloc_pages, 0);
+ atomic_set(&fs_info->async_submit_draining, 0);
+ atomic_set(&fs_info->nr_async_bios, 0);
+ atomic_set(&fs_info->defrag_running, 0);
+ atomic_set(&fs_info->qgroup_op_seq, 0);
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+ fs_info->sb = sb;
+ fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
+ fs_info->metadata_ratio = 0;
+ fs_info->defrag_inodes = RB_ROOT;
+ fs_info->free_chunk_space = 0;
+ fs_info->tree_mod_log = RB_ROOT;
+ fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
+ /* readahead state */
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ spin_lock_init(&fs_info->reada_lock);
+
+ fs_info->thread_pool_size = min_t(unsigned long,
+ num_online_cpus() + 2, 8);
+
+ INIT_LIST_HEAD(&fs_info->ordered_roots);
+ spin_lock_init(&fs_info->ordered_root_lock);
+ fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+ GFP_NOFS);
+ if (!fs_info->delayed_root) {
+ err = -ENOMEM;
+ goto fail_iput;
+ }
+ btrfs_init_delayed_root(fs_info->delayed_root);
+
+ btrfs_init_scrub(fs_info);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ fs_info->check_integrity_print_mask = 0;
+#endif
+ btrfs_init_balance(fs_info);
+ btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
+
+ sb->s_blocksize = 4096;
+ sb->s_blocksize_bits = blksize_bits(4096);
+ sb->s_bdi = &fs_info->bdi;
+
+ btrfs_init_btree_inode(fs_info, tree_root);
+
+ spin_lock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree = RB_ROOT;
+ fs_info->first_logical_byte = (u64)-1;
+
+ extent_io_tree_init(&fs_info->freed_extents[0],
+ fs_info->btree_inode->i_mapping);
+ extent_io_tree_init(&fs_info->freed_extents[1],
+ fs_info->btree_inode->i_mapping);
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
+ fs_info->do_barriers = 1;
+
+
+ mutex_init(&fs_info->ordered_operations_mutex);
+ mutex_init(&fs_info->ordered_extent_flush_mutex);
+ mutex_init(&fs_info->tree_log_mutex);
+ mutex_init(&fs_info->chunk_mutex);
+ mutex_init(&fs_info->transaction_kthread_mutex);
+ mutex_init(&fs_info->cleaner_mutex);
+ mutex_init(&fs_info->volume_mutex);
+ mutex_init(&fs_info->ro_block_group_mutex);
+ init_rwsem(&fs_info->commit_root_sem);
+ init_rwsem(&fs_info->cleanup_work_sem);
+ init_rwsem(&fs_info->subvol_sem);
+ sema_init(&fs_info->uuid_tree_rescan_sem, 1);
+
+ btrfs_init_dev_replace_locks(fs_info);
+ btrfs_init_qgroup(fs_info);
+
+ btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+ btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
+ init_waitqueue_head(&fs_info->transaction_throttle);
+ init_waitqueue_head(&fs_info->transaction_wait);
+ init_waitqueue_head(&fs_info->transaction_blocked_wait);
+ init_waitqueue_head(&fs_info->async_submit_wait);
+
+ INIT_LIST_HEAD(&fs_info->pinned_chunks);
+
+ ret = btrfs_alloc_stripe_hash_table(fs_info);
+ if (ret) {
+ err = ret;
+ goto fail_alloc;
+ }
+
+ __setup_root(4096, 4096, 4096, tree_root,
+ fs_info, BTRFS_ROOT_TREE_OBJECTID);
+
+ invalidate_bdev(fs_devices->latest_bdev);
+
+ /*
+ * Read super block and check the signature bytes only
+ */
+ bh = btrfs_read_dev_super(fs_devices->latest_bdev);
+ if (!bh) {
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
+ /*
+ * We want to check superblock checksum, the type is stored inside.
+ * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
+ */
+ if (btrfs_check_super_csum(bh->b_data)) {
+ printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
+ /*
+ * super_copy is zeroed at allocation time and we never touch the
+ * following bytes up to INFO_SIZE, the checksum is calculated from
+ * the whole block of INFO_SIZE
+ */
+ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_for_commit));
+ brelse(bh);
+
+ memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
+
+ ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
+ disk_super = fs_info->super_copy;
+ if (!btrfs_super_root(disk_super))
+ goto fail_alloc;
+
+ /* check FS state, whether FS is broken. */
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
+ /*
+ * run through our array of backup supers and setup
+ * our ring pointer to the oldest one
+ */
+ generation = btrfs_super_generation(disk_super);
+ find_oldest_super_backup(fs_info, generation);
+
+ /*
+ * In the long term, we'll store the compression type in the super
+ * block, and it'll be used for per file compression control.
+ */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
+ ret = btrfs_parse_options(tree_root, options);
+ if (ret) {
+ err = ret;
+ goto fail_alloc;
+ }
+
+ features = btrfs_super_incompat_flags(disk_super) &
+ ~BTRFS_FEATURE_INCOMPAT_SUPP;
+ if (features) {
+ printk(KERN_ERR "BTRFS: couldn't mount because of "
+ "unsupported optional features (%Lx).\n",
+ features);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
+ /*
+ * Leafsize and nodesize were always equal, this is only a sanity check.
+ */
+ if (le32_to_cpu(disk_super->__unused_leafsize) !=
+ btrfs_super_nodesize(disk_super)) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksizes don't match. node %d leaf %d\n",
+ btrfs_super_nodesize(disk_super),
+ le32_to_cpu(disk_super->__unused_leafsize));
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+ if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+ "blocksize (%d) was too large\n",
+ btrfs_super_nodesize(disk_super));
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
+ features = btrfs_super_incompat_flags(disk_super);
+ features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+ if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+
+ if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
+ printk(KERN_INFO "BTRFS: has skinny extents\n");
+
+ /*
+ * flag our filesystem as having big metadata blocks if
+ * they are bigger than the page size
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
+ nodesize = btrfs_super_nodesize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = btrfs_super_stripesize(disk_super);
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+ /*
+ * mixed block groups end up with duplicate but slightly offset
+ * extent buffers for the same range. It leads to corruptions
+ */
+ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (sectorsize != nodesize)) {
+ printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
+ "are not allowed for mixed block groups on %s\n",
+ sb->s_id);
+ goto fail_alloc;
+ }
+
+ /*
+ * Needn't use the lock because there is no other task which will
+ * update the flag.
+ */
+ btrfs_set_super_incompat_flags(disk_super, features);
+
+ features = btrfs_super_compat_ro_flags(disk_super) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP;
+ if (!(sb->s_flags & MS_RDONLY) && features) {
+ printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
+ "unsupported option features (%Lx).\n",
+ features);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
+ max_active = fs_info->thread_pool_size;
+
+ ret = btrfs_init_workqueues(fs_info, fs_devices);
+ if (ret) {
+ err = ret;
+ goto fail_sb_buffer;
+ }
+
+ fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+ fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+ 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+
+ tree_root->nodesize = nodesize;
+ tree_root->sectorsize = sectorsize;
+ tree_root->stripesize = stripesize;
+
+ sb->s_blocksize = sectorsize;
+ sb->s_blocksize_bits = blksize_bits(sectorsize);
+
+ if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
+ goto fail_sb_buffer;
+ }
+
+ if (sectorsize != PAGE_SIZE) {
+ printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
+ "found on %s\n", (unsigned long)sectorsize, sb->s_id);
+ goto fail_sb_buffer;
+ }
+
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_read_sys_array(tree_root);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: failed to read the system "
+ "array on %s\n", sb->s_id);
+ goto fail_sb_buffer;
+ }
+
+ generation = btrfs_super_chunk_root_generation(disk_super);
+
+ __setup_root(nodesize, sectorsize, stripesize, chunk_root,
+ fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+
+ chunk_root->node = read_tree_block(chunk_root,
+ btrfs_super_chunk_root(disk_super),
+ generation);
+ if (!chunk_root->node ||
+ !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
+ sb->s_id);
+ goto fail_tree_roots;
+ }
+ btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
+ chunk_root->commit_root = btrfs_root_node(chunk_root);
+
+ read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
+ btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
+
+ ret = btrfs_read_chunk_tree(chunk_root);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
+ sb->s_id);
+ goto fail_tree_roots;
+ }
+
+ /*
+ * keep the device that is marked to be the target device for the
+ * dev_replace procedure
+ */
+ btrfs_close_extra_devices(fs_devices, 0);
+
+ if (!fs_devices->latest_bdev) {
+ printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
+ sb->s_id);
+ goto fail_tree_roots;
+ }
+
+retry_root_backup:
+ generation = btrfs_super_generation(disk_super);
+
+ tree_root->node = read_tree_block(tree_root,
+ btrfs_super_root(disk_super),
+ generation);
+ if (!tree_root->node ||
+ !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
+ sb->s_id);
+
+ goto recovery_tree_root;
+ }
+
+ btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+ tree_root->commit_root = btrfs_root_node(tree_root);
+ btrfs_set_root_refs(&tree_root->root_item, 1);
+
+ ret = btrfs_read_roots(fs_info, tree_root);
+ if (ret)
+ goto recovery_tree_root;
+
+ fs_info->generation = generation;
+ fs_info->last_trans_committed = generation;
+
+ ret = btrfs_recover_balance(fs_info);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: failed to recover balance\n");
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_dev_stats(fs_info);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
+ ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_dev_replace(fs_info);
+ if (ret) {
+ pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ btrfs_close_extra_devices(fs_devices, 1);
+
+ ret = btrfs_sysfs_add_one(fs_info);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_space_info(fs_info);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
+ goto fail_sysfs;
+ }
+
+ ret = btrfs_read_block_groups(fs_info->extent_root);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
+ goto fail_sysfs;
+ }
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ if (fs_info->fs_devices->missing_devices >
+ fs_info->num_tolerated_disk_barrier_failures &&
+ !(sb->s_flags & MS_RDONLY)) {
+ printk(KERN_WARNING "BTRFS: "
+ "too many missing devices, writeable mount is not allowed\n");
+ goto fail_sysfs;
+ }
+
+ fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+ "btrfs-cleaner");
+ if (IS_ERR(fs_info->cleaner_kthread))
+ goto fail_sysfs;
+
+ fs_info->transaction_kthread = kthread_run(transaction_kthread,
+ tree_root,
+ "btrfs-transaction");
+ if (IS_ERR(fs_info->transaction_kthread))
+ goto fail_cleaner;
+
+ if (!btrfs_test_opt(tree_root, SSD) &&
+ !btrfs_test_opt(tree_root, NOSSD) &&
+ !fs_info->fs_devices->rotating) {
+ printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
+ "mode\n");
+ btrfs_set_opt(fs_info->mount_opt, SSD);
+ }
+
+ /*
+ * Mount does not set all options immediatelly, we can do it now and do
+ * not have to wait for transaction commit
+ */
+ btrfs_apply_pending_changes(fs_info);
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+ ret = btrfsic_mount(tree_root, fs_devices,
+ btrfs_test_opt(tree_root,
+ CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+ 1 : 0,
+ fs_info->check_integrity_print_mask);
+ if (ret)
+ printk(KERN_WARNING "BTRFS: failed to initialize"
+ " integrity check module %s\n", sb->s_id);
+ }
+#endif
+ ret = btrfs_read_qgroup_config(fs_info);
+ if (ret)
+ goto fail_trans_kthread;
+
+ /* do not make disk changes in broken FS */
+ if (btrfs_super_log_root(disk_super) != 0) {
+ ret = btrfs_replay_log(fs_info, fs_devices);
+ if (ret) {
+ err = ret;
+ goto fail_qgroup;
+ }
+ }
+
+ ret = btrfs_find_orphan_roots(tree_root);
+ if (ret)
+ goto fail_qgroup;
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto fail_qgroup;
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = btrfs_recover_relocation(tree_root);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0) {
+ printk(KERN_WARNING
+ "BTRFS: failed to recover relocation\n");
+ err = -EINVAL;
+ goto fail_qgroup;
+ }
+ }
+
+ location.objectid = BTRFS_FS_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = 0;
+
+ fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ if (IS_ERR(fs_info->fs_root)) {
+ err = PTR_ERR(fs_info->fs_root);
+ goto fail_qgroup;
+ }
+
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
+
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
+ up_read(&fs_info->cleanup_work_sem);
+ close_ctree(tree_root);
+ return ret;
+ }
+ up_read(&fs_info->cleanup_work_sem);
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret) {
+ printk(KERN_WARNING "BTRFS: failed to resume balance\n");
+ close_ctree(tree_root);
+ return ret;
+ }
+
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to resume dev_replace\n");
+ close_ctree(tree_root);
+ return ret;
+ }
+
+ btrfs_qgroup_rescan_resume(fs_info);
+
+ if (!fs_info->uuid_root) {
+ pr_info("BTRFS: creating UUID tree\n");
+ ret = btrfs_create_uuid_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to create the UUID tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
+ fs_info->generation !=
+ btrfs_super_uuid_tree_generation(disk_super)) {
+ pr_info("BTRFS: checking UUID tree\n");
+ ret = btrfs_check_uuid_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to check the UUID tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ } else {
+ fs_info->update_uuid_tree_gen = 1;
+ }
+
+ fs_info->open = 1;
+
+ return 0;
+
+fail_qgroup:
+ btrfs_free_qgroup_config(fs_info);
+fail_trans_kthread:
+ kthread_stop(fs_info->transaction_kthread);
+ btrfs_cleanup_transaction(fs_info->tree_root);
+ btrfs_free_fs_roots(fs_info);
+fail_cleaner:
+ kthread_stop(fs_info->cleaner_kthread);
+
+ /*
+ * make sure we're done with the btree inode before we stop our
+ * kthreads
+ */
+ filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+
+fail_sysfs:
+ btrfs_sysfs_remove_one(fs_info);
+
+fail_block_groups:
+ btrfs_put_block_group_cache(fs_info);
+ btrfs_free_block_groups(fs_info);
+
+fail_tree_roots:
+ free_root_pointers(fs_info, 1);
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+
+fail_sb_buffer:
+ btrfs_stop_all_workers(fs_info);
+fail_alloc:
+fail_iput:
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+ iput(fs_info->btree_inode);
+fail_bio_counter:
+ percpu_counter_destroy(&fs_info->bio_counter);
+fail_delalloc_bytes:
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+fail_dirty_metadata_bytes:
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+fail_bdi:
+ bdi_destroy(&fs_info->bdi);
+fail_srcu:
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
+fail:
+ btrfs_free_stripe_hash_table(fs_info);
+ btrfs_close_devices(fs_info->fs_devices);
+ return err;
+
+recovery_tree_root:
+ if (!btrfs_test_opt(tree_root, RECOVERY))
+ goto fail_tree_roots;
+
+ free_root_pointers(fs_info, 0);
+
+ /* don't use the log in recovery mode, it won't be valid */
+ btrfs_set_super_log_root(disk_super, 0);
+
+ /* we can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ ret = next_root_backup(fs_info, fs_info->super_copy,
+ &num_backups_tried, &backup_index);
+ if (ret == -1)
+ goto fail_block_groups;
+ goto retry_root_backup;
+}
+
+static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ } else {
+ struct btrfs_device *device = (struct btrfs_device *)
+ bh->b_private;
+
+ printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
+ "I/O error on %s\n",
+ rcu_str_deref(device->name));
+ /* note, we dont' set_buffer_write_io_error because we have
+ * our own ways of dealing with the IO errors
+ */
+ clear_buffer_uptodate(bh);
+ btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+{
+ struct buffer_head *bh;
+ struct buffer_head *latest = NULL;
+ struct btrfs_super_block *super;
+ int i;
+ u64 transid = 0;
+ u64 bytenr;
+
+ /* we would like to check all the supers, but that would make
+ * a btrfs mount succeed after a mkfs from a different FS.
+ * So, we need to add a special mount option to scan for
+ * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ */
+ for (i = 0; i < 1; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ i_size_read(bdev->bd_inode))
+ break;
+ bh = __bread(bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!bh)
+ continue;
+
+ super = (struct btrfs_super_block *)bh->b_data;
+ if (btrfs_super_bytenr(super) != bytenr ||
+ btrfs_super_magic(super) != BTRFS_MAGIC) {
+ brelse(bh);
+ continue;
+ }
+
+ if (!latest || btrfs_super_generation(super) > transid) {
+ brelse(latest);
+ latest = bh;
+ transid = btrfs_super_generation(super);
+ } else {
+ brelse(bh);
+ }
+ }
+ return latest;
+}
+
+/*
+ * this should be called twice, once with wait == 0 and
+ * once with wait == 1. When wait == 0 is done, all the buffer heads
+ * we write are pinned.
+ *
+ * They are released when wait == 1 is done.
+ * max_mirrors must be the same for both runs, and it indicates how
+ * many supers on this one device should be written.
+ *
+ * max_mirrors == 0 means to write them all.
+ */
+static int write_dev_supers(struct btrfs_device *device,
+ struct btrfs_super_block *sb,
+ int do_barriers, int wait, int max_mirrors)
+{
+ struct buffer_head *bh;
+ int i;
+ int ret;
+ int errors = 0;
+ u32 crc;
+ u64 bytenr;
+
+ if (max_mirrors == 0)
+ max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+ for (i = 0; i < max_mirrors; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->commit_total_bytes)
+ break;
+
+ if (wait) {
+ bh = __find_get_block(device->bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!bh) {
+ errors++;
+ continue;
+ }
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ errors++;
+
+ /* drop our reference */
+ brelse(bh);
+
+ /* drop the reference from the wait == 0 run */
+ brelse(bh);
+ continue;
+ } else {
+ btrfs_set_super_bytenr(sb, bytenr);
+
+ crc = ~(u32)0;
+ crc = btrfs_csum_data((char *)sb +
+ BTRFS_CSUM_SIZE, crc,
+ BTRFS_SUPER_INFO_SIZE -
+ BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, sb->csum);
+
+ /*
+ * one reference for us, and we leave it for the
+ * caller
+ */
+ bh = __getblk(device->bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!bh) {
+ printk(KERN_ERR "BTRFS: couldn't get super "
+ "buffer head for bytenr %Lu\n", bytenr);
+ errors++;
+ continue;
+ }
+
+ memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+
+ /* one reference for submit_bh */
+ get_bh(bh);
+
+ set_buffer_uptodate(bh);
+ lock_buffer(bh);
+ bh->b_end_io = btrfs_end_buffer_write_sync;
+ bh->b_private = device;
+ }
+
+ /*
+ * we fua the first super. The others we allow
+ * to go down lazy.
+ */
+ if (i == 0)
+ ret = btrfsic_submit_bh(WRITE_FUA, bh);
+ else
+ ret = btrfsic_submit_bh(WRITE_SYNC, bh);
+ if (ret)
+ errors++;
+ }
+ return errors < i ? 0 : -1;
+}
+
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ }
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * trigger flushes for one the devices. If you pass wait == 0, the flushes are
+ * sent down. With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp are flagged as not-barrier
+ * capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+ struct bio *bio;
+ int ret = 0;
+
+ if (device->nobarriers)
+ return 0;
+
+ if (wait) {
+ bio = device->flush_bio;
+ if (!bio)
+ return 0;
+
+ wait_for_completion(&device->flush_wait);
+
+ if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+ printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
+ rcu_str_deref(device->name));
+ device->nobarriers = 1;
+ } else if (!bio_flagged(bio, BIO_UPTODATE)) {
+ ret = -EIO;
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
+ }
+
+ /* drop the reference from the wait == 0 run */
+ bio_put(bio);
+ device->flush_bio = NULL;
+
+ return ret;
+ }
+
+ /*
+ * one reference for us, and we leave it for the
+ * caller
+ */
+ device->flush_bio = NULL;
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_end_io = btrfs_end_empty_barrier;
+ bio->bi_bdev = device->bdev;
+ init_completion(&device->flush_wait);
+ bio->bi_private = &device->flush_wait;
+ device->flush_bio = bio;
+
+ bio_get(bio);
+ btrfsic_submit_bio(WRITE_FLUSH, bio);
+
+ return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ int errors_send = 0;
+ int errors_wait = 0;
+ int ret;
+
+ /* send down all the barriers */
+ head = &info->fs_devices->devices;
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
+ if (!dev->bdev) {
+ errors_send++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 0);
+ if (ret)
+ errors_send++;
+ }
+
+ /* wait for all the barriers */
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
+ if (!dev->bdev) {
+ errors_wait++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_flush(dev, 1);
+ if (ret)
+ errors_wait++;
+ }
+ if (errors_send > info->num_tolerated_disk_barrier_failures ||
+ errors_wait > info->num_tolerated_disk_barrier_failures)
+ return -EIO;
+ return 0;
+}
+
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ioctl_space_info space;
+ struct btrfs_space_info *sinfo;
+ u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+ int num_types = 4;
+ int i;
+ int c;
+ int num_tolerated_disk_barrier_failures =
+ (int)fs_info->fs_devices->num_devices;
+
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ sinfo = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+ if (tmp->flags == types[i]) {
+ sinfo = tmp;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!sinfo)
+ continue;
+
+ down_read(&sinfo->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&sinfo->block_groups[c])) {
+ u64 flags;
+
+ btrfs_get_block_group_info(
+ &sinfo->block_groups[c], &space);
+ if (space.total_bytes == 0 ||
+ space.used_bytes == 0)
+ continue;
+ flags = space.flags;
+ /*
+ * return
+ * 0: if dup, single or RAID0 is configured for
+ * any of metadata, system or data, else
+ * 1: if RAID5 is configured, or if RAID1 or
+ * RAID10 is configured and only two mirrors
+ * are used, else
+ * 2: if RAID6 is configured, else
+ * num_mirrors - 1: if RAID1 or RAID10 is
+ * configured and more than
+ * 2 mirrors are used.
+ */
+ if (num_tolerated_disk_barrier_failures > 0 &&
+ ((flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID0)) ||
+ ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+ == 0)))
+ num_tolerated_disk_barrier_failures = 0;
+ else if (num_tolerated_disk_barrier_failures > 1) {
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ num_tolerated_disk_barrier_failures = 1;
+ } else if (flags &
+ BTRFS_BLOCK_GROUP_RAID6) {
+ num_tolerated_disk_barrier_failures = 2;
+ }
+ }
+ }
+ }
+ up_read(&sinfo->groups_sem);
+ }
+
+ return num_tolerated_disk_barrier_failures;
+}
+
+static int write_all_supers(struct btrfs_root *root, int max_mirrors)
+{
+ struct list_head *head;
+ struct btrfs_device *dev;
+ struct btrfs_super_block *sb;
+ struct btrfs_dev_item *dev_item;
+ int ret;
+ int do_barriers;
+ int max_errors;
+ int total_errors = 0;
+ u64 flags;
+
+ do_barriers = !btrfs_test_opt(root, NOBARRIER);
+ backup_super_roots(root->fs_info);
+
+ sb = root->fs_info->super_for_commit;
+ dev_item = &sb->dev_item;
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ head = &root->fs_info->fs_devices->devices;
+ max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+
+ if (do_barriers) {
+ ret = barrier_all_devices(root->fs_info);
+ if (ret) {
+ mutex_unlock(
+ &root->fs_info->fs_devices->device_list_mutex);
+ btrfs_error(root->fs_info, ret,
+ "errors while submitting device barriers.");
+ return ret;
+ }
+ }
+
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (!dev->bdev) {
+ total_errors++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ btrfs_set_stack_device_generation(dev_item, 0);
+ btrfs_set_stack_device_type(dev_item, dev->type);
+ btrfs_set_stack_device_id(dev_item, dev->devid);
+ btrfs_set_stack_device_total_bytes(dev_item,
+ dev->commit_total_bytes);
+ btrfs_set_stack_device_bytes_used(dev_item,
+ dev->commit_bytes_used);
+ btrfs_set_stack_device_io_align(dev_item, dev->io_align);
+ btrfs_set_stack_device_io_width(dev_item, dev->io_width);
+ btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
+ memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+ memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
+
+ flags = btrfs_super_flags(sb);
+ btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
+
+ ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
+ if (ret)
+ total_errors++;
+ }
+ if (total_errors > max_errors) {
+ btrfs_err(root->fs_info, "%d errors while writing supers",
+ total_errors);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ /* FUA is masked off if unsupported and can't be the reason */
+ btrfs_error(root->fs_info, -EIO,
+ "%d errors while writing supers", total_errors);
+ return -EIO;
+ }
+
+ total_errors = 0;
+ list_for_each_entry_rcu(dev, head, dev_list) {
+ if (!dev->bdev)
+ continue;
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
+ if (ret)
+ total_errors++;
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ if (total_errors > max_errors) {
+ btrfs_error(root->fs_info, -EIO,
+ "%d errors while writing supers", total_errors);
+ return -EIO;
+ }
+ return 0;
+}
+
+int write_ctree_super(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int max_mirrors)
+{
+ return write_all_supers(root, max_mirrors);
+}
+
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_delete(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ synchronize_srcu(&fs_info->subvol_srcu);
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ btrfs_free_log(NULL, root);
+
+ if (root->free_ino_pinned)
+ __btrfs_remove_free_space_cache(root->free_ino_pinned);
+ if (root->free_ino_ctl)
+ __btrfs_remove_free_space_cache(root->free_ino_ctl);
+ free_fs_root(root);
+}
+
+static void free_fs_root(struct btrfs_root *root)
+{
+ iput(root->ino_cache_inode);
+ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ root->orphan_block_rsv = NULL;
+ if (root->anon_dev)
+ free_anon_bdev(root->anon_dev);
+ if (root->subv_writers)
+ btrfs_free_subvolume_writers(root->subv_writers);
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
+ kfree(root->name);
+ btrfs_put_fs_root(root);
+}
+
+void btrfs_free_fs_root(struct btrfs_root *root)
+{
+ free_fs_root(root);
+}
+
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ u64 root_objectid = 0;
+ struct btrfs_root *gang[8];
+ int i = 0;
+ int err = 0;
+ unsigned int ret = 0;
+ int index;
+
+ while (1) {
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang));
+ if (!ret) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ break;
+ }
+ root_objectid = gang[ret - 1]->root_key.objectid + 1;
+
+ for (i = 0; i < ret; i++) {
+ /* Avoid to grab roots in dead_roots */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* grab all the search result for later use */
+ gang[i] = btrfs_grab_fs_root(gang[i]);
+ }
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
+ root_objectid = gang[i]->root_key.objectid;
+ err = btrfs_orphan_cleanup(gang[i]);
+ if (err)
+ break;
+ btrfs_put_fs_root(gang[i]);
+ }
+ root_objectid++;
+ }
+
+ /* release the uncleaned roots due to error */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_fs_root(gang[i]);
+ }
+ return err;
+}
+
+int btrfs_commit_super(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ wake_up_process(root->fs_info->cleaner_kthread);
+
+ /* wait until ongoing cleanup work done */
+ down_write(&root->fs_info->cleanup_work_sem);
+ up_write(&root->fs_info->cleanup_work_sem);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ return btrfs_commit_transaction(trans, root);
+}
+
+void close_ctree(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ fs_info->closing = 1;
+ smp_mb();
+
+ /* wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* avoid complains from lockdep et al., set sem back to initial state */
+ up(&fs_info->uuid_tree_rescan_sem);
+
+ /* pause restriper - we want to resume on mount */
+ btrfs_pause_balance(fs_info);
+
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+
+ btrfs_scrub_cancel(fs_info);
+
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+
+ /* clear out the rbtree of defraggable inodes */
+ btrfs_cleanup_defrag_inodes(fs_info);
+
+ cancel_work_sync(&fs_info->async_reclaim_work);
+
+ if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_commit_super(root);
+ if (ret)
+ btrfs_err(fs_info, "commit super ret %d", ret);
+ }
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ btrfs_error_commit_super(root);
+
+ kthread_stop(fs_info->transaction_kthread);
+ kthread_stop(fs_info->cleaner_kthread);
+
+ fs_info->closing = 2;
+ smp_mb();
+
+ btrfs_free_qgroup_config(fs_info);
+
+ if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+ btrfs_info(fs_info, "at unmount delalloc count %lld",
+ percpu_counter_sum(&fs_info->delalloc_bytes));
+ }
+
+ btrfs_sysfs_remove_one(fs_info);
+
+ btrfs_free_fs_roots(fs_info);
+
+ btrfs_put_block_group_cache(fs_info);
+
+ btrfs_free_block_groups(fs_info);
+
+ /*
+ * we must make sure there is not any read request to
+ * submit after we stopping all workers.
+ */
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+ btrfs_stop_all_workers(fs_info);
+
+ fs_info->open = 0;
+ free_root_pointers(fs_info, 1);
+
+ iput(fs_info->btree_inode);
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ btrfsic_unmount(root, fs_info->fs_devices);
+#endif
+
+ btrfs_close_devices(fs_info->fs_devices);
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->bio_counter);
+ bdi_destroy(&fs_info->bdi);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
+
+ btrfs_free_stripe_hash_table(fs_info);
+
+ __btrfs_free_block_rsv(root->orphan_block_rsv);
+ root->orphan_block_rsv = NULL;
+
+ lock_chunks(root);
+ while (!list_empty(&fs_info->pinned_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&fs_info->pinned_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
+ unlock_chunks(root);
+}
+
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+ int atomic)
+{
+ int ret;
+ struct inode *btree_inode = buf->pages[0]->mapping->host;
+
+ ret = extent_buffer_uptodate(buf);
+ if (!ret)
+ return ret;
+
+ ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
+ parent_transid, atomic);
+ if (ret == -EAGAIN)
+ return ret;
+ return !ret;
+}
+
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
+{
+ return set_extent_buffer_uptodate(buf);
+}
+
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+{
+ struct btrfs_root *root;
+ u64 transid = btrfs_header_generation(buf);
+ int was_dirty;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ /*
+ * This is a fast path so only do this check if we have sanity tests
+ * enabled. Normal people shouldn't be marking dummy buffers as dirty
+ * outside of the sanity tests.
+ */
+ if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
+ return;
+#endif
+ root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+ btrfs_assert_tree_locked(buf);
+ if (transid != root->fs_info->generation)
+ WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
+ "found %llu running %llu\n",
+ buf->start, transid, root->fs_info->generation);
+ was_dirty = set_extent_buffer_dirty(buf);
+ if (!was_dirty)
+ __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+ buf->len,
+ root->fs_info->dirty_metadata_batch);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ btrfs_print_leaf(root, buf);
+ ASSERT(0);
+ }
+#endif
+}
+
+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+ int flush_delayed)
+{
+ /*
+ * looks as though older kernels can get into trouble with
+ * this code, they end up stuck in balance_dirty_pages forever
+ */
+ int ret;
+
+ if (current->flags & PF_MEMALLOC)
+ return;
+
+ if (flush_delayed)
+ btrfs_balance_delayed_items(root);
+
+ ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+ BTRFS_DIRTY_METADATA_THRESH);
+ if (ret > 0) {
+ balance_dirty_pages_ratelimited(
+ root->fs_info->btree_inode->i_mapping);
+ }
+ return;
+}
+
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
+{
+ __btrfs_btree_balance_dirty(root, 1);
+}
+
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+{
+ __btrfs_btree_balance_dirty(root, 0);
+}
+
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+{
+ struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+ return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+}
+
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only)
+{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ int ret = 0;
+
+ if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
+ btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
+ btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n",
+ btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+
+ /*
+ * The common minimum, we don't know if we can trust the nodesize/sectorsize
+ * items yet, they'll be verified later. Issue just a warning.
+ */
+ if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ btrfs_super_root(sb));
+ if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
+ printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+ btrfs_super_chunk_root(sb));
+ if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
+ printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
+ btrfs_super_log_root(sb));
+
+ /*
+ * Check the lower bound, the alignment and other constraints are
+ * checked later.
+ */
+ if (btrfs_super_nodesize(sb) < 4096) {
+ printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
+ btrfs_super_nodesize(sb));
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sectorsize(sb) < 4096) {
+ printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
+ btrfs_super_sectorsize(sb));
+ ret = -EINVAL;
+ }
+
+ if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+ printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+ fs_info->fsid, sb->dev_item.fsid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+ * done later
+ */
+ if (btrfs_super_num_devices(sb) > (1UL << 31))
+ printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+ btrfs_super_num_devices(sb));
+ if (btrfs_super_num_devices(sb) == 0) {
+ printk(KERN_ERR "BTRFS: number of devices is 0\n");
+ ret = -EINVAL;
+ }
+
+ if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
+ printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+ btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Obvious sys_chunk_array corruptions, it must hold at least one key
+ * and one chunk
+ */
+ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
+ btrfs_super_sys_array_size(sb),
+ BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk)) {
+ printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
+ btrfs_super_sys_array_size(sb),
+ sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk));
+ ret = -EINVAL;
+ }
+
+ /*
+ * The generation is a global counter, we'll trust it more than the others
+ * but it's still possible that it's the one that's wrong.
+ */
+ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+ btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb));
+ if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+ && btrfs_super_cache_generation(sb) != (u64)-1)
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+ btrfs_super_generation(sb), btrfs_super_cache_generation(sb));
+
+ return ret;
+}
+
+static void btrfs_error_commit_super(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ down_write(&root->fs_info->cleanup_work_sem);
+ up_write(&root->fs_info->cleanup_work_sem);
+
+ /* cleanup FS via transaction */
+ btrfs_cleanup_transaction(root);
+}
+
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&root->ordered_extent_lock);
+ /*
+ * This will just short circuit the ordered completion stuff which will
+ * make sure the ordered extent gets properly cleaned up.
+ */
+ list_for_each_entry(ordered, &root->ordered_extents,
+ root_extent_list)
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+ spin_unlock(&root->ordered_extent_lock);
+}
+
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ list_move_tail(&root->ordered_root,
+ &fs_info->ordered_roots);
+
+ spin_unlock(&fs_info->ordered_root_lock);
+ btrfs_destroy_ordered_extents(root);
+
+ cond_resched();
+ spin_lock(&fs_info->ordered_root_lock);
+ }
+ spin_unlock(&fs_info->ordered_root_lock);
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ int ret = 0;
+
+ delayed_refs = &trans->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ if (atomic_read(&delayed_refs->num_entries) == 0) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_info(root->fs_info, "delayed_refs has NO entry");
+ return ret;
+ }
+
+ while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+ struct btrfs_delayed_ref_head *head;
+ bool pin_bytes = false;
+
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ spin_lock(&delayed_refs->lock);
+ continue;
+ }
+ spin_lock(&head->lock);
+ while ((node = rb_first(&head->ref_root)) != NULL) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &head->ref_root);
+ atomic_dec(&delayed_refs->num_entries);
+ btrfs_put_delayed_ref(ref);
+ }
+ if (head->must_insert_reserved)
+ pin_bytes = true;
+ btrfs_free_delayed_extent_op(head->extent_op);
+ delayed_refs->num_heads--;
+ if (head->processing == 0)
+ delayed_refs->num_heads_ready--;
+ atomic_dec(&delayed_refs->num_entries);
+ head->node.in_tree = 0;
+ rb_erase(&head->href_node, &delayed_refs->href_root);
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+ mutex_unlock(&head->mutex);
+
+ if (pin_bytes)
+ btrfs_pin_extent(root, head->node.bytenr,
+ head->node.num_bytes, 1);
+ btrfs_put_delayed_ref(&head->node);
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+
+ spin_unlock(&delayed_refs->lock);
+
+ return ret;
+}
+
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
+
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+ delalloc_inodes);
+
+ list_del_init(&btrfs_inode->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &btrfs_inode->runtime_flags);
+ spin_unlock(&root->delalloc_lock);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
+
+ spin_lock(&root->delalloc_lock);
+ }
+
+ spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice)) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ list_del_init(&root->delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ btrfs_destroy_delalloc_inodes(root);
+ btrfs_put_fs_root(root);
+
+ spin_lock(&fs_info->delalloc_root_lock);
+ }
+ spin_unlock(&fs_info->delalloc_root_lock);
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark)
+{
+ int ret;
+ struct extent_buffer *eb;
+ u64 start = 0;
+ u64 end;
+
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, NULL);
+ if (ret)
+ break;
+
+ clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+ while (start <= end) {
+ eb = btrfs_find_tree_block(root->fs_info, start);
+ start += root->nodesize;
+ if (!eb)
+ continue;
+ wait_on_extent_buffer_writeback(eb);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+ &eb->bflags))
+ clear_extent_buffer_dirty(eb);
+ free_extent_buffer_stale(eb);
+ }
+ }
+
+ return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents)
+{
+ struct extent_io_tree *unpin;
+ u64 start;
+ u64 end;
+ int ret;
+ bool loop = true;
+
+ unpin = pinned_extents;
+again:
+ while (1) {
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, NULL);
+ if (ret)
+ break;
+
+ clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ btrfs_error_unpin_extent_range(root, start, end);
+ cond_resched();
+ }
+
+ if (loop) {
+ if (unpin == &root->fs_info->freed_extents[0])
+ unpin = &root->fs_info->freed_extents[1];
+ else
+ unpin = &root->fs_info->freed_extents[0];
+ loop = false;
+ goto again;
+ }
+
+ return 0;
+}
+
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&cur_trans->pending_ordered)) {
+ ordered = list_first_entry(&cur_trans->pending_ordered,
+ struct btrfs_ordered_extent,
+ trans_list);
+ list_del_init(&ordered->trans_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_put_ordered_extent(ordered);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
+ struct btrfs_root *root)
+{
+ btrfs_destroy_delayed_refs(cur_trans, root);
+
+ cur_trans->state = TRANS_STATE_COMMIT_START;
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
+ wake_up(&root->fs_info->transaction_wait);
+
+ btrfs_free_pending_ordered(cur_trans, root->fs_info);
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
+
+ btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
+ EXTENT_DIRTY);
+ btrfs_destroy_pinned_extent(root,
+ root->fs_info->pinned_extents);
+
+ cur_trans->state =TRANS_STATE_COMPLETED;
+ wake_up(&cur_trans->commit_wait);
+
+ /*
+ memset(cur_trans, 0, sizeof(*cur_trans));
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ */
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+ struct btrfs_transaction *t;
+
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+ spin_lock(&root->fs_info->trans_lock);
+ while (!list_empty(&root->fs_info->trans_list)) {
+ t = list_first_entry(&root->fs_info->trans_list,
+ struct btrfs_transaction, list);
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ atomic_inc(&t->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+ btrfs_wait_for_commit(root, t->transid);
+ btrfs_put_transaction(t);
+ spin_lock(&root->fs_info->trans_lock);
+ continue;
+ }
+ if (t == root->fs_info->running_transaction) {
+ t->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&root->fs_info->trans_lock);
+ /*
+ * We wait for 0 num_writers since we don't hold a trans
+ * handle open currently for this transaction.
+ */
+ wait_event(t->writer_wait,
+ atomic_read(&t->num_writers) == 0);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
+ }
+ btrfs_cleanup_one_transaction(t, root);
+
+ spin_lock(&root->fs_info->trans_lock);
+ if (t == root->fs_info->running_transaction)
+ root->fs_info->running_transaction = NULL;
+ list_del_init(&t->list);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ btrfs_put_transaction(t);
+ trace_btrfs_transaction_commit(root);
+ spin_lock(&root->fs_info->trans_lock);
+ }
+ spin_unlock(&root->fs_info->trans_lock);
+ btrfs_destroy_all_ordered_extents(root->fs_info);
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
+ btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
+ btrfs_destroy_all_delalloc_inodes(root->fs_info);
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+
+ return 0;
+}
+
+static const struct extent_io_ops btree_extent_io_ops = {
+ .readpage_end_io_hook = btree_readpage_end_io_hook,
+ .readpage_io_failed_hook = btree_io_failed_hook,
+ .submit_bio_hook = btree_submit_bio_hook,
+ /* note we're sharing with inode.c for the merge bio hook */
+ .merge_bio_hook = btrfs_merge_bio_hook,
+};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000..d4cbfeeee
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __DISKIO__
+#define __DISKIO__
+
+#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+#define BTRFS_SUPER_INFO_SIZE 4096
+
+#define BTRFS_SUPER_MIRROR_MAX 3
+#define BTRFS_SUPER_MIRROR_SHIFT 12
+
+enum btrfs_wq_endio_type {
+ BTRFS_WQ_ENDIO_DATA = 0,
+ BTRFS_WQ_ENDIO_METADATA = 1,
+ BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+ BTRFS_WQ_ENDIO_RAID56 = 3,
+ BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
+};
+
+static inline u64 btrfs_sb_offset(int mirror)
+{
+ u64 start = 16 * 1024;
+ if (mirror)
+ return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
+ return BTRFS_SUPER_INFO_OFFSET;
+}
+
+struct btrfs_device;
+struct btrfs_fs_devices;
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+ u64 parent_transid);
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
+ int mirror_num, struct extent_buffer **eb);
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+ u64 bytenr);
+void clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options);
+void close_ctree(struct btrfs_root *root);
+int write_ctree_super(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int max_mirrors);
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_commit_super(struct btrfs_root *root);
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+ struct btrfs_key *location);
+int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key,
+ bool check_ref);
+static inline struct btrfs_root *
+btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location)
+{
+ return btrfs_get_fs_root(fs_info, location, true);
+}
+
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
+void btrfs_btree_balance_dirty(struct btrfs_root *root);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_root *root);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_root *btrfs_alloc_dummy_root(void);
+#endif
+
+/*
+ * This function is used to grab the root, and avoid it is freed when we
+ * access it. But it doesn't ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * fs_info->subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+ if (atomic_inc_not_zero(&root->refs))
+ return root;
+ return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+ if (atomic_dec_and_test(&root->refs))
+ kfree(root);
+}
+
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+ int atomic);
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
+u32 btrfs_csum_data(char *data, u32 seed, size_t len);
+void btrfs_csum_final(u32 crc, char *result);
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+ enum btrfs_wq_endio_type metadata);
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+ int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags, u64 bio_offset,
+ extent_submit_bio_hook_t *submit_bio_start,
+ extent_submit_bio_hook_t *submit_bio_done);
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+int btrfs_write_tree_block(struct extent_buffer *buf);
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid);
+int btree_lock_page_hook(struct page *page, void *data,
+ void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+ struct btrfs_fs_info *fs_info);
+int __init btrfs_end_io_wq_init(void);
+void btrfs_end_io_wq_exit(void);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_init_lockdep(void);
+void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_init_lockdep(void)
+{ }
+static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level)
+{
+}
+#endif
+#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000..8d052209f
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,304 @@
+#include <linux/fs.h>
+#include <linux/types.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "export.h"
+
+#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
+ parent_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
+ parent_root_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
+
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *)fh;
+ int len = *max_len;
+ int type;
+
+ if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+ *max_len = BTRFS_FID_SIZE_CONNECTABLE;
+ return FILEID_INVALID;
+ } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+ *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+ return FILEID_INVALID;
+ }
+
+ len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+ type = FILEID_BTRFS_WITHOUT_PARENT;
+
+ fid->objectid = btrfs_ino(inode);
+ fid->root_objectid = BTRFS_I(inode)->root->objectid;
+ fid->gen = inode->i_generation;
+
+ if (parent) {
+ u64 parent_root_id;
+
+ fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+ fid->parent_gen = parent->i_generation;
+ parent_root_id = BTRFS_I(parent)->root->objectid;
+
+ if (parent_root_id != fid->root_objectid) {
+ fid->parent_root_objectid = parent_root_id;
+ len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+ type = FILEID_BTRFS_WITH_PARENT_ROOT;
+ } else {
+ len = BTRFS_FID_SIZE_CONNECTABLE;
+ type = FILEID_BTRFS_WITH_PARENT;
+ }
+ }
+
+ *max_len = len;
+ return type;
+}
+
+static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u32 generation,
+ int check_generation)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root;
+ struct inode *inode;
+ struct btrfs_key key;
+ int index;
+ int err = 0;
+
+ if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+ return ERR_PTR(-ESTALE);
+
+ key.objectid = root_objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto fail;
+ }
+
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(sb, &key, root, NULL);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto fail;
+ }
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ if (check_generation && generation != inode->i_generation) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+
+ return d_obtain_alias(inode);
+fail:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return ERR_PTR(err);
+}
+
+static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+ u64 objectid, root_objectid;
+ u32 generation;
+
+ if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+ if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
+ return NULL;
+ root_objectid = fid->root_objectid;
+ } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
+ if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+ return NULL;
+ root_objectid = fid->parent_root_objectid;
+ } else
+ return NULL;
+
+ objectid = fid->parent_objectid;
+ generation = fid->parent_gen;
+
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+}
+
+static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+ u64 objectid, root_objectid;
+ u32 generation;
+
+ if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
+ fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
+ (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
+ fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
+ (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
+ fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
+ return NULL;
+
+ objectid = fid->objectid;
+ root_objectid = fid->root_objectid;
+ generation = fid->gen;
+
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
+}
+
+static struct dentry *btrfs_get_parent(struct dentry *child)
+{
+ struct inode *dir = d_inode(child);
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_root_ref *ref;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+ root = root->fs_info->tree_root;
+ } else {
+ key.objectid = btrfs_ino(dir);
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+ }
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto fail;
+
+ BUG_ON(ret == 0); /* Key with offset of -1 found */
+ if (path->slots[0] == 0) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != key.objectid || found_key.type != key.type) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ key.objectid = btrfs_root_ref_dirid(leaf, ref);
+ } else {
+ key.objectid = found_key.offset;
+ }
+ btrfs_free_path(path);
+
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+ found_key.offset, 0, 0);
+ }
+
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
+fail:
+ btrfs_free_path(path);
+ return ERR_PTR(ret);
+}
+
+static int btrfs_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct inode *inode = d_inode(child);
+ struct inode *dir = d_inode(parent);
+ struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_root_ref *rref;
+ struct extent_buffer *leaf;
+ unsigned long name_ptr;
+ struct btrfs_key key;
+ int name_len;
+ int ret;
+ u64 ino;
+
+ if (!dir || !inode)
+ return -EINVAL;
+
+ if (!S_ISDIR(dir->i_mode))
+ return -EINVAL;
+
+ ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+ root = root->fs_info->tree_root;
+ } else {
+ key.objectid = ino;
+ key.offset = btrfs_ino(dir);
+ key.type = BTRFS_INODE_REF_KEY;
+ }
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ btrfs_free_path(path);
+ return ret;
+ } else if (ret > 0) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ path->slots[0]--;
+ } else {
+ btrfs_free_path(path);
+ return -ENOENT;
+ }
+ }
+ leaf = path->nodes[0];
+
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ rref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ name_ptr = (unsigned long)(rref + 1);
+ name_len = btrfs_root_ref_name_len(leaf, rref);
+ } else {
+ iref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_ref);
+ name_ptr = (unsigned long)(iref + 1);
+ name_len = btrfs_inode_ref_name_len(leaf, iref);
+ }
+
+ read_extent_buffer(leaf, name, name_ptr, name_len);
+ btrfs_free_path(path);
+
+ /*
+ * have to add the null termination to make sure that reconnect_path
+ * gets the right len for strlen
+ */
+ name[name_len] = '\0';
+
+ return 0;
+}
+
+const struct export_operations btrfs_export_ops = {
+ .encode_fh = btrfs_encode_fh,
+ .fh_to_dentry = btrfs_fh_to_dentry,
+ .fh_to_parent = btrfs_fh_to_parent,
+ .get_parent = btrfs_get_parent,
+ .get_name = btrfs_get_name,
+};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000..074348a95
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
+#ifndef BTRFS_EXPORT_H
+#define BTRFS_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations btrfs_export_ops;
+
+struct btrfs_fid {
+ u64 objectid;
+ u64 root_objectid;
+ u32 gen;
+
+ u64 parent_objectid;
+ u32 parent_gen;
+
+ u64 parent_root_objectid;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000..0ec3acd14
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,10174 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+#include <linux/rcupdate.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
+#include "hash.h"
+#include "tree-log.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "locking.h"
+#include "free-space-cache.h"
+#include "math.h"
+#include "sysfs.h"
+#include "qgroup.h"
+
+#undef SCRAMBLE_DELAYED_REFS
+
+/*
+ * control flags for do_chunk_alloc's force field
+ * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
+ * if we really need one.
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one
+ * if we have very few chunks already allocated. This is
+ * used as part of the clustering code to help make sure
+ * we have a good pool of storage to cluster in, without
+ * filling the FS with empty chunks
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ */
+enum {
+ CHUNK_ALLOC_NO_FORCE = 0,
+ CHUNK_ALLOC_LIMITED = 1,
+ CHUNK_ALLOC_FORCE = 2,
+};
+
+/*
+ * Control how reservations are dealt with.
+ *
+ * RESERVE_FREE - freeing a reservation.
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
+ * ENOSPC accounting
+ * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
+ * bytes_may_use as the ENOSPC accounting is done elsewhere
+ */
+enum {
+ RESERVE_FREE = 0,
+ RESERVE_ALLOC = 1,
+ RESERVE_ALLOC_NO_ACCOUNT = 2,
+};
+
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, int alloc);
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner_objectid,
+ u64 owner_offset, int refs_to_drop,
+ struct btrfs_delayed_extent_op *extra_op,
+ int no_quota);
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_item *ei);
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ u64 flags, u64 owner, u64 offset,
+ struct btrfs_key *ins, int ref_mod);
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ u64 flags, struct btrfs_disk_key *key,
+ int level, struct btrfs_key *ins,
+ int no_quota);
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 flags,
+ int force);
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key);
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups);
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve,
+ int delalloc);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved);
+
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+ smp_mb();
+ return cache->cached == BTRFS_CACHE_FINISHED ||
+ cache->cached == BTRFS_CACHE_ERROR;
+}
+
+static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+{
+ return (cache->flags & bits) == bits;
+}
+
+static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+{
+ atomic_inc(&cache->count);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+{
+ if (atomic_dec_and_test(&cache->count)) {
+ WARN_ON(cache->pinned > 0);
+ WARN_ON(cache->reserved > 0);
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+ }
+}
+
+/*
+ * this adds the block group to the fs_info rb tree for the block group
+ * cache
+ */
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct btrfs_block_group_cache *cache;
+
+ spin_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_node;
+
+ while (*p) {
+ parent = *p;
+ cache = rb_entry(parent, struct btrfs_block_group_cache,
+ cache_node);
+ if (block_group->key.objectid < cache->key.objectid) {
+ p = &(*p)->rb_left;
+ } else if (block_group->key.objectid > cache->key.objectid) {
+ p = &(*p)->rb_right;
+ } else {
+ spin_unlock(&info->block_group_cache_lock);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&block_group->cache_node, parent, p);
+ rb_insert_color(&block_group->cache_node,
+ &info->block_group_cache_tree);
+
+ if (info->first_logical_byte > block_group->key.objectid)
+ info->first_logical_byte = block_group->key.objectid;
+
+ spin_unlock(&info->block_group_cache_lock);
+
+ return 0;
+}
+
+/*
+ * This will return the block group at or after bytenr if contains is 0, else
+ * it will return the block group that contains the bytenr
+ */
+static struct btrfs_block_group_cache *
+block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
+ int contains)
+{
+ struct btrfs_block_group_cache *cache, *ret = NULL;
+ struct rb_node *n;
+ u64 end, start;
+
+ spin_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_node;
+
+ while (n) {
+ cache = rb_entry(n, struct btrfs_block_group_cache,
+ cache_node);
+ end = cache->key.objectid + cache->key.offset - 1;
+ start = cache->key.objectid;
+
+ if (bytenr < start) {
+ if (!contains && (!ret || start < ret->key.objectid))
+ ret = cache;
+ n = n->rb_left;
+ } else if (bytenr > start) {
+ if (contains && bytenr <= end) {
+ ret = cache;
+ break;
+ }
+ n = n->rb_right;
+ } else {
+ ret = cache;
+ break;
+ }
+ }
+ if (ret) {
+ btrfs_get_block_group(ret);
+ if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+ info->first_logical_byte = ret->key.objectid;
+ }
+ spin_unlock(&info->block_group_cache_lock);
+
+ return ret;
+}
+
+static int add_excluded_extent(struct btrfs_root *root,
+ u64 start, u64 num_bytes)
+{
+ u64 end = start + num_bytes - 1;
+ set_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ set_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ return 0;
+}
+
+static void free_excluded_extents(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ u64 start, end;
+
+ start = cache->key.objectid;
+ end = start + cache->key.offset - 1;
+
+ clear_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ clear_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+}
+
+static int exclude_super_stripes(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ u64 bytenr;
+ u64 *logical;
+ int stripe_len;
+ int i, nr, ret;
+
+ if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
+ stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+ cache->bytes_super += stripe_len;
+ ret = add_excluded_extent(root, cache->key.objectid,
+ stripe_len);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+ cache->key.objectid, bytenr,
+ 0, &logical, &nr, &stripe_len);
+ if (ret)
+ return ret;
+
+ while (nr--) {
+ u64 start, len;
+
+ if (logical[nr] > cache->key.objectid +
+ cache->key.offset)
+ continue;
+
+ if (logical[nr] + stripe_len <= cache->key.objectid)
+ continue;
+
+ start = logical[nr];
+ if (start < cache->key.objectid) {
+ start = cache->key.objectid;
+ len = (logical[nr] + stripe_len) - start;
+ } else {
+ len = min_t(u64, stripe_len,
+ cache->key.objectid +
+ cache->key.offset - start);
+ }
+
+ cache->bytes_super += len;
+ ret = add_excluded_extent(root, start, len);
+ if (ret) {
+ kfree(logical);
+ return ret;
+ }
+ }
+
+ kfree(logical);
+ }
+ return 0;
+}
+
+static struct btrfs_caching_control *
+get_caching_control(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *ctl;
+
+ spin_lock(&cache->lock);
+ if (!cache->caching_ctl) {
+ spin_unlock(&cache->lock);
+ return NULL;
+ }
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ spin_unlock(&cache->lock);
+ return ctl;
+}
+
+static void put_caching_control(struct btrfs_caching_control *ctl)
+{
+ if (atomic_dec_and_test(&ctl->count))
+ kfree(ctl);
+}
+
+/*
+ * this is only called by cache_block_group, since we could have freed extents
+ * we need to check the pinned_extents for any extents that can't be used yet
+ * since their free space will be released as soon as the transaction commits.
+ */
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end)
+{
+ u64 extent_start, extent_end, size, total_added = 0;
+ int ret;
+
+ while (start < end) {
+ ret = find_first_extent_bit(info->pinned_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY | EXTENT_UPTODATE,
+ NULL);
+ if (ret)
+ break;
+
+ if (extent_start <= start) {
+ start = extent_end + 1;
+ } else if (extent_start > start && extent_start < end) {
+ size = extent_start - start;
+ total_added += size;
+ ret = btrfs_add_free_space(block_group, start,
+ size);
+ BUG_ON(ret); /* -ENOMEM or logic error */
+ start = extent_end + 1;
+ } else {
+ break;
+ }
+ }
+
+ if (start < end) {
+ size = end - start;
+ total_added += size;
+ ret = btrfs_add_free_space(block_group, start, size);
+ BUG_ON(ret); /* -ENOMEM or logic error */
+ }
+
+ return total_added;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_root *extent_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 total_found = 0;
+ u64 last = 0;
+ u32 nritems;
+ int ret = -ENOMEM;
+
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ extent_root = fs_info->extent_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+
+ last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+
+ /*
+ * We don't want to deadlock with somebody trying to allocate a new
+ * extent for the extent root while also trying to search the extent
+ * root to add free space. So we skip locking and search the commit
+ * root, since its read-only
+ */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = 1;
+
+ key.objectid = last;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+again:
+ mutex_lock(&caching_ctl->mutex);
+ /* need to make sure the commit_root doesn't disappear */
+ down_read(&fs_info->commit_root_sem);
+
+next:
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ while (1) {
+ if (btrfs_fs_closing(fs_info) > 1) {
+ last = (u64)-1;
+ break;
+ }
+
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ } else {
+ ret = find_next_key(path, 0, &key);
+ if (ret)
+ break;
+
+ if (need_resched() ||
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
+ caching_ctl->progress = last;
+ btrfs_release_path(path);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&caching_ctl->mutex);
+ cond_resched();
+ goto again;
+ }
+
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
+ }
+
+ if (key.objectid < last) {
+ key.objectid = last;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ caching_ctl->progress = last;
+ btrfs_release_path(path);
+ goto next;
+ }
+
+ if (key.objectid < block_group->key.objectid) {
+ path->slots[0]++;
+ continue;
+ }
+
+ if (key.objectid >= block_group->key.objectid +
+ block_group->key.offset)
+ break;
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
+ total_found += add_new_free_space(block_group,
+ fs_info, last,
+ key.objectid);
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ last = key.objectid +
+ fs_info->tree_root->nodesize;
+ else
+ last = key.objectid + key.offset;
+
+ if (total_found > (1024 * 1024 * 2)) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+
+ total_found += add_new_free_space(block_group, fs_info, last,
+ block_group->key.objectid +
+ block_group->key.offset);
+ caching_ctl->progress = (u64)-1;
+
+ spin_lock(&block_group->lock);
+ block_group->caching_ctl = NULL;
+ block_group->cached = BTRFS_CACHE_FINISHED;
+ spin_unlock(&block_group->lock);
+
+err:
+ btrfs_free_path(path);
+ up_read(&fs_info->commit_root_sem);
+
+ free_excluded_extents(extent_root, block_group);
+
+ mutex_unlock(&caching_ctl->mutex);
+out:
+ if (ret) {
+ spin_lock(&block_group->lock);
+ block_group->caching_ctl = NULL;
+ block_group->cached = BTRFS_CACHE_ERROR;
+ spin_unlock(&block_group->lock);
+ }
+ wake_up(&caching_ctl->wait);
+
+ put_caching_control(caching_ctl);
+ btrfs_put_block_group(block_group);
+}
+
+static int cache_block_group(struct btrfs_block_group_cache *cache,
+ int load_cache_only)
+{
+ DEFINE_WAIT(wait);
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ int ret = 0;
+
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+ if (!caching_ctl)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ atomic_set(&caching_ctl->count, 1);
+ btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+ caching_thread, NULL, NULL);
+
+ spin_lock(&cache->lock);
+ /*
+ * This should be a rare occasion, but this could happen I think in the
+ * case where one thread starts to load the space cache info, and then
+ * some other thread starts a transaction commit which tries to do an
+ * allocation while the other thread is still loading the space cache
+ * info. The previous loop should have kept us from choosing this block
+ * group, but if we've moved to the state where we will wait on caching
+ * block groups we need to first check if we're doing a fast load here,
+ * so we can wait for it to finish, otherwise we could end up allocating
+ * from a block group who's cache gets evicted for one reason or
+ * another.
+ */
+ while (cache->cached == BTRFS_CACHE_FAST) {
+ struct btrfs_caching_control *ctl;
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&cache->lock);
+
+ schedule();
+
+ finish_wait(&ctl->wait, &wait);
+ put_caching_control(ctl);
+ spin_lock(&cache->lock);
+ }
+
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ kfree(caching_ctl);
+ return 0;
+ }
+ WARN_ON(cache->caching_ctl);
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_FAST;
+ spin_unlock(&cache->lock);
+
+ if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+ mutex_lock(&caching_ctl->mutex);
+ ret = load_free_space_cache(fs_info, cache);
+
+ spin_lock(&cache->lock);
+ if (ret == 1) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ cache->last_byte_to_unpin = (u64)-1;
+ caching_ctl->progress = (u64)-1;
+ } else {
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
+ }
+ }
+ spin_unlock(&cache->lock);
+ mutex_unlock(&caching_ctl->mutex);
+
+ wake_up(&caching_ctl->wait);
+ if (ret == 1) {
+ put_caching_control(caching_ctl);
+ free_excluded_extents(fs_info->extent_root, cache);
+ return 0;
+ }
+ } else {
+ /*
+ * We are not going to do the fast caching, set cached to the
+ * appropriate value and wakeup any waiters.
+ */
+ spin_lock(&cache->lock);
+ if (load_cache_only) {
+ cache->caching_ctl = NULL;
+ cache->cached = BTRFS_CACHE_NO;
+ } else {
+ cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
+ }
+ spin_unlock(&cache->lock);
+ wake_up(&caching_ctl->wait);
+ }
+
+ if (load_cache_only) {
+ put_caching_control(caching_ctl);
+ return 0;
+ }
+
+ down_write(&fs_info->commit_root_sem);
+ atomic_inc(&caching_ctl->count);
+ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+ up_write(&fs_info->commit_root_sem);
+
+ btrfs_get_block_group(cache);
+
+ btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+
+ return ret;
+}
+
+/*
+ * return the block group that starts at or after bytenr
+ */
+static struct btrfs_block_group_cache *
+btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = block_group_cache_tree_search(info, bytenr, 0);
+
+ return cache;
+}
+
+/*
+ * return the block group that contains the given bytenr
+ */
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info,
+ u64 bytenr)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = block_group_cache_tree_search(info, bytenr, 1);
+
+ return cache;
+}
+
+static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+ u64 flags)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & flags) {
+ rcu_read_unlock();
+ return found;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list)
+ found->full = 0;
+ rcu_read_unlock();
+}
+
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = start;
+ key.offset = len;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
+ 0, 0);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function to lookup reference count and flags of a tree block.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 offset, int metadata, u64 *refs, u64 *flags)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u32 item_size;
+ u64 num_refs;
+ u64 extent_flags;
+ int ret;
+
+ /*
+ * If we don't have skinny metadata, don't bother doing anything
+ * different
+ */
+ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
+ offset = root->nodesize;
+ metadata = 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (!trans) {
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ }
+
+search_again:
+ key.objectid = bytenr;
+ key.offset = offset;
+ if (metadata)
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out_free;
+
+ if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == root->nodesize)
+ ret = 0;
+ }
+ }
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (item_size >= sizeof(*ei)) {
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ struct btrfs_extent_item_v0 *ei0;
+ BUG_ON(item_size != sizeof(*ei0));
+ ei0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item_v0);
+ num_refs = btrfs_extent_refs_v0(leaf, ei0);
+ /* FIXME: this isn't correct for data */
+ extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+ BUG();
+#endif
+ }
+ BUG_ON(num_refs == 0);
+ } else {
+ num_refs = 0;
+ extent_flags = 0;
+ ret = 0;
+ }
+
+ if (!trans)
+ goto out;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's released and try
+ * again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ goto search_again;
+ }
+ spin_lock(&head->lock);
+ if (head->extent_op && head->extent_op->update_flags)
+ extent_flags |= head->extent_op->flags_to_set;
+ else
+ BUG_ON(num_refs == 0);
+
+ num_refs += head->node.ref_mod;
+ spin_unlock(&head->lock);
+ mutex_unlock(&head->mutex);
+ }
+ spin_unlock(&delayed_refs->lock);
+out:
+ WARN_ON(num_refs == 0);
+ if (refs)
+ *refs = num_refs;
+ if (flags)
+ *flags = extent_flags;
+out_free:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Back reference rules. Back refs have three main goals:
+ *
+ * 1) differentiate between all holders of references to an extent so that
+ * when a reference is dropped we can make sure it was a valid reference
+ * before freeing the extent.
+ *
+ * 2) Provide enough information to quickly find the holders of an extent
+ * if we notice a given block is corrupted or bad.
+ *
+ * 3) Make it easy to migrate blocks for FS shrinking or storage pool
+ * maintenance. This is actually the same as #2, but with a slightly
+ * different use case.
+ *
+ * There are two kinds of back refs. The implicit back refs is optimized
+ * for pointers in non-shared tree blocks. For a given pointer in a block,
+ * back refs of this kind provide information about the block's owner tree
+ * and the pointer's key. These information allow us to find the block by
+ * b-tree searching. The full back refs is for pointers in tree blocks not
+ * referenced by their owner trees. The location of tree block is recorded
+ * in the back refs. Actually the full back refs is generic, and can be
+ * used in all cases the implicit back refs is used. The major shortcoming
+ * of the full back refs is its overhead. Every time a tree block gets
+ * COWed, we have to update back refs entry for all pointers in it.
+ *
+ * For a newly allocated tree block, we use implicit back refs for
+ * pointers in it. This means most tree related operations only involve
+ * implicit back refs. For a tree block created in old transaction, the
+ * only way to drop a reference to it is COW it. So we can detect the
+ * event that tree block loses its owner tree's reference and do the
+ * back refs conversion.
+ *
+ * When a tree block is COW'd through a tree, there are four cases:
+ *
+ * The reference count of the block is one and the tree is the block's
+ * owner tree. Nothing to do in this case.
+ *
+ * The reference count of the block is one and the tree is not the
+ * block's owner tree. In this case, full back refs is used for pointers
+ * in the block. Remove these full back refs, add implicit back refs for
+ * every pointers in the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * the block's owner tree. In this case, implicit back refs is used for
+ * pointers in the block. Add full back refs for every pointers in the
+ * block, increase lower level extents' reference counts. The original
+ * implicit back refs are entailed to the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * not the block's owner tree. Add implicit back refs for every pointer in
+ * the new block, increase lower level extents' reference count.
+ *
+ * Back Reference Key composing:
+ *
+ * The key objectid corresponds to the first byte in the extent,
+ * The key type is used to differentiate between types of back refs.
+ * There are different meanings of the key offset for different types
+ * of back refs.
+ *
+ * File extents can be referenced by:
+ *
+ * - multiple snapshots, subvolumes, or different generations in one subvol
+ * - different files inside a single subvolume
+ * - different offsets inside a file (bookend extents in file.c)
+ *
+ * The extent ref structure for the implicit back refs has fields for:
+ *
+ * - Objectid of the subvolume root
+ * - objectid of the file holding the reference
+ * - original offset in the file
+ * - how many bookend extents
+ *
+ * The key offset for the implicit back refs is hash of the first
+ * three fields.
+ *
+ * The extent ref structure for the full back refs has field for:
+ *
+ * - number of pointers in the tree leaf
+ *
+ * The key offset for the implicit back refs is the first byte of
+ * the tree leaf
+ *
+ * When a file extent is allocated, The implicit back refs is used.
+ * the fields are filled in:
+ *
+ * (root_key.objectid, inode objectid, offset in file, 1)
+ *
+ * When a file extent is removed file truncation, we find the
+ * corresponding implicit back refs and check the following fields:
+ *
+ * (btrfs_header_owner(leaf), inode objectid, offset in file)
+ *
+ * Btree extents can be referenced by:
+ *
+ * - Different subvolumes
+ *
+ * Both the implicit back refs and the full back refs for tree blocks
+ * only consist of key. The key offset for the implicit back refs is
+ * objectid of block's owner tree. The key offset for the full back refs
+ * is the first byte of parent block.
+ *
+ * When implicit back refs is used, information about the lowest key and
+ * level of the tree block are required. These information are stored in
+ * tree block info structure.
+ */
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 owner, u32 extra_size)
+{
+ struct btrfs_extent_item *item;
+ struct btrfs_extent_item_v0 *ei0;
+ struct btrfs_extent_ref_v0 *ref0;
+ struct btrfs_tree_block_info *bi;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u32 new_size = sizeof(*item);
+ u64 refs;
+ int ret;
+
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item_v0);
+ refs = btrfs_extent_refs_v0(leaf, ei0);
+
+ if (owner == (u64)-1) {
+ while (1) {
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ BUG_ON(ret > 0); /* Corruption */
+ leaf = path->nodes[0];
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key,
+ path->slots[0]);
+ BUG_ON(key.objectid != found_key.objectid);
+ if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
+ path->slots[0]++;
+ continue;
+ }
+ ref0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref_v0);
+ owner = btrfs_ref_objectid_v0(leaf, ref0);
+ break;
+ }
+ }
+ btrfs_release_path(path);
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID)
+ new_size += sizeof(*bi);
+
+ new_size -= sizeof(*ei0);
+ ret = btrfs_search_slot(trans, root, &key, path,
+ new_size + extra_size, 1);
+ if (ret < 0)
+ return ret;
+ BUG_ON(ret); /* Corruption */
+
+ btrfs_extend_item(root, path, new_size);
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, item, refs);
+ /* FIXME: get real generation */
+ btrfs_set_extent_generation(leaf, item, 0);
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_set_extent_flags(leaf, item,
+ BTRFS_EXTENT_FLAG_TREE_BLOCK |
+ BTRFS_BLOCK_FLAG_FULL_BACKREF);
+ bi = (struct btrfs_tree_block_info *)(item + 1);
+ /* FIXME: get first key of the block */
+ memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
+ btrfs_set_tree_block_level(leaf, bi, (int)owner);
+ } else {
+ btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ return 0;
+}
+#endif
+
+static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
+{
+ u32 high_crc = ~(u32)0;
+ u32 low_crc = ~(u32)0;
+ __le64 lenum;
+
+ lenum = cpu_to_le64(root_objectid);
+ high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ lenum = cpu_to_le64(owner);
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ lenum = cpu_to_le64(offset);
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+
+ return ((u64)high_crc << 31) ^ (u64)low_crc;
+}
+
+static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *ref)
+{
+ return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
+ btrfs_extent_data_ref_objectid(leaf, ref),
+ btrfs_extent_data_ref_offset(leaf, ref));
+}
+
+static int match_extent_data_ref(struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *ref,
+ u64 root_objectid, u64 owner, u64 offset)
+{
+ if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
+ btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
+ btrfs_extent_data_ref_offset(leaf, ref) != offset)
+ return 0;
+ return 1;
+}
+
+static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid,
+ u64 owner, u64 offset)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_data_ref *ref;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+ int recow;
+ int err = -ENOENT;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_DATA_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_EXTENT_DATA_REF_KEY;
+ key.offset = hash_extent_data_ref(root_objectid,
+ owner, offset);
+ }
+again:
+ recow = 0;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
+
+ if (parent) {
+ if (!ret)
+ return 0;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ key.type = BTRFS_EXTENT_REF_V0_KEY;
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
+ if (!ret)
+ return 0;
+#endif
+ goto fail;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ while (1) {
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ err = ret;
+ if (ret)
+ goto fail;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ recow = 1;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != bytenr ||
+ key.type != BTRFS_EXTENT_DATA_REF_KEY)
+ goto fail;
+
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+
+ if (match_extent_data_ref(leaf, ref, root_objectid,
+ owner, offset)) {
+ if (recow) {
+ btrfs_release_path(path);
+ goto again;
+ }
+ err = 0;
+ break;
+ }
+ path->slots[0]++;
+ }
+fail:
+ return err;
+}
+
+static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid, u64 owner,
+ u64 offset, int refs_to_add)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u32 size;
+ u32 num_refs;
+ int ret;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_DATA_REF_KEY;
+ key.offset = parent;
+ size = sizeof(struct btrfs_shared_data_ref);
+ } else {
+ key.type = BTRFS_EXTENT_DATA_REF_KEY;
+ key.offset = hash_extent_data_ref(root_objectid,
+ owner, offset);
+ size = sizeof(struct btrfs_extent_data_ref);
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, size);
+ if (ret && ret != -EEXIST)
+ goto fail;
+
+ leaf = path->nodes[0];
+ if (parent) {
+ struct btrfs_shared_data_ref *ref;
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_shared_data_ref);
+ if (ret == 0) {
+ btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
+ } else {
+ num_refs = btrfs_shared_data_ref_count(leaf, ref);
+ num_refs += refs_to_add;
+ btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
+ }
+ } else {
+ struct btrfs_extent_data_ref *ref;
+ while (ret == -EEXIST) {
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ if (match_extent_data_ref(leaf, ref, root_objectid,
+ owner, offset))
+ break;
+ btrfs_release_path(path);
+ key.offset++;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ size);
+ if (ret && ret != -EEXIST)
+ goto fail;
+
+ leaf = path->nodes[0];
+ }
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ if (ret == 0) {
+ btrfs_set_extent_data_ref_root(leaf, ref,
+ root_objectid);
+ btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+ btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+ btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
+ } else {
+ num_refs = btrfs_extent_data_ref_count(leaf, ref);
+ num_refs += refs_to_add;
+ btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
+ }
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ ret = 0;
+fail:
+ btrfs_release_path(path);
+ return ret;
+}
+
+static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int refs_to_drop, int *last_ref)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_data_ref *ref1 = NULL;
+ struct btrfs_shared_data_ref *ref2 = NULL;
+ struct extent_buffer *leaf;
+ u32 num_refs = 0;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ ref1 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+ } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+ ref2 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_shared_data_ref);
+ num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ struct btrfs_extent_ref_v0 *ref0;
+ ref0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref_v0);
+ num_refs = btrfs_ref_count_v0(leaf, ref0);
+#endif
+ } else {
+ BUG();
+ }
+
+ BUG_ON(num_refs < refs_to_drop);
+ num_refs -= refs_to_drop;
+
+ if (num_refs == 0) {
+ ret = btrfs_del_item(trans, root, path);
+ *last_ref = 1;
+ } else {
+ if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
+ btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
+ else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
+ btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ else {
+ struct btrfs_extent_ref_v0 *ref0;
+ ref0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref_v0);
+ btrfs_set_ref_count_v0(leaf, ref0, num_refs);
+ }
+#endif
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ return ret;
+}
+
+static noinline u32 extent_data_ref_count(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_data_ref *ref1;
+ struct btrfs_shared_data_ref *ref2;
+ u32 num_refs = 0;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (iref) {
+ if (btrfs_extent_inline_ref_type(leaf, iref) ==
+ BTRFS_EXTENT_DATA_REF_KEY) {
+ ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+ num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+ } else {
+ ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+ num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+ }
+ } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ ref1 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_data_ref);
+ num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+ } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+ ref2 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_shared_data_ref);
+ num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ struct btrfs_extent_ref_v0 *ref0;
+ ref0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref_v0);
+ num_refs = btrfs_ref_count_v0(leaf, ref0);
+#endif
+ } else {
+ WARN_ON(1);
+ }
+ return num_refs;
+}
+
+static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (ret == -ENOENT && parent) {
+ btrfs_release_path(path);
+ key.type = BTRFS_EXTENT_REF_V0_KEY;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ }
+#endif
+ return ret;
+}
+
+static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 root_objectid)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ btrfs_release_path(path);
+ return ret;
+}
+
+static inline int extent_ref_type(u64 parent, u64 owner)
+{
+ int type;
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (parent > 0)
+ type = BTRFS_SHARED_BLOCK_REF_KEY;
+ else
+ type = BTRFS_TREE_BLOCK_REF_KEY;
+ } else {
+ if (parent > 0)
+ type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ type = BTRFS_EXTENT_DATA_REF_KEY;
+ }
+ return type;
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key)
+
+{
+ for (; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path->nodes[level])
+ break;
+ if (path->slots[level] + 1 >=
+ btrfs_header_nritems(path->nodes[level]))
+ continue;
+ if (level == 0)
+ btrfs_item_key_to_cpu(path->nodes[level], key,
+ path->slots[level] + 1);
+ else
+ btrfs_node_key_to_cpu(path->nodes[level], key,
+ path->slots[level] + 1);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * look for inline back ref. if back ref is found, *ref_ret is set
+ * to the address of inline back ref, and 0 is returned.
+ *
+ * if back ref isn't found, *ref_ret is set to the address where it
+ * should be inserted, and -ENOENT is returned.
+ *
+ * if insert is true and there are too many inline back refs, the path
+ * points to the extent item, and -EAGAIN is returned.
+ *
+ * NOTE: inline back refs are ordered in the same way that back ref
+ * items in the tree are ordered.
+ */
+static noinline_for_stack
+int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref **ref_ret,
+ u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int insert)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ u64 flags;
+ u64 item_size;
+ unsigned long ptr;
+ unsigned long end;
+ int extra_size;
+ int type;
+ int want;
+ int ret;
+ int err = 0;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ want = extent_ref_type(parent, owner);
+ if (insert) {
+ extra_size = btrfs_extent_inline_ref_size(want);
+ path->keep_locks = 1;
+ } else
+ extra_size = -1;
+
+ /*
+ * Owner is our parent level, so we can just add one to get the level
+ * for the block we are interested in.
+ */
+ if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = owner;
+ }
+
+again:
+ ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ /*
+ * We may be a newly converted file system which still has the old fat
+ * extent entries for metadata, so try and see if we have one of those.
+ */
+ if (ret > 0 && skinny_metadata) {
+ skinny_metadata = false;
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes)
+ ret = 0;
+ }
+ if (ret) {
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+
+ if (ret && !insert) {
+ err = -ENOENT;
+ goto out;
+ } else if (WARN_ON(ret)) {
+ err = -EIO;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ if (!insert) {
+ err = -ENOENT;
+ goto out;
+ }
+ ret = convert_extent_item_v0(trans, root, path, owner,
+ extra_size);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ }
+#endif
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + item_size;
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
+ ptr += sizeof(struct btrfs_tree_block_info);
+ BUG_ON(ptr > end);
+ }
+
+ err = -ENOENT;
+ while (1) {
+ if (ptr >= end) {
+ WARN_ON(ptr > end);
+ break;
+ }
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+ if (want < type)
+ break;
+ if (want > type) {
+ ptr += btrfs_extent_inline_ref_size(type);
+ continue;
+ }
+
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ struct btrfs_extent_data_ref *dref;
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ if (match_extent_data_ref(leaf, dref, root_objectid,
+ owner, offset)) {
+ err = 0;
+ break;
+ }
+ if (hash_extent_data_ref_item(leaf, dref) <
+ hash_extent_data_ref(root_objectid, owner, offset))
+ break;
+ } else {
+ u64 ref_offset;
+ ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ if (parent > 0) {
+ if (parent == ref_offset) {
+ err = 0;
+ break;
+ }
+ if (ref_offset < parent)
+ break;
+ } else {
+ if (root_objectid == ref_offset) {
+ err = 0;
+ break;
+ }
+ if (ref_offset < root_objectid)
+ break;
+ }
+ }
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+ if (err == -ENOENT && insert) {
+ if (item_size + extra_size >=
+ BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * To add new inline back ref, we have to make sure
+ * there is no corresponding back ref item.
+ * For simplicity, we just do not add new inline back
+ * ref if there is any kind of item for this block
+ */
+ if (find_next_key(path, 0, &key) == 0 &&
+ key.objectid == bytenr &&
+ key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ err = -EAGAIN;
+ goto out;
+ }
+ }
+ *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
+out:
+ if (insert) {
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+ }
+ return err;
+}
+
+/*
+ * helper to add new inline back ref
+ */
+static noinline_for_stack
+void setup_inline_extent_backref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ unsigned long ptr;
+ unsigned long end;
+ unsigned long item_offset;
+ u64 refs;
+ int size;
+ int type;
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ item_offset = (unsigned long)iref - (unsigned long)ei;
+
+ type = extent_ref_type(parent, owner);
+ size = btrfs_extent_inline_ref_size(type);
+
+ btrfs_extend_item(root, path, size);
+
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, ei);
+ refs += refs_to_add;
+ btrfs_set_extent_refs(leaf, ei, refs);
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ ptr = (unsigned long)ei + item_offset;
+ end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+ if (ptr < end - size)
+ memmove_extent_buffer(leaf, ptr + size, ptr,
+ end - size - ptr);
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ btrfs_set_extent_inline_ref_type(leaf, iref, type);
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ struct btrfs_extent_data_ref *dref;
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
+ btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
+ btrfs_set_extent_data_ref_offset(leaf, dref, offset);
+ btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
+ } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+ struct btrfs_shared_data_ref *sref;
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+}
+
+static int lookup_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref **ref_ret,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset)
+{
+ int ret;
+
+ ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
+ bytenr, num_bytes, parent,
+ root_objectid, owner, offset, 0);
+ if (ret != -ENOENT)
+ return ret;
+
+ btrfs_release_path(path);
+ *ref_ret = NULL;
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
+ root_objectid);
+ } else {
+ ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
+ root_objectid, owner, offset);
+ }
+ return ret;
+}
+
+/*
+ * helper to update/remove inline back ref
+ */
+static noinline_for_stack
+void update_inline_extent_backref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_mod,
+ struct btrfs_delayed_extent_op *extent_op,
+ int *last_ref)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_data_ref *dref = NULL;
+ struct btrfs_shared_data_ref *sref = NULL;
+ unsigned long ptr;
+ unsigned long end;
+ u32 item_size;
+ int size;
+ int type;
+ u64 refs;
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, ei);
+ WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
+ refs += refs_to_mod;
+ btrfs_set_extent_refs(leaf, ei, refs);
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+
+ if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ refs = btrfs_extent_data_ref_count(leaf, dref);
+ } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ refs = btrfs_shared_data_ref_count(leaf, sref);
+ } else {
+ refs = 1;
+ BUG_ON(refs_to_mod != -1);
+ }
+
+ BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
+ refs += refs_to_mod;
+
+ if (refs > 0) {
+ if (type == BTRFS_EXTENT_DATA_REF_KEY)
+ btrfs_set_extent_data_ref_count(leaf, dref, refs);
+ else
+ btrfs_set_shared_data_ref_count(leaf, sref, refs);
+ } else {
+ *last_ref = 1;
+ size = btrfs_extent_inline_ref_size(type);
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = (unsigned long)iref;
+ end = (unsigned long)ei + item_size;
+ if (ptr + size < end)
+ memmove_extent_buffer(leaf, ptr, ptr + size,
+ end - ptr - size);
+ item_size -= size;
+ btrfs_truncate_item(root, path, item_size, 1);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+}
+
+static noinline_for_stack
+int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner,
+ u64 offset, int refs_to_add,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+
+ ret = lookup_inline_extent_backref(trans, root, path, &iref,
+ bytenr, num_bytes, parent,
+ root_objectid, owner, offset, 1);
+ if (ret == 0) {
+ BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+ update_inline_extent_backref(root, path, iref,
+ refs_to_add, extent_op, NULL);
+ } else if (ret == -ENOENT) {
+ setup_inline_extent_backref(root, path, iref, parent,
+ root_objectid, owner, offset,
+ refs_to_add, extent_op);
+ ret = 0;
+ }
+ return ret;
+}
+
+static int insert_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add)
+{
+ int ret;
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ BUG_ON(refs_to_add != 1);
+ ret = insert_tree_block_ref(trans, root, path, bytenr,
+ parent, root_objectid);
+ } else {
+ ret = insert_extent_data_ref(trans, root, path, bytenr,
+ parent, root_objectid,
+ owner, offset, refs_to_add);
+ }
+ return ret;
+}
+
+static int remove_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref,
+ int refs_to_drop, int is_data, int *last_ref)
+{
+ int ret = 0;
+
+ BUG_ON(!is_data && refs_to_drop != 1);
+ if (iref) {
+ update_inline_extent_backref(root, path, iref,
+ -refs_to_drop, NULL, last_ref);
+ } else if (is_data) {
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
+ last_ref);
+ } else {
+ *last_ref = 1;
+ ret = btrfs_del_item(trans, root, path);
+ }
+ return ret;
+}
+
+static int btrfs_issue_discard(struct block_device *bdev,
+ u64 start, u64 len)
+{
+ return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+}
+
+int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes)
+{
+ int ret;
+ u64 discarded_bytes = 0;
+ struct btrfs_bio *bbio = NULL;
+
+
+ /* Tell the block device(s) that the sectors can be discarded */
+ ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
+ bytenr, &num_bytes, &bbio, 0);
+ /* Error condition is -ENOMEM */
+ if (!ret) {
+ struct btrfs_bio_stripe *stripe = bbio->stripes;
+ int i;
+
+
+ for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ if (!stripe->dev->can_discard)
+ continue;
+
+ ret = btrfs_issue_discard(stripe->dev->bdev,
+ stripe->physical,
+ stripe->length);
+ if (!ret)
+ discarded_bytes += stripe->length;
+ else if (ret != -EOPNOTSUPP)
+ break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
+
+ /*
+ * Just in case we get back EOPNOTSUPP for some reason,
+ * just ignore the return value so we don't screw up
+ * people calling discard_extent.
+ */
+ ret = 0;
+ }
+ btrfs_put_bbio(bbio);
+ }
+
+ if (actual_bytes)
+ *actual_bytes = discarded_bytes;
+
+
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+ return ret;
+}
+
+/* Can return -ENOMEM */
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset,
+ int no_quota)
+{
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
+ root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
+ parent, root_objectid, (int)owner,
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ } else {
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
+ parent, root_objectid, owner, offset,
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ }
+ return ret;
+}
+
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int refs_to_add,
+ int no_quota,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *item;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
+ no_quota = 1;
+
+ path->reada = 1;
+ path->leave_spinning = 1;
+ /* this will setup the path even if it fails to insert the back ref */
+ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
+ bytenr, num_bytes, parent,
+ root_objectid, owner, offset,
+ refs_to_add, extent_op);
+ if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+ goto out;
+ /*
+ * Ok we were able to insert an inline extent and it appears to be a new
+ * reference, deal with the qgroup accounting.
+ */
+ if (!ret && !no_quota) {
+ ASSERT(root->fs_info->quota_enabled);
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
+ btrfs_release_path(path);
+
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
+ goto out;
+ }
+
+ /*
+ * Ok we had -EAGAIN which means we didn't have space to insert and
+ * inline extent ref, so just update the reference count and add a
+ * normal backref.
+ */
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, item);
+ if (refs)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
+ btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, item);
+
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
+ if (ret)
+ goto out;
+ }
+
+ path->reada = 1;
+ path->leave_spinning = 1;
+ /* now insert the actual backref */
+ ret = insert_extent_backref(trans, root->fs_info->extent_root,
+ path, bytenr, parent, root_objectid,
+ owner, offset, refs_to_add);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op,
+ int insert_reserved)
+{
+ int ret = 0;
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_key ins;
+ u64 parent = 0;
+ u64 ref_root = 0;
+ u64 flags = 0;
+
+ ins.objectid = node->bytenr;
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+ trace_run_delayed_data_ref(node, ref, node->action);
+
+ if (node->type == BTRFS_SHARED_DATA_REF_KEY)
+ parent = ref->parent;
+ ref_root = ref->root;
+
+ if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ if (extent_op)
+ flags |= extent_op->flags_to_set;
+ ret = alloc_reserved_file_extent(trans, root,
+ parent, ref_root, flags,
+ ref->objectid, ref->offset,
+ &ins, node->ref_mod);
+ } else if (node->action == BTRFS_ADD_DELAYED_REF) {
+ ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+ node->num_bytes, parent,
+ ref_root, ref->objectid,
+ ref->offset, node->ref_mod,
+ node->no_quota, extent_op);
+ } else if (node->action == BTRFS_DROP_DELAYED_REF) {
+ ret = __btrfs_free_extent(trans, root, node->bytenr,
+ node->num_bytes, parent,
+ ref_root, ref->objectid,
+ ref->offset, node->ref_mod,
+ extent_op, node->no_quota);
+ } else {
+ BUG();
+ }
+ return ret;
+}
+
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_item *ei)
+{
+ u64 flags = btrfs_extent_flags(leaf, ei);
+ if (extent_op->update_flags) {
+ flags |= extent_op->flags_to_set;
+ btrfs_set_extent_flags(leaf, ei, flags);
+ }
+
+ if (extent_op->update_key) {
+ struct btrfs_tree_block_info *bi;
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
+ }
+}
+
+static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct extent_buffer *leaf;
+ u32 item_size;
+ int ret;
+ int err = 0;
+ int metadata = !extent_op->is_data;
+
+ if (trans->aborted)
+ return 0;
+
+ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ metadata = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = node->bytenr;
+
+ if (metadata) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = extent_op->level;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = node->num_bytes;
+ }
+
+again:
+ path->reada = 1;
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+ path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0) {
+ if (metadata) {
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == node->bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == node->num_bytes)
+ ret = 0;
+ }
+ if (ret > 0) {
+ btrfs_release_path(path);
+ metadata = 0;
+
+ key.objectid = node->bytenr;
+ key.offset = node->num_bytes;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ goto again;
+ }
+ } else {
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
+ path, (u64)-1, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ }
+#endif
+ BUG_ON(item_size < sizeof(*ei));
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ __run_delayed_extent_op(extent_op, leaf, ei);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return err;
+}
+
+static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op,
+ int insert_reserved)
+{
+ int ret = 0;
+ struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key ins;
+ u64 parent = 0;
+ u64 ref_root = 0;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ trace_run_delayed_tree_ref(node, ref, node->action);
+
+ if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+ parent = ref->parent;
+ ref_root = ref->root;
+
+ ins.objectid = node->bytenr;
+ if (skinny_metadata) {
+ ins.offset = ref->level;
+ ins.type = BTRFS_METADATA_ITEM_KEY;
+ } else {
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ }
+
+ BUG_ON(node->ref_mod != 1);
+ if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ BUG_ON(!extent_op || !extent_op->update_flags);
+ ret = alloc_reserved_tree_block(trans, root,
+ parent, ref_root,
+ extent_op->flags_to_set,
+ &extent_op->key,
+ ref->level, &ins,
+ node->no_quota);
+ } else if (node->action == BTRFS_ADD_DELAYED_REF) {
+ ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+ node->num_bytes, parent, ref_root,
+ ref->level, 0, 1, node->no_quota,
+ extent_op);
+ } else if (node->action == BTRFS_DROP_DELAYED_REF) {
+ ret = __btrfs_free_extent(trans, root, node->bytenr,
+ node->num_bytes, parent, ref_root,
+ ref->level, 0, 1, extent_op,
+ node->no_quota);
+ } else {
+ BUG();
+ }
+ return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op,
+ int insert_reserved)
+{
+ int ret = 0;
+
+ if (trans->aborted) {
+ if (insert_reserved)
+ btrfs_pin_extent(root, node->bytenr,
+ node->num_bytes, 1);
+ return 0;
+ }
+
+ if (btrfs_delayed_ref_is_head(node)) {
+ struct btrfs_delayed_ref_head *head;
+ /*
+ * we've hit the end of the chain and we were supposed
+ * to insert this extent into the tree. But, it got
+ * deleted before we ever needed to insert it, so all
+ * we have to do is clean up the accounting
+ */
+ BUG_ON(extent_op);
+ head = btrfs_delayed_node_to_head(node);
+ trace_run_delayed_ref_head(node, head, node->action);
+
+ if (insert_reserved) {
+ btrfs_pin_extent(root, node->bytenr,
+ node->num_bytes, 1);
+ if (head->is_data) {
+ ret = btrfs_del_csums(trans, root,
+ node->bytenr,
+ node->num_bytes);
+ }
+ }
+ return ret;
+ }
+
+ if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+ ret = run_delayed_tree_ref(trans, root, node, extent_op,
+ insert_reserved);
+ else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ node->type == BTRFS_SHARED_DATA_REF_KEY)
+ ret = run_delayed_data_ref(trans, root, node, extent_op,
+ insert_reserved);
+ else
+ BUG();
+ return ret;
+}
+
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref, *last = NULL;;
+
+ /*
+ * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * this prevents ref count from going down to zero when
+ * there still are pending delayed ref.
+ */
+ node = rb_first(&head->ref_root);
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->action == BTRFS_ADD_DELAYED_REF)
+ return ref;
+ else if (last == NULL)
+ last = ref;
+ node = rb_next(node);
+ }
+ return last;
+}
+
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ unsigned long nr)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
+ struct btrfs_delayed_extent_op *extent_op;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ ktime_t start = ktime_get();
+ int ret;
+ unsigned long count = 0;
+ unsigned long actual_count = 0;
+ int must_insert_reserved = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ while (1) {
+ if (!locked_ref) {
+ if (count >= nr)
+ break;
+
+ spin_lock(&delayed_refs->lock);
+ locked_ref = btrfs_select_ref_head(trans);
+ if (!locked_ref) {
+ spin_unlock(&delayed_refs->lock);
+ break;
+ }
+
+ /* grab the lock that says we are going to process
+ * all the refs for this head */
+ ret = btrfs_delayed_ref_lock(trans, locked_ref);
+ spin_unlock(&delayed_refs->lock);
+ /*
+ * we may have dropped the spin lock to get the head
+ * mutex lock, and that might have given someone else
+ * time to free the head. If that's true, it has been
+ * removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN) {
+ locked_ref = NULL;
+ count++;
+ continue;
+ }
+ }
+
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ */
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+ locked_ref);
+
+ /*
+ * locked_ref is the head node, so we have to go one
+ * node back for any delayed ref updates
+ */
+ ref = select_delayed_ref(locked_ref);
+
+ if (ref && ref->seq &&
+ btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
+ spin_unlock(&locked_ref->lock);
+ btrfs_delayed_ref_unlock(locked_ref);
+ spin_lock(&delayed_refs->lock);
+ locked_ref->processing = 0;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ locked_ref = NULL;
+ cond_resched();
+ count++;
+ continue;
+ }
+
+ /*
+ * record the must insert reserved flag before we
+ * drop the spin lock.
+ */
+ must_insert_reserved = locked_ref->must_insert_reserved;
+ locked_ref->must_insert_reserved = 0;
+
+ extent_op = locked_ref->extent_op;
+ locked_ref->extent_op = NULL;
+
+ if (!ref) {
+
+
+ /* All delayed refs have been processed, Go ahead
+ * and send the head node to run_one_delayed_ref,
+ * so that any accounting fixes can happen
+ */
+ ref = &locked_ref->node;
+
+ if (extent_op && must_insert_reserved) {
+ btrfs_free_delayed_extent_op(extent_op);
+ extent_op = NULL;
+ }
+
+ if (extent_op) {
+ spin_unlock(&locked_ref->lock);
+ ret = run_delayed_extent_op(trans, root,
+ ref, extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
+
+ if (ret) {
+ /*
+ * Need to reset must_insert_reserved if
+ * there was an error so the abort stuff
+ * can cleanup the reserved space
+ * properly.
+ */
+ if (must_insert_reserved)
+ locked_ref->must_insert_reserved = 1;
+ locked_ref->processing = 0;
+ btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
+ btrfs_delayed_ref_unlock(locked_ref);
+ return ret;
+ }
+ continue;
+ }
+
+ /*
+ * Need to drop our head ref lock and re-aqcuire the
+ * delayed ref lock and then re-check to make sure
+ * nobody got added.
+ */
+ spin_unlock(&locked_ref->lock);
+ spin_lock(&delayed_refs->lock);
+ spin_lock(&locked_ref->lock);
+ if (rb_first(&locked_ref->ref_root) ||
+ locked_ref->extent_op) {
+ spin_unlock(&locked_ref->lock);
+ spin_unlock(&delayed_refs->lock);
+ continue;
+ }
+ ref->in_tree = 0;
+ delayed_refs->num_heads--;
+ rb_erase(&locked_ref->href_node,
+ &delayed_refs->href_root);
+ spin_unlock(&delayed_refs->lock);
+ } else {
+ actual_count++;
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &locked_ref->ref_root);
+ }
+ atomic_dec(&delayed_refs->num_entries);
+
+ if (!btrfs_delayed_ref_is_head(ref)) {
+ /*
+ * when we play the delayed ref, also correct the
+ * ref_mod on head
+ */
+ switch (ref->action) {
+ case BTRFS_ADD_DELAYED_REF:
+ case BTRFS_ADD_DELAYED_EXTENT:
+ locked_ref->node.ref_mod -= ref->ref_mod;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ locked_ref->node.ref_mod += ref->ref_mod;
+ break;
+ default:
+ WARN_ON(1);
+ }
+ }
+ spin_unlock(&locked_ref->lock);
+
+ ret = run_one_delayed_ref(trans, root, ref, extent_op,
+ must_insert_reserved);
+
+ btrfs_free_delayed_extent_op(extent_op);
+ if (ret) {
+ locked_ref->processing = 0;
+ btrfs_delayed_ref_unlock(locked_ref);
+ btrfs_put_delayed_ref(ref);
+ btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
+ return ret;
+ }
+
+ /*
+ * If this node is a head, that means all the refs in this head
+ * have been dealt with, and we will pick the next head to deal
+ * with, so we must unlock the head and drop it from the cluster
+ * list before we release it.
+ */
+ if (btrfs_delayed_ref_is_head(ref)) {
+ if (locked_ref->is_data &&
+ locked_ref->total_ref_mod < 0) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= ref->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ }
+ btrfs_delayed_ref_unlock(locked_ref);
+ locked_ref = NULL;
+ }
+ btrfs_put_delayed_ref(ref);
+ count++;
+ cond_resched();
+ }
+
+ /*
+ * We don't want to include ref heads since we can have empty ref heads
+ * and those will drastically skew our runtime down since we just do
+ * accounting, no actual extent tree updates.
+ */
+ if (actual_count > 0) {
+ u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+ u64 avg;
+
+ /*
+ * We weigh the current average higher than our current runtime
+ * to avoid large swings in the average.
+ */
+ spin_lock(&delayed_refs->lock);
+ avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+ fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
+ spin_unlock(&delayed_refs->lock);
+ }
+ return 0;
+}
+
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_delayed_ref_node *entry;
+ int alt = 1;
+ u64 middle;
+ u64 first = 0, last = 0;
+
+ n = rb_first(root);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ first = entry->bytenr;
+ }
+ n = rb_last(root);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ last = entry->bytenr;
+ }
+ n = root->rb_node;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ WARN_ON(!entry->in_tree);
+
+ middle = entry->bytenr;
+
+ if (alt)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+
+ alt = 1 - alt;
+ }
+ return middle;
+}
+#endif
+
+static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+{
+ u64 num_bytes;
+
+ num_bytes = heads * (sizeof(struct btrfs_extent_item) +
+ sizeof(struct btrfs_extent_inline_ref));
+ if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ num_bytes += heads * sizeof(struct btrfs_tree_block_info);
+
+ /*
+ * We don't ever fill up leaves all the way so multiply by 2 just to be
+ * closer to what we're really going to want to ouse.
+ */
+ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+/*
+ * Takes the number of bytes to be csumm'ed and figures out how many leaves it
+ * would require to store the csums for that many bytes.
+ */
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+{
+ u64 csum_size;
+ u64 num_csums_per_leaf;
+ u64 num_csums;
+
+ csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+ num_csums_per_leaf = div64_u64(csum_size,
+ (u64)btrfs_super_csum_size(root->fs_info->super_copy));
+ num_csums = div64_u64(csum_bytes, root->sectorsize);
+ num_csums += num_csums_per_leaf - 1;
+ num_csums = div64_u64(num_csums, num_csums_per_leaf);
+ return num_csums;
+}
+
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *global_rsv;
+ u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+ u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
+ u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+ u64 num_bytes, num_dirty_bgs_bytes;
+ int ret = 0;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ num_heads = heads_to_leaves(root, num_heads);
+ if (num_heads > 1)
+ num_bytes += (num_heads - 1) * root->nodesize;
+ num_bytes <<= 1;
+ num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
+ num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+ num_dirty_bgs);
+ global_rsv = &root->fs_info->global_block_rsv;
+
+ /*
+ * If we can't allocate any more chunks lets make sure we have _lots_ of
+ * wiggle room since running delayed refs can create more delayed refs.
+ */
+ if (global_rsv->space_info->full) {
+ num_dirty_bgs_bytes <<= 1;
+ num_bytes <<= 1;
+ }
+
+ spin_lock(&global_rsv->lock);
+ if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
+ ret = 1;
+ spin_unlock(&global_rsv->lock);
+ return ret;
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 num_entries =
+ atomic_read(&trans->transaction->delayed_refs.num_entries);
+ u64 avg_runtime;
+ u64 val;
+
+ smp_mb();
+ avg_runtime = fs_info->avg_delayed_ref_runtime;
+ val = num_entries * avg_runtime;
+ if (num_entries * avg_runtime >= NSEC_PER_SEC)
+ return 1;
+ if (val >= NSEC_PER_SEC / 2)
+ return 2;
+
+ return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
+struct async_delayed_refs {
+ struct btrfs_root *root;
+ int count;
+ int error;
+ int sync;
+ struct completion wait;
+ struct btrfs_work work;
+};
+
+static void delayed_ref_async_start(struct btrfs_work *work)
+{
+ struct async_delayed_refs *async;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ async = container_of(work, struct async_delayed_refs, work);
+
+ trans = btrfs_join_transaction(async->root);
+ if (IS_ERR(trans)) {
+ async->error = PTR_ERR(trans);
+ goto done;
+ }
+
+ /*
+ * trans->sync means that when we call end_transaciton, we won't
+ * wait on delayed refs
+ */
+ trans->sync = true;
+ ret = btrfs_run_delayed_refs(trans, async->root, async->count);
+ if (ret)
+ async->error = ret;
+
+ ret = btrfs_end_transaction(trans, async->root);
+ if (ret && !async->error)
+ async->error = ret;
+done:
+ if (async->sync)
+ complete(&async->wait);
+ else
+ kfree(async);
+}
+
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait)
+{
+ struct async_delayed_refs *async;
+ int ret;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return -ENOMEM;
+
+ async->root = root->fs_info->tree_root;
+ async->count = count;
+ async->error = 0;
+ if (wait)
+ async->sync = 1;
+ else
+ async->sync = 0;
+ init_completion(&async->wait);
+
+ btrfs_init_work(&async->work, btrfs_extent_refs_helper,
+ delayed_ref_async_start, NULL, NULL);
+
+ btrfs_queue_work(root->fs_info->extent_workers, &async->work);
+
+ if (wait) {
+ wait_for_completion(&async->wait);
+ ret = async->error;
+ kfree(async);
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far. count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ *
+ * Returns 0 on success or if called with an aborted transaction
+ * Returns <0 on error and aborts the transaction
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *head;
+ int ret;
+ int run_all = count == (unsigned long)-1;
+
+ /* We'll clean this up in btrfs_cleanup_transaction */
+ if (trans->aborted)
+ return 0;
+
+ if (root == root->fs_info->extent_root)
+ root = root->fs_info->tree_root;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ if (count == 0)
+ count = atomic_read(&delayed_refs->num_entries) * 2;
+
+again:
+#ifdef SCRAMBLE_DELAYED_REFS
+ delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+ ret = __btrfs_run_delayed_refs(trans, root, count);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+
+ if (run_all) {
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
+ spin_lock(&delayed_refs->lock);
+ node = rb_first(&delayed_refs->href_root);
+ if (!node) {
+ spin_unlock(&delayed_refs->lock);
+ goto out;
+ }
+ count = (unsigned long)-1;
+
+ while (node) {
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ if (btrfs_delayed_ref_is_head(&head->node)) {
+ struct btrfs_delayed_ref_node *ref;
+
+ ref = &head->node;
+ atomic_inc(&ref->refs);
+
+ spin_unlock(&delayed_refs->lock);
+ /*
+ * Mutex was contended, block until it's
+ * released and try again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+
+ btrfs_put_delayed_ref(ref);
+ cond_resched();
+ goto again;
+ } else {
+ WARN_ON(1);
+ }
+ node = rb_next(node);
+ }
+ spin_unlock(&delayed_refs->lock);
+ cond_resched();
+ goto again;
+ }
+out:
+ ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
+ if (ret)
+ return ret;
+ assert_qgroups_uptodate(trans);
+ return 0;
+}
+
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 flags,
+ int level, int is_data)
+{
+ struct btrfs_delayed_extent_op *extent_op;
+ int ret;
+
+ extent_op = btrfs_alloc_delayed_extent_op();
+ if (!extent_op)
+ return -ENOMEM;
+
+ extent_op->flags_to_set = flags;
+ extent_op->update_flags = 1;
+ extent_op->update_key = 0;
+ extent_op->is_data = is_data ? 1 : 0;
+ extent_op->level = level;
+
+ ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+ num_bytes, extent_op);
+ if (ret)
+ btrfs_free_delayed_extent_op(extent_op);
+ return ret;
+}
+
+static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid, u64 offset, u64 bytenr)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_data_ref *data_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ int ret = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (!head) {
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+ }
+
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's released and let
+ * caller try again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ return -EAGAIN;
+ }
+ spin_unlock(&delayed_refs->lock);
+
+ spin_lock(&head->lock);
+ node = rb_first(&head->ref_root);
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
+
+ /* If it's a shared ref we know a cross reference exists */
+ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
+ ret = 1;
+ break;
+ }
+
+ data_ref = btrfs_delayed_node_to_data_ref(ref);
+
+ /*
+ * If our ref doesn't match the one we're currently looking at
+ * then we have a cross reference.
+ */
+ if (data_ref->root != root->root_key.objectid ||
+ data_ref->objectid != objectid ||
+ data_ref->offset != offset) {
+ ret = 1;
+ break;
+ }
+ }
+ spin_unlock(&head->lock);
+ mutex_unlock(&head->mutex);
+ return ret;
+}
+
+static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid, u64 offset, u64 bytenr)
+{
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_data_ref *ref;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+
+ key.objectid = bytenr;
+ key.offset = (u64)-1;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0); /* Corruption */
+
+ ret = -ENOENT;
+ if (path->slots[0] == 0)
+ goto out;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
+ goto out;
+
+ ret = 1;
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+ goto out;
+ }
+#endif
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+
+ if (item_size != sizeof(*ei) +
+ btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+ goto out;
+
+ if (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out;
+
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ if (btrfs_extent_inline_ref_type(leaf, iref) !=
+ BTRFS_EXTENT_DATA_REF_KEY)
+ goto out;
+
+ ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ if (btrfs_extent_refs(leaf, ei) !=
+ btrfs_extent_data_ref_count(leaf, ref) ||
+ btrfs_extent_data_ref_root(leaf, ref) !=
+ root->root_key.objectid ||
+ btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
+ btrfs_extent_data_ref_offset(leaf, ref) != offset)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 offset, u64 bytenr)
+{
+ struct btrfs_path *path;
+ int ret;
+ int ret2;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOENT;
+
+ do {
+ ret = check_committed_ref(trans, root, path, objectid,
+ offset, bytenr);
+ if (ret && ret != -ENOENT)
+ goto out;
+
+ ret2 = check_delayed_ref(trans, root, path, objectid,
+ offset, bytenr);
+ } while (ret2 == -EAGAIN);
+
+ if (ret2 && ret2 != -ENOENT) {
+ ret = ret2;
+ goto out;
+ }
+
+ if (ret != -ENOENT || ret2 != -ENOENT)
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ WARN_ON(ret > 0);
+ return ret;
+}
+
+static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ int full_backref, int inc)
+{
+ u64 bytenr;
+ u64 num_bytes;
+ u64 parent;
+ u64 ref_root;
+ u32 nritems;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int i;
+ int level;
+ int ret = 0;
+ int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
+ u64, u64, u64, u64, u64, u64, int);
+
+
+ if (btrfs_test_is_dummy_root(root))
+ return 0;
+
+ ref_root = btrfs_header_owner(buf);
+ nritems = btrfs_header_nritems(buf);
+ level = btrfs_header_level(buf);
+
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
+ return 0;
+
+ if (inc)
+ process_func = btrfs_inc_extent_ref;
+ else
+ process_func = btrfs_free_extent;
+
+ if (full_backref)
+ parent = buf->start;
+ else
+ parent = 0;
+
+ for (i = 0; i < nritems; i++) {
+ if (level == 0) {
+ btrfs_item_key_to_cpu(buf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(buf, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(buf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (bytenr == 0)
+ continue;
+
+ num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+ key.offset -= btrfs_file_extent_offset(buf, fi);
+ ret = process_func(trans, root, bytenr, num_bytes,
+ parent, ref_root, key.objectid,
+ key.offset, 1);
+ if (ret)
+ goto fail;
+ } else {
+ bytenr = btrfs_node_blockptr(buf, i);
+ num_bytes = root->nodesize;
+ ret = process_func(trans, root, bytenr, num_bytes,
+ parent, ref_root, level - 1, 0,
+ 1);
+ if (ret)
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ return ret;
+}
+
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref)
+{
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+}
+
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, int full_backref)
+{
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+}
+
+static int write_one_cache_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_block_group_cache *cache)
+{
+ int ret;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ unsigned long bi;
+ struct extent_buffer *leaf;
+
+ ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ leaf = path->nodes[0];
+ bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
+ btrfs_mark_buffer_dirty(leaf);
+fail:
+ btrfs_release_path(path);
+ return ret;
+
+}
+
+static struct btrfs_block_group_cache *
+next_block_group(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ struct rb_node *node;
+
+ spin_lock(&root->fs_info->block_group_cache_lock);
+
+ /* If our block group was removed, we need a full search. */
+ if (RB_EMPTY_NODE(&cache->cache_node)) {
+ const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_first_block_group(root->fs_info,
+ next_bytenr);
+ return cache;
+ }
+ node = rb_next(&cache->cache_node);
+ btrfs_put_block_group(cache);
+ if (node) {
+ cache = rb_entry(node, struct btrfs_block_group_cache,
+ cache_node);
+ btrfs_get_block_group(cache);
+ } else
+ cache = NULL;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ return cache;
+}
+
+static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = block_group->fs_info->tree_root;
+ struct inode *inode = NULL;
+ u64 alloc_hint = 0;
+ int dcs = BTRFS_DC_ERROR;
+ u64 num_pages = 0;
+ int retries = 0;
+ int ret = 0;
+
+ /*
+ * If this block group is smaller than 100 megs don't bother caching the
+ * block group.
+ */
+ if (block_group->key.offset < (100 * 1024 * 1024)) {
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ if (trans->aborted)
+ return 0;
+again:
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+ ret = PTR_ERR(inode);
+ btrfs_release_path(path);
+ goto out;
+ }
+
+ if (IS_ERR(inode)) {
+ BUG_ON(retries);
+ retries++;
+
+ if (block_group->ro)
+ goto out_free;
+
+ ret = create_free_space_inode(root, trans, block_group, path);
+ if (ret)
+ goto out_free;
+ goto again;
+ }
+
+ /* We've already setup this transaction, go ahead and exit */
+ if (block_group->cache_generation == trans->transid &&
+ i_size_read(inode)) {
+ dcs = BTRFS_DC_SETUP;
+ goto out_put;
+ }
+
+ /*
+ * We want to set the generation to 0, that way if anything goes wrong
+ * from here on out we know not to trust this cache when we load up next
+ * time.
+ */
+ BTRFS_I(inode)->generation = 0;
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ /*
+ * So theoretically we could recover from this, simply set the
+ * super cache generation to 0 so we know to invalidate the
+ * cache, but then we'd have to keep track of the block groups
+ * that fail this way so we know we _have_ to reset this cache
+ * before the next commit or risk reading stale cache. So to
+ * limit our exposure to horrible edge cases lets just abort the
+ * transaction, this only happens in really bad situations
+ * anyway.
+ */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
+ WARN_ON(ret);
+
+ if (i_size_read(inode) > 0) {
+ ret = btrfs_check_trunc_cache_free_space(root,
+ &root->fs_info->global_block_rsv);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
+ if (ret)
+ goto out_put;
+ }
+
+ spin_lock(&block_group->lock);
+ if (block_group->cached != BTRFS_CACHE_FINISHED ||
+ !btrfs_test_opt(root, SPACE_CACHE)) {
+ /*
+ * don't bother trying to write stuff out _if_
+ * a) we're not cached,
+ * b) we're with nospace_cache mount option.
+ */
+ dcs = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ goto out_put;
+ }
+ spin_unlock(&block_group->lock);
+
+ /*
+ * Try to preallocate enough space based on how big the block group is.
+ * Keep in mind this has to include any pinned space which could end up
+ * taking up quite a bit since it's not folded into the other space
+ * cache.
+ */
+ num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
+ if (!num_pages)
+ num_pages = 1;
+
+ num_pages *= 16;
+ num_pages *= PAGE_CACHE_SIZE;
+
+ ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+ num_pages, num_pages,
+ &alloc_hint);
+ if (!ret)
+ dcs = BTRFS_DC_SETUP;
+ btrfs_free_reserved_data_space(inode, num_pages);
+
+out_put:
+ iput(inode);
+out_free:
+ btrfs_release_path(path);
+out:
+ spin_lock(&block_group->lock);
+ if (!ret && dcs == BTRFS_DC_SETUP)
+ block_group->cache_generation = trans->transid;
+ block_group->disk_cache_state = dcs;
+ spin_unlock(&block_group->lock);
+
+ return ret;
+}
+
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *cache, *tmp;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_path *path;
+
+ if (list_empty(&cur_trans->dirty_bgs) ||
+ !btrfs_test_opt(root, SPACE_CACHE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Could add new block groups, use _safe just in case */
+ list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+ dirty_list) {
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+ cache_save_setup(cache, trans, path);
+ }
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS. This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO. There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
+ int should_put;
+ struct btrfs_path *path = NULL;
+ LIST_HEAD(dirty);
+ struct list_head *io = &cur_trans->io_bgs;
+ int num_started = 0;
+ int loops = 0;
+
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cur_trans->dirty_bgs)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ return 0;
+ }
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+ /*
+ * make sure all the block groups on our dirty list actually
+ * exist
+ */
+ btrfs_create_pending_block_groups(trans, root);
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ /*
+ * cache_write_mutex is here only to save us from balance or automatic
+ * removal of empty block groups deleting this block group while we are
+ * writing out the cache
+ */
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ while (!list_empty(&dirty)) {
+ cache = list_first_entry(&dirty,
+ struct btrfs_block_group_cache,
+ dirty_list);
+ /*
+ * this can happen if something re-dirties a block
+ * group that is already under IO. Just wait for it to
+ * finish and then do it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(root, trans, cache,
+ &cache->io_ctl, path,
+ cache->key.objectid);
+ btrfs_put_block_group(cache);
+ }
+
+
+ /*
+ * btrfs_wait_cache_io uses the cache->dirty_list to decide
+ * if it should update the cache_state. Don't delete
+ * until after we wait.
+ *
+ * Since we're not running in the commit critical section
+ * we need the dirty_bgs_lock to protect from update_block_group
+ */
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
+ if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(root, trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ num_started++;
+ should_put = 0;
+
+ /*
+ * the cache_write_mutex is protecting
+ * the io_list
+ */
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * if we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
+ ret = write_one_cache_group(trans, root, path, cache);
+ /*
+ * Our block group might still be attached to the list
+ * of new block groups in the transaction handle of some
+ * other task (struct btrfs_trans_handle->new_bgs). This
+ * means its block group item isn't yet in the extent
+ * tree. If this happens ignore the error, as we will
+ * try again later in the critical section of the
+ * transaction commit.
+ */
+ if (ret == -ENOENT) {
+ ret = 0;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &cur_trans->dirty_bgs);
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ } else if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ }
+ }
+
+ /* if its not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+
+ if (ret)
+ break;
+
+ /*
+ * Avoid blocking other tasks for too long. It might even save
+ * us from writing caches for block groups that are going to be
+ * removed.
+ */
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ }
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+ /*
+ * go through delayed refs for all the stuff we've just kicked off
+ * and then loop back (just once)
+ */
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ if (!ret && loops == 0) {
+ loops++;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ /*
+ * dirty_bgs_lock protects us from concurrent block group
+ * deletes too (not just cache_write_mutex).
+ */
+ if (!list_empty(&dirty)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ goto again;
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
+ int should_put;
+ struct btrfs_path *path;
+ struct list_head *io = &cur_trans->io_bgs;
+ int num_started = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * We don't need the lock here since we are protected by the transaction
+ * commit. We want to do the cache_save_setup first and then run the
+ * delayed refs to make sure we have the best chance at doing this all
+ * in one shot.
+ */
+ while (!list_empty(&cur_trans->dirty_bgs)) {
+ cache = list_first_entry(&cur_trans->dirty_bgs,
+ struct btrfs_block_group_cache,
+ dirty_list);
+
+ /*
+ * this can happen if cache_save_setup re-dirties a block
+ * group that is already under IO. Just wait for it to
+ * finish and then do it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(root, trans, cache,
+ &cache->io_ctl, path,
+ cache->key.objectid);
+ btrfs_put_block_group(cache);
+ }
+
+ /*
+ * don't remove from the dirty list until after we've waited
+ * on any pending IO
+ */
+ list_del_init(&cache->dirty_list);
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+
+ if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(root, trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ num_started++;
+ should_put = 0;
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * if we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
+ ret = write_one_cache_group(trans, root, path, cache);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ }
+
+ /* if its not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+ }
+
+ while (!list_empty(io)) {
+ cache = list_first_entry(io, struct btrfs_block_group_cache,
+ io_list);
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(root, trans, cache,
+ &cache->io_ctl, path, cache->key.objectid);
+ btrfs_put_block_group(cache);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
+{
+ struct btrfs_block_group_cache *block_group;
+ int readonly = 0;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+ if (!block_group || block_group->ro)
+ readonly = 1;
+ if (block_group)
+ btrfs_put_block_group(block_group);
+ return readonly;
+}
+
+static const char *alloc_name(u64 flags)
+{
+ switch (flags) {
+ case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
+ return "mixed";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "metadata";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "data";
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "system";
+ default:
+ WARN_ON(1);
+ return "invalid-combination";
+ };
+}
+
+static int update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ struct btrfs_space_info **space_info)
+{
+ struct btrfs_space_info *found;
+ int i;
+ int factor;
+ int ret;
+
+ if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
+
+ found = __find_space_info(info, flags);
+ if (found) {
+ spin_lock(&found->lock);
+ found->total_bytes += total_bytes;
+ found->disk_total += total_bytes * factor;
+ found->bytes_used += bytes_used;
+ found->disk_used += bytes_used * factor;
+ found->full = 0;
+ spin_unlock(&found->lock);
+ *space_info = found;
+ return 0;
+ }
+ found = kzalloc(sizeof(*found), GFP_NOFS);
+ if (!found)
+ return -ENOMEM;
+
+ ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
+ if (ret) {
+ kfree(found);
+ return ret;
+ }
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ INIT_LIST_HEAD(&found->block_groups[i]);
+ init_rwsem(&found->groups_sem);
+ spin_lock_init(&found->lock);
+ found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ found->total_bytes = total_bytes;
+ found->disk_total = total_bytes * factor;
+ found->bytes_used = bytes_used;
+ found->disk_used = bytes_used * factor;
+ found->bytes_pinned = 0;
+ found->bytes_reserved = 0;
+ found->bytes_readonly = 0;
+ found->bytes_may_use = 0;
+ found->full = 0;
+ found->force_alloc = CHUNK_ALLOC_NO_FORCE;
+ found->chunk_alloc = 0;
+ found->flush = 0;
+ init_waitqueue_head(&found->wait);
+ INIT_LIST_HEAD(&found->ro_bgs);
+
+ ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
+ info->space_info_kobj, "%s",
+ alloc_name(found->flags));
+ if (ret) {
+ kfree(found);
+ return ret;
+ }
+
+ *space_info = found;
+ list_add_rcu(&found->list, &info->space_info);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ info->data_sinfo = found;
+
+ return ret;
+}
+
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ write_seqlock(&fs_info->profiles_lock);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
+}
+
+/*
+ * returns target flags in extended format or 0 if restripe for this
+ * chunk_type is not in progress
+ *
+ * should be called with either volume_mutex or balance_lock held
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ u64 target = 0;
+
+ if (!bctl)
+ return 0;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ return target;
+}
+
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format. If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
+static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+{
+ u64 num_devices = root->fs_info->fs_devices->rw_devices;
+ u64 target;
+ u64 tmp;
+
+ /*
+ * see if restripe for this chunk_type is in progress, if so
+ * try to reduce to the target profile
+ */
+ spin_lock(&root->fs_info->balance_lock);
+ target = get_restripe_target(root->fs_info, flags);
+ if (target) {
+ /* pick target profile only if it's already available */
+ if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+ spin_unlock(&root->fs_info->balance_lock);
+ return extended_to_chunk(target);
+ }
+ }
+ spin_unlock(&root->fs_info->balance_lock);
+
+ /* First, mask out the RAID levels which aren't possible */
+ if (num_devices == 1)
+ flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5);
+ if (num_devices < 3)
+ flags &= ~BTRFS_BLOCK_GROUP_RAID6;
+ if (num_devices < 4)
+ flags &= ~BTRFS_BLOCK_GROUP_RAID10;
+
+ tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+ flags &= ~tmp;
+
+ if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+ tmp = BTRFS_BLOCK_GROUP_RAID6;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+ tmp = BTRFS_BLOCK_GROUP_RAID5;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+ tmp = BTRFS_BLOCK_GROUP_RAID10;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+ tmp = BTRFS_BLOCK_GROUP_RAID1;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+ tmp = BTRFS_BLOCK_GROUP_RAID0;
+
+ return extended_to_chunk(flags | tmp);
+}
+
+static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
+{
+ unsigned seq;
+ u64 flags;
+
+ do {
+ flags = orig_flags;
+ seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= root->fs_info->avail_data_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= root->fs_info->avail_system_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&root->fs_info->profiles_lock, seq));
+
+ return btrfs_reduce_alloc_profile(root, flags);
+}
+
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+{
+ u64 flags;
+ u64 ret;
+
+ if (data)
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ else if (root == root->fs_info->chunk_root)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+
+ ret = get_alloc_profile(root, flags);
+ return ret;
+}
+
+/*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+{
+ struct btrfs_space_info *data_sinfo;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 used;
+ int ret = 0;
+ int need_commit = 2;
+ int have_pinned_space;
+
+ /* make sure bytes are sectorsize aligned */
+ bytes = ALIGN(bytes, root->sectorsize);
+
+ if (btrfs_is_free_space_inode(inode)) {
+ need_commit = 0;
+ ASSERT(current->journal_info);
+ }
+
+ data_sinfo = fs_info->data_sinfo;
+ if (!data_sinfo)
+ goto alloc;
+
+again:
+ /* make sure we have enough space to handle the data first */
+ spin_lock(&data_sinfo->lock);
+ used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+ data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+ data_sinfo->bytes_may_use;
+
+ if (used + bytes > data_sinfo->total_bytes) {
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * if we don't have enough free bytes in this space then we need
+ * to alloc a new chunk.
+ */
+ if (!data_sinfo->full) {
+ u64 alloc_target;
+
+ data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
+ spin_unlock(&data_sinfo->lock);
+alloc:
+ alloc_target = btrfs_get_alloc_profile(root, 1);
+ /*
+ * It is ugly that we don't call nolock join
+ * transaction for the free space inode case here.
+ * But it is safe because we only do the data space
+ * reservation for the free space cache in the
+ * transaction context, the common join transaction
+ * just increase the counter of the current transaction
+ * handler, doesn't try to acquire the trans_lock of
+ * the fs.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ alloc_target,
+ CHUNK_ALLOC_NO_FORCE);
+ btrfs_end_transaction(trans, root);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ return ret;
+ else {
+ have_pinned_space = 1;
+ goto commit_trans;
+ }
+ }
+
+ if (!data_sinfo)
+ data_sinfo = fs_info->data_sinfo;
+
+ goto again;
+ }
+
+ /*
+ * If we don't have enough pinned space to deal with this
+ * allocation, and no removed chunk in current transaction,
+ * don't bother committing the transaction.
+ */
+ have_pinned_space = percpu_counter_compare(
+ &data_sinfo->total_bytes_pinned,
+ used + bytes - data_sinfo->total_bytes);
+ spin_unlock(&data_sinfo->lock);
+
+ /* commit the current transaction and try again */
+commit_trans:
+ if (need_commit &&
+ !atomic_read(&root->fs_info->open_ioctl_trans)) {
+ need_commit--;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ if (have_pinned_space >= 0 ||
+ trans->transaction->have_free_bgs ||
+ need_commit > 0) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ /*
+ * make sure that all running delayed iput are
+ * done
+ */
+ down_write(&root->fs_info->delayed_iput_sem);
+ up_write(&root->fs_info->delayed_iput_sem);
+ goto again;
+ } else {
+ btrfs_end_transaction(trans, root);
+ }
+ }
+
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ return -ENOSPC;
+ }
+ ret = btrfs_qgroup_reserve(root, write_bytes);
+ if (ret)
+ goto out;
+ data_sinfo->bytes_may_use += bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ data_sinfo->flags, bytes, 1);
+out:
+ spin_unlock(&data_sinfo->lock);
+
+ return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_space_info *data_sinfo;
+
+ /* make sure bytes are sectorsize aligned */
+ bytes = ALIGN(bytes, root->sectorsize);
+
+ data_sinfo = root->fs_info->data_sinfo;
+ spin_lock(&data_sinfo->lock);
+ WARN_ON(data_sinfo->bytes_may_use < bytes);
+ data_sinfo->bytes_may_use -= bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ data_sinfo->flags, bytes, 0);
+ spin_unlock(&data_sinfo->lock);
+}
+
+static void force_metadata_allocation(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ found->force_alloc = CHUNK_ALLOC_FORCE;
+ }
+ rcu_read_unlock();
+}
+
+static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
+{
+ return (global->size << 1);
+}
+
+static int should_alloc_chunk(struct btrfs_root *root,
+ struct btrfs_space_info *sinfo, int force)
+{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+ u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
+ u64 thresh;
+
+ if (force == CHUNK_ALLOC_FORCE)
+ return 1;
+
+ /*
+ * We need to take into account the global rsv because for all intents
+ * and purposes it's used space. Don't worry about locking the
+ * global_rsv, it doesn't change except when the transaction commits.
+ */
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+ num_allocated += calc_global_rsv_need_space(global_rsv);
+
+ /*
+ * in limited mode, we want to have some free space up to
+ * about 1% of the FS size.
+ */
+ if (force == CHUNK_ALLOC_LIMITED) {
+ thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
+ thresh = max_t(u64, 64 * 1024 * 1024,
+ div_factor_fine(thresh, 1));
+
+ if (num_bytes - num_allocated < thresh)
+ return 1;
+ }
+
+ if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
+ return 0;
+ return 1;
+}
+
+static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+{
+ u64 num_dev;
+
+ if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6))
+ num_dev = root->fs_info->fs_devices->rw_devices;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1)
+ num_dev = 2;
+ else
+ num_dev = 1; /* DUP or single */
+
+ /* metadata for updaing devices and chunk tree */
+ return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+}
+
+static void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 type)
+{
+ struct btrfs_space_info *info;
+ u64 left;
+ u64 thresh;
+
+ info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ spin_lock(&info->lock);
+ left = info->total_bytes - info->bytes_used - info->bytes_pinned -
+ info->bytes_reserved - info->bytes_readonly;
+ spin_unlock(&info->lock);
+
+ thresh = get_system_chunk_thresh(root, type);
+ if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
+ left, thresh, type);
+ dump_space_info(info, 0, 0);
+ }
+
+ if (left < thresh) {
+ u64 flags;
+
+ flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+ btrfs_alloc_chunk(trans, root, flags);
+ }
+}
+
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 flags, int force)
+{
+ struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ int wait_for_alloc = 0;
+ int ret = 0;
+
+ /* Don't re-enter if we're already allocating a chunk */
+ if (trans->allocating_chunk)
+ return -ENOSPC;
+
+ space_info = __find_space_info(extent_root->fs_info, flags);
+ if (!space_info) {
+ ret = update_space_info(extent_root->fs_info, flags,
+ 0, 0, &space_info);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+ BUG_ON(!space_info); /* Logic error */
+
+again:
+ spin_lock(&space_info->lock);
+ if (force < space_info->force_alloc)
+ force = space_info->force_alloc;
+ if (space_info->full) {
+ if (should_alloc_chunk(extent_root, space_info, force))
+ ret = -ENOSPC;
+ else
+ ret = 0;
+ spin_unlock(&space_info->lock);
+ return ret;
+ }
+
+ if (!should_alloc_chunk(extent_root, space_info, force)) {
+ spin_unlock(&space_info->lock);
+ return 0;
+ } else if (space_info->chunk_alloc) {
+ wait_for_alloc = 1;
+ } else {
+ space_info->chunk_alloc = 1;
+ }
+
+ spin_unlock(&space_info->lock);
+
+ mutex_lock(&fs_info->chunk_mutex);
+
+ /*
+ * The chunk_mutex is held throughout the entirety of a chunk
+ * allocation, so once we've acquired the chunk_mutex we know that the
+ * other guy is done and we need to recheck and see if we should
+ * allocate.
+ */
+ if (wait_for_alloc) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ wait_for_alloc = 0;
+ goto again;
+ }
+
+ trans->allocating_chunk = true;
+
+ /*
+ * If we have mixed data/metadata chunks we want to make sure we keep
+ * allocating mixed chunks instead of individual chunks.
+ */
+ if (btrfs_mixed_space_info(space_info))
+ flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+
+ /*
+ * if we're doing a data chunk, go ahead and make sure that
+ * we keep a reasonable number of metadata chunks allocated in the
+ * FS as well.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
+ fs_info->data_chunk_allocations++;
+ if (!(fs_info->data_chunk_allocations %
+ fs_info->metadata_ratio))
+ force_metadata_allocation(fs_info);
+ }
+
+ /*
+ * Check if we have enough space in SYSTEM chunk because we may need
+ * to update devices.
+ */
+ check_system_chunk(trans, extent_root, flags);
+
+ ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ trans->allocating_chunk = false;
+
+ spin_lock(&space_info->lock);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out;
+ if (ret)
+ space_info->full = 1;
+ else
+ ret = 1;
+
+ space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
+ space_info->chunk_alloc = 0;
+ spin_unlock(&space_info->lock);
+ mutex_unlock(&fs_info->chunk_mutex);
+ return ret;
+}
+
+static int can_overcommit(struct btrfs_root *root,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 space_size;
+ u64 avail;
+ u64 used;
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly;
+
+ /*
+ * We only want to allow over committing if we have lots of actual space
+ * free, but if we don't have enough space to handle the global reserve
+ * space then we could end up having a real enospc problem when trying
+ * to allocate a chunk or some other such important allocation.
+ */
+ spin_lock(&global_rsv->lock);
+ space_size = calc_global_rsv_need_space(global_rsv);
+ spin_unlock(&global_rsv->lock);
+ if (used + space_size >= space_info->total_bytes)
+ return 0;
+
+ used += space_info->bytes_may_use;
+
+ spin_lock(&root->fs_info->free_chunk_lock);
+ avail = root->fs_info->free_chunk_space;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
+ /*
+ * If we have dup, raid1 or raid10 then only half of the free
+ * space is actually useable. For raid56, the space info used
+ * doesn't include the parity drive, so we don't have to
+ * change the math
+ */
+ if (profile & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ avail >>= 1;
+
+ /*
+ * If we aren't flushing all things, let us overcommit up to
+ * 1/2th of the space. If we can flush, don't let us overcommit
+ * too much, let it overcommit up to 1/8 of the space.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ avail >>= 3;
+ else
+ avail >>= 1;
+
+ if (used + bytes < space_info->total_bytes + avail)
+ return 1;
+ return 0;
+}
+
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+ unsigned long nr_pages, int nr_items)
+{
+ struct super_block *sb = root->fs_info->sb;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ } else {
+ /*
+ * We needn't worry the filesystem going from r/w to r/o though
+ * we don't acquire ->s_umount mutex, because the filesystem
+ * should guarantee the delalloc inodes list be empty after
+ * the filesystem is readonly(all dirty pages are written to
+ * the disk).
+ */
+ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
+ if (!current->journal_info)
+ btrfs_wait_ordered_roots(root->fs_info, nr_items);
+ }
+}
+
+static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
+{
+ u64 bytes;
+ int nr;
+
+ bytes = btrfs_calc_trans_metadata_size(root, 1);
+ nr = (int)div64_u64(to_reclaim, bytes);
+ if (!nr)
+ nr = 1;
+ return nr;
+}
+
+#define EXTENT_SIZE_PER_ITEM (256 * 1024)
+
+/*
+ * shrink metadata reservation for delalloc
+ */
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+ bool wait_ordered)
+{
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_space_info *space_info;
+ struct btrfs_trans_handle *trans;
+ u64 delalloc_bytes;
+ u64 max_reclaim;
+ long time_left;
+ unsigned long nr_pages;
+ int loops;
+ int items;
+ enum btrfs_reserve_flush_enum flush;
+
+ /* Calc the number of the pages we need flush for space reservation */
+ items = calc_reclaim_items_nr(root, to_reclaim);
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+
+ trans = (struct btrfs_trans_handle *)current->journal_info;
+ block_rsv = &root->fs_info->delalloc_block_rsv;
+ space_info = block_rsv->space_info;
+
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
+ if (delalloc_bytes == 0) {
+ if (trans)
+ return;
+ if (wait_ordered)
+ btrfs_wait_ordered_roots(root->fs_info, items);
+ return;
+ }
+
+ loops = 0;
+ while (delalloc_bytes && loops < 3) {
+ max_reclaim = min(delalloc_bytes, to_reclaim);
+ nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+ btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
+ /*
+ * We need to wait for the async pages to actually start before
+ * we do anything.
+ */
+ max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
+ if (!max_reclaim)
+ goto skip_async;
+
+ if (max_reclaim <= nr_pages)
+ max_reclaim = 0;
+ else
+ max_reclaim -= nr_pages;
+
+ wait_event(root->fs_info->async_submit_wait,
+ atomic_read(&root->fs_info->async_delalloc_pages) <=
+ (int)max_reclaim);
+skip_async:
+ if (!trans)
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ else
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ spin_lock(&space_info->lock);
+ if (can_overcommit(root, space_info, orig, flush)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
+ spin_unlock(&space_info->lock);
+
+ loops++;
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_roots(root->fs_info, items);
+ } else {
+ time_left = schedule_timeout_killable(1);
+ if (time_left)
+ break;
+ }
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
+ }
+}
+
+/**
+ * maybe_commit_transaction - possibly commit the transaction if its ok to
+ * @root - the root we're allocating for
+ * @bytes - the number of bytes we want to reserve
+ * @force - force the commit
+ *
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does. Otherwise it
+ * will return -ENOSPC.
+ */
+static int may_commit_transaction(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 bytes, int force)
+{
+ struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+ struct btrfs_trans_handle *trans;
+
+ trans = (struct btrfs_trans_handle *)current->journal_info;
+ if (trans)
+ return -EAGAIN;
+
+ if (force)
+ goto commit;
+
+ /* See if there is enough pinned space to make this reservation */
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes) >= 0)
+ goto commit;
+
+ /*
+ * See if there is some space in the delayed insertion reservation for
+ * this reservation.
+ */
+ if (space_info != delayed_rsv->space_info)
+ return -ENOSPC;
+
+ spin_lock(&delayed_rsv->lock);
+ if (percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes - delayed_rsv->size) >= 0) {
+ spin_unlock(&delayed_rsv->lock);
+ return -ENOSPC;
+ }
+ spin_unlock(&delayed_rsv->lock);
+
+commit:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return -ENOSPC;
+
+ return btrfs_commit_transaction(trans, root);
+}
+
+enum flush_state {
+ FLUSH_DELAYED_ITEMS_NR = 1,
+ FLUSH_DELAYED_ITEMS = 2,
+ FLUSH_DELALLOC = 3,
+ FLUSH_DELALLOC_WAIT = 4,
+ ALLOC_CHUNK = 5,
+ COMMIT_TRANS = 6,
+};
+
+static int flush_space(struct btrfs_root *root,
+ struct btrfs_space_info *space_info, u64 num_bytes,
+ u64 orig_bytes, int state)
+{
+ struct btrfs_trans_handle *trans;
+ int nr;
+ int ret = 0;
+
+ switch (state) {
+ case FLUSH_DELAYED_ITEMS_NR:
+ case FLUSH_DELAYED_ITEMS:
+ if (state == FLUSH_DELAYED_ITEMS_NR)
+ nr = calc_reclaim_items_nr(root, num_bytes) * 2;
+ else
+ nr = -1;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_run_delayed_items_nr(trans, root, nr);
+ btrfs_end_transaction(trans, root);
+ break;
+ case FLUSH_DELALLOC:
+ case FLUSH_DELALLOC_WAIT:
+ shrink_delalloc(root, num_bytes * 2, orig_bytes,
+ state == FLUSH_DELALLOC_WAIT);
+ break;
+ case ALLOC_CHUNK:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+ btrfs_end_transaction(trans, root);
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
+ case COMMIT_TRANS:
+ ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+ break;
+ default:
+ ret = -ENOSPC;
+ break;
+ }
+
+ return ret;
+}
+
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
+ struct btrfs_space_info *space_info)
+{
+ u64 used;
+ u64 expected;
+ u64 to_reclaim;
+
+ to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
+ 16 * 1024 * 1024);
+ spin_lock(&space_info->lock);
+ if (can_overcommit(root, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL)) {
+ to_reclaim = 0;
+ goto out;
+ }
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (can_overcommit(root, space_info, 1024 * 1024,
+ BTRFS_RESERVE_FLUSH_ALL))
+ expected = div_factor_fine(space_info->total_bytes, 95);
+ else
+ expected = div_factor_fine(space_info->total_bytes, 90);
+
+ if (used > expected)
+ to_reclaim = used - expected;
+ else
+ to_reclaim = 0;
+ to_reclaim = min(to_reclaim, space_info->bytes_may_use +
+ space_info->bytes_reserved);
+out:
+ spin_unlock(&space_info->lock);
+
+ return to_reclaim;
+}
+
+static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info, u64 used)
+{
+ u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+ /* If we're just plain full then async reclaim just slows us down. */
+ if (space_info->bytes_used >= thresh)
+ return 0;
+
+ return (used >= thresh && !btrfs_fs_closing(fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info,
+ int flush_state)
+{
+ u64 used;
+
+ spin_lock(&space_info->lock);
+ /*
+ * We run out of space and have not got any free space via flush_space,
+ * so don't bother doing async reclaim.
+ */
+ if (flush_state > COMMIT_TRANS && space_info->full) {
+ spin_unlock(&space_info->lock);
+ return 0;
+ }
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (need_do_async_reclaim(space_info, fs_info, used)) {
+ spin_unlock(&space_info->lock);
+ return 1;
+ }
+ spin_unlock(&space_info->lock);
+
+ return 0;
+}
+
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 to_reclaim;
+ int flush_state;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ if (!to_reclaim)
+ return;
+
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ do {
+ flush_space(fs_info->fs_root, space_info, to_reclaim,
+ to_reclaim, flush_state);
+ flush_state++;
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+ flush_state))
+ return;
+ } while (flush_state < COMMIT_TRANS);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+ INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+ u64 used;
+ u64 num_bytes = orig_bytes;
+ int flush_state = FLUSH_DELAYED_ITEMS_NR;
+ int ret = 0;
+ bool flushing = false;
+
+again:
+ ret = 0;
+ spin_lock(&space_info->lock);
+ /*
+ * We only want to wait if somebody other than us is flushing and we
+ * are actually allowed to flush all things.
+ */
+ while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+ space_info->flush) {
+ spin_unlock(&space_info->lock);
+ /*
+ * If we have a trans handle we can't wait because the flusher
+ * may have to commit the transaction, which would mean we would
+ * deadlock since we are waiting for the flusher to finish, but
+ * hold the current transaction open.
+ */
+ if (current->journal_info)
+ return -EAGAIN;
+ ret = wait_event_killable(space_info->wait, !space_info->flush);
+ /* Must have been killed, return */
+ if (ret)
+ return -EINTR;
+
+ spin_lock(&space_info->lock);
+ }
+
+ ret = -ENOSPC;
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+
+ /*
+ * The idea here is that we've not already over-reserved the block group
+ * then we can go ahead and save our reservation first and then start
+ * flushing if we need to. Otherwise if we've already overcommitted
+ * lets start flushing stuff first and then come back and try to make
+ * our reservation.
+ */
+ if (used <= space_info->total_bytes) {
+ if (used + orig_bytes <= space_info->total_bytes) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info", space_info->flags, orig_bytes, 1);
+ ret = 0;
+ } else {
+ /*
+ * Ok set num_bytes to orig_bytes since we aren't
+ * overocmmitted, this way we only try and reclaim what
+ * we need.
+ */
+ num_bytes = orig_bytes;
+ }
+ } else {
+ /*
+ * Ok we're over committed, set num_bytes to the overcommitted
+ * amount plus the amount of bytes that we need for this
+ * reservation.
+ */
+ num_bytes = used - space_info->total_bytes +
+ (orig_bytes * 2);
+ }
+
+ if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ space_info->flags, orig_bytes,
+ 1);
+ ret = 0;
+ }
+
+ /*
+ * Couldn't make our reservation, save our place so while we're trying
+ * to reclaim space we can actually use it instead of somebody else
+ * stealing it from us.
+ *
+ * We make the other tasks wait for the flush only when we can flush
+ * all things.
+ */
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ flushing = true;
+ space_info->flush = 1;
+ } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ used += orig_bytes;
+ /*
+ * We will do the space reservation dance during log replay,
+ * which means we won't have fs_info->fs_root set, so don't do
+ * the async reclaim as we will panic.
+ */
+ if (!root->fs_info->log_root_recovering &&
+ need_do_async_reclaim(space_info, root->fs_info, used) &&
+ !work_busy(&root->fs_info->async_reclaim_work))
+ queue_work(system_unbound_wq,
+ &root->fs_info->async_reclaim_work);
+ }
+ spin_unlock(&space_info->lock);
+
+ if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
+ goto out;
+
+ ret = flush_space(root, space_info, num_bytes, orig_bytes,
+ flush_state);
+ flush_state++;
+
+ /*
+ * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
+ * would happen. So skip delalloc flush.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ (flush_state == FLUSH_DELALLOC ||
+ flush_state == FLUSH_DELALLOC_WAIT))
+ flush_state = ALLOC_CHUNK;
+
+ if (!ret)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ flush_state < COMMIT_TRANS)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ flush_state <= COMMIT_TRANS)
+ goto again;
+
+out:
+ if (ret == -ENOSPC &&
+ unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+ struct btrfs_block_rsv *global_rsv =
+ &root->fs_info->global_block_rsv;
+
+ if (block_rsv != global_rsv &&
+ !block_rsv_use_bytes(global_rsv, orig_bytes))
+ ret = 0;
+ }
+ if (ret == -ENOSPC)
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info:enospc",
+ space_info->flags, orig_bytes, 1);
+ if (flushing) {
+ spin_lock(&space_info->lock);
+ space_info->flush = 0;
+ wake_up_all(&space_info->wait);
+ spin_unlock(&space_info->lock);
+ }
+ return ret;
+}
+
+static struct btrfs_block_rsv *get_block_rsv(
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *block_rsv = NULL;
+
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ block_rsv = trans->block_rsv;
+
+ if (root == root->fs_info->csum_root && trans->adding_csums)
+ block_rsv = trans->block_rsv;
+
+ if (root == root->fs_info->uuid_root)
+ block_rsv = trans->block_rsv;
+
+ if (!block_rsv)
+ block_rsv = root->block_rsv;
+
+ if (!block_rsv)
+ block_rsv = &root->fs_info->empty_block_rsv;
+
+ return block_rsv;
+}
+
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ int ret = -ENOSPC;
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved >= num_bytes) {
+ block_rsv->reserved -= num_bytes;
+ if (block_rsv->reserved < block_rsv->size)
+ block_rsv->full = 0;
+ ret = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+ return ret;
+}
+
+static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int update_size)
+{
+ spin_lock(&block_rsv->lock);
+ block_rsv->reserved += num_bytes;
+ if (update_size)
+ block_rsv->size += num_bytes;
+ else if (block_rsv->reserved >= block_rsv->size)
+ block_rsv->full = 1;
+ spin_unlock(&block_rsv->lock);
+}
+
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *dest, u64 num_bytes,
+ int min_factor)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (global_rsv->space_info != dest->space_info)
+ return -ENOSPC;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, min_factor);
+ if (global_rsv->reserved < min_bytes + num_bytes) {
+ spin_unlock(&global_rsv->lock);
+ return -ENOSPC;
+ }
+ global_rsv->reserved -= num_bytes;
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = 0;
+ spin_unlock(&global_rsv->lock);
+
+ block_rsv_add_bytes(dest, num_bytes, 1);
+ return 0;
+}
+
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
+ struct btrfs_block_rsv *dest, u64 num_bytes)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+
+ spin_lock(&block_rsv->lock);
+ if (num_bytes == (u64)-1)
+ num_bytes = block_rsv->size;
+ block_rsv->size -= num_bytes;
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = 1;
+ } else {
+ num_bytes = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (num_bytes > 0) {
+ if (dest) {
+ spin_lock(&dest->lock);
+ if (!dest->full) {
+ u64 bytes_to_add;
+
+ bytes_to_add = dest->size - dest->reserved;
+ bytes_to_add = min(num_bytes, bytes_to_add);
+ dest->reserved += bytes_to_add;
+ if (dest->reserved >= dest->size)
+ dest->full = 1;
+ num_bytes -= bytes_to_add;
+ }
+ spin_unlock(&dest->lock);
+ }
+ if (num_bytes) {
+ spin_lock(&space_info->lock);
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
+ spin_unlock(&space_info->lock);
+ }
+ }
+}
+
+static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
+ struct btrfs_block_rsv *dst, u64 num_bytes)
+{
+ int ret;
+
+ ret = block_rsv_use_bytes(src, num_bytes);
+ if (ret)
+ return ret;
+
+ block_rsv_add_bytes(dst, num_bytes, 1);
+ return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+{
+ memset(rsv, 0, sizeof(*rsv));
+ spin_lock_init(&rsv->lock);
+ rsv->type = type;
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+ unsigned short type)
+{
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+ if (!block_rsv)
+ return NULL;
+
+ btrfs_init_block_rsv(block_rsv, type);
+ block_rsv->space_info = __find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+ return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+{
+ if (!rsv)
+ return;
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ kfree(rsv);
+}
+
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
+{
+ kfree(rsv);
+}
+
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ int ret;
+
+ if (num_bytes == 0)
+ return 0;
+
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 1);
+ return 0;
+ }
+
+ return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int min_factor)
+{
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ num_bytes = div_factor(block_rsv->size, min_factor);
+ if (block_rsv->reserved >= num_bytes)
+ ret = 0;
+ spin_unlock(&block_rsv->lock);
+
+ return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 num_bytes = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ num_bytes = min_reserved;
+ if (block_rsv->reserved >= num_bytes)
+ ret = 0;
+ else
+ num_bytes -= block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (!ret)
+ return 0;
+
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ return 0;
+ }
+
+ return ret;
+}
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv,
+ u64 num_bytes)
+{
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_block_rsv_release(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ if (global_rsv == block_rsv ||
+ block_rsv->space_info != global_rsv->space_info)
+ global_rsv = NULL;
+ block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+ num_bytes);
+}
+
+/*
+ * helper to calculate size of global block reservation.
+ * the desired value is sum of space used by extent tree,
+ * checksum tree and root tree
+ */
+static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *sinfo;
+ u64 num_bytes;
+ u64 meta_used;
+ u64 data_used;
+ int csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ spin_lock(&sinfo->lock);
+ data_used = sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ spin_lock(&sinfo->lock);
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
+ data_used = 0;
+ meta_used = sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+
+ num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
+ csum_size * 2;
+ num_bytes += div_u64(data_used + meta_used, 50);
+
+ if (num_bytes * 3 > meta_used)
+ num_bytes = div_u64(meta_used, 3);
+
+ return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
+}
+
+static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ struct btrfs_space_info *sinfo = block_rsv->space_info;
+ u64 num_bytes;
+
+ num_bytes = calc_global_metadata_size(fs_info);
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&block_rsv->lock);
+
+ block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
+
+ num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
+ sinfo->bytes_reserved + sinfo->bytes_readonly +
+ sinfo->bytes_may_use;
+
+ if (sinfo->total_bytes > num_bytes) {
+ num_bytes = sinfo->total_bytes - num_bytes;
+ block_rsv->reserved += num_bytes;
+ sinfo->bytes_may_use += num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ sinfo->flags, num_bytes, 1);
+ }
+
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ sinfo->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ sinfo->flags, num_bytes, 0);
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = 1;
+ }
+
+ spin_unlock(&block_rsv->lock);
+ spin_unlock(&sinfo->lock);
+}
+
+static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ fs_info->chunk_block_rsv.space_info = space_info;
+
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ fs_info->global_block_rsv.space_info = space_info;
+ fs_info->delalloc_block_rsv.space_info = space_info;
+ fs_info->trans_block_rsv.space_info = space_info;
+ fs_info->empty_block_rsv.space_info = space_info;
+ fs_info->delayed_block_rsv.space_info = space_info;
+
+ fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+ if (fs_info->quota_root)
+ fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+
+ update_global_block_rsv(fs_info);
+}
+
+static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+ (u64)-1);
+ WARN_ON(fs_info->delalloc_block_rsv.size > 0);
+ WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
+ WARN_ON(fs_info->trans_block_rsv.size > 0);
+ WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+ WARN_ON(fs_info->chunk_block_rsv.size > 0);
+ WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+ WARN_ON(fs_info->delayed_block_rsv.size > 0);
+ WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
+}
+
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (!trans->block_rsv)
+ return;
+
+ if (!trans->bytes_reserved)
+ return;
+
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trans->transid, trans->bytes_reserved, 0);
+ btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
+ trans->bytes_reserved = 0;
+}
+
+/* Can only return 0 or -ENOSPC */
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
+
+ /*
+ * We need to hold space in order to delete our orphan item once we've
+ * added it, so this takes the reservation so we can release it later
+ * when we are truly done with the orphan item.
+ */
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 1);
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_orphan_release_metadata(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 0);
+ btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+}
+
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need do reservation
+ * qgroup_reserved: used to return the reserved size in qgroup
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations are different with the
+ * common file/directory operations, they change two fs/file trees
+ * and root tree, the number of items that the qgroup reserves is
+ * different with the free space reservation. So we can not use
+ * the space reseravtion mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int items,
+ u64 *qgroup_reserved,
+ bool use_global_rsv)
+{
+ u64 num_bytes;
+ int ret;
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+
+ if (root->fs_info->quota_enabled) {
+ /* One for parent inode, two for dir entries */
+ num_bytes = 3 * root->nodesize;
+ ret = btrfs_qgroup_reserve(root, num_bytes);
+ if (ret)
+ return ret;
+ } else {
+ num_bytes = 0;
+ }
+
+ *qgroup_reserved = num_bytes;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, items);
+ rsv->space_info = __find_space_info(root->fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+ ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+
+ if (ret == -ENOSPC && use_global_rsv)
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+
+ if (ret) {
+ if (*qgroup_reserved)
+ btrfs_qgroup_free(root, *qgroup_reserved);
+ }
+
+ return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 qgroup_reserved)
+{
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+}
+
+/**
+ * drop_outstanding_extent - drop an outstanding extent
+ * @inode: the inode we're dropping the extent for
+ * @num_bytes: the number of bytes we're relaseing.
+ *
+ * This is called when we are freeing up an outstanding extent, either called
+ * after an error or after an extent is written. This will return the number of
+ * reserved extents that need to be freed. This must be called with
+ * BTRFS_I(inode)->lock held.
+ */
+static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
+{
+ unsigned drop_inode_space = 0;
+ unsigned dropped_extents = 0;
+ unsigned num_extents = 0;
+
+ num_extents = (unsigned)div64_u64(num_bytes +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ ASSERT(num_extents);
+ ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
+ BTRFS_I(inode)->outstanding_extents -= num_extents;
+
+ if (BTRFS_I(inode)->outstanding_extents == 0 &&
+ test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
+ drop_inode_space = 1;
+
+ /*
+ * If we have more or the same amount of outsanding extents than we have
+ * reserved then we need to leave the reserved extents count alone.
+ */
+ if (BTRFS_I(inode)->outstanding_extents >=
+ BTRFS_I(inode)->reserved_extents)
+ return drop_inode_space;
+
+ dropped_extents = BTRFS_I(inode)->reserved_extents -
+ BTRFS_I(inode)->outstanding_extents;
+ BTRFS_I(inode)->reserved_extents -= dropped_extents;
+ return dropped_extents + drop_inode_space;
+}
+
+/**
+ * calc_csum_metadata_size - return the amount of metada space that must be
+ * reserved/free'd for the given bytes.
+ * @inode: the inode we're manipulating
+ * @num_bytes: the number of bytes in question
+ * @reserve: 1 if we are reserving space, 0 if we are freeing space
+ *
+ * This adjusts the number of csum_bytes in the inode and then returns the
+ * correct amount of metadata that must either be reserved or freed. We
+ * calculate how many checksums we can fit into one leaf and then divide the
+ * number of bytes that will need to be checksumed by this value to figure out
+ * how many checksums will be required. If we are adding bytes then the number
+ * may go up and we will return the number of additional bytes that must be
+ * reserved. If it is going down we will return the number of bytes that must
+ * be freed.
+ *
+ * This must be called with BTRFS_I(inode)->lock held.
+ */
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
+ int reserve)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 old_csums, num_csums;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
+ BTRFS_I(inode)->csum_bytes == 0)
+ return 0;
+
+ old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+ if (reserve)
+ BTRFS_I(inode)->csum_bytes += num_bytes;
+ else
+ BTRFS_I(inode)->csum_bytes -= num_bytes;
+ num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+
+ /* No change, no need to reserve more */
+ if (old_csums == num_csums)
+ return 0;
+
+ if (reserve)
+ return btrfs_calc_trans_metadata_size(root,
+ num_csums - old_csums);
+
+ return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
+}
+
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+ u64 to_reserve = 0;
+ u64 csum_bytes;
+ unsigned nr_extents = 0;
+ int extra_reserve = 0;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+ int ret = 0;
+ bool delalloc_lock = true;
+ u64 to_free = 0;
+ unsigned dropped;
+
+ /* If we are a free space inode we need to not flush since we will be in
+ * the middle of a transaction commit. We also don't need the delalloc
+ * mutex since we won't race with anybody. We need this mostly to make
+ * lockdep shut its filthy mouth.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ delalloc_lock = false;
+ }
+
+ if (flush != BTRFS_RESERVE_NO_FLUSH &&
+ btrfs_transaction_in_commit(root->fs_info))
+ schedule_timeout(1);
+
+ if (delalloc_lock)
+ mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
+ num_bytes = ALIGN(num_bytes, root->sectorsize);
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ nr_extents = (unsigned)div64_u64(num_bytes +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ BTRFS_I(inode)->outstanding_extents += nr_extents;
+ nr_extents = 0;
+
+ if (BTRFS_I(inode)->outstanding_extents >
+ BTRFS_I(inode)->reserved_extents)
+ nr_extents = BTRFS_I(inode)->outstanding_extents -
+ BTRFS_I(inode)->reserved_extents;
+
+ /*
+ * Add an item to reserve for updating the inode when we complete the
+ * delalloc io.
+ */
+ if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
+ nr_extents++;
+ extra_reserve = 1;
+ }
+
+ to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+ to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+ csum_bytes = BTRFS_I(inode)->csum_bytes;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ if (root->fs_info->quota_enabled) {
+ ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+ if (ret)
+ goto out_fail;
+ }
+
+ ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ if (unlikely(ret)) {
+ if (root->fs_info->quota_enabled)
+ btrfs_qgroup_free(root, nr_extents * root->nodesize);
+ goto out_fail;
+ }
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (extra_reserve) {
+ set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags);
+ nr_extents--;
+ }
+ BTRFS_I(inode)->reserved_extents += nr_extents;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+
+ if (to_reserve)
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_reserve, 1);
+ block_rsv_add_bytes(block_rsv, to_reserve, 1);
+
+ return 0;
+
+out_fail:
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode, num_bytes);
+ /*
+ * If the inodes csum_bytes is the same as the original
+ * csum_bytes then we know we haven't raced with any free()ers
+ * so we can just reduce our inodes csum bytes and carry on.
+ */
+ if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
+ calc_csum_metadata_size(inode, num_bytes, 0);
+ } else {
+ u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
+ u64 bytes;
+
+ /*
+ * This is tricky, but first we need to figure out how much we
+ * free'd from any free-ers that occured during this
+ * reservation, so we reset ->csum_bytes to the csum_bytes
+ * before we dropped our lock, and then call the free for the
+ * number of bytes that were freed while we were trying our
+ * reservation.
+ */
+ bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
+ BTRFS_I(inode)->csum_bytes = csum_bytes;
+ to_free = calc_csum_metadata_size(inode, bytes, 0);
+
+
+ /*
+ * Now we need to see how much we would have freed had we not
+ * been making this reservation and our ->csum_bytes were not
+ * artificially inflated.
+ */
+ BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
+ bytes = csum_bytes - orig_csum_bytes;
+ bytes = calc_csum_metadata_size(inode, bytes, 0);
+
+ /*
+ * Now reset ->csum_bytes to what it should be. If bytes is
+ * more than to_free then we would have free'd more space had we
+ * not had an artificially high ->csum_bytes, so we need to free
+ * the remainder. If bytes is the same or less then we don't
+ * need to do anything, the other free-ers did the correct
+ * thing.
+ */
+ BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
+ if (bytes > to_free)
+ to_free = bytes - to_free;
+ else
+ to_free = 0;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+ if (to_free) {
+ btrfs_block_rsv_release(root, block_rsv, to_free);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
+ }
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ return ret;
+}
+
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for
+ * @num_bytes: the number of bytes we're releasing
+ *
+ * This will release the metadata reservation for an inode. This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations.
+ */
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 to_free = 0;
+ unsigned dropped;
+
+ num_bytes = ALIGN(num_bytes, root->sectorsize);
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode, num_bytes);
+
+ if (num_bytes)
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped > 0)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+ if (btrfs_test_is_dummy_root(root))
+ return;
+
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
+
+ btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
+ to_free);
+}
+
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * @inode: inode we're writing to
+ * @num_bytes: the number of bytes we want to allocate
+ *
+ * This will do the following things
+ *
+ * o reserve space in the data space info for num_bytes
+ * o reserve space in the metadata space info based on number of outstanding
+ * extents and how much csums will be needed
+ * o add to the inodes ->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *
+ * This will return 0 for success and -ENOSPC if there is no space left.
+ */
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+{
+ int ret;
+
+ ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
+ if (ret) {
+ btrfs_free_reserved_data_space(inode, num_bytes);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @num_bytes: the number of bytes we want to free up
+ *
+ * This must be matched with a call to btrfs_delalloc_reserve_space. This is
+ * called in the case that we don't need the metadata AND data reservations
+ * anymore. So if there is an error or we insert an inline extent.
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ */
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+{
+ btrfs_delalloc_release_metadata(inode, num_bytes);
+ btrfs_free_reserved_data_space(inode, num_bytes);
+}
+
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, int alloc)
+{
+ struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_fs_info *info = root->fs_info;
+ u64 total = num_bytes;
+ u64 old_val;
+ u64 byte_in_group;
+ int factor;
+
+ /* block accounting for super block */
+ spin_lock(&info->delalloc_root_lock);
+ old_val = btrfs_super_bytes_used(info->super_copy);
+ if (alloc)
+ old_val += num_bytes;
+ else
+ old_val -= num_bytes;
+ btrfs_set_super_bytes_used(info->super_copy, old_val);
+ spin_unlock(&info->delalloc_root_lock);
+
+ while (total) {
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache)
+ return -ENOENT;
+ if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
+ /*
+ * If this block group has free space cache written out, we
+ * need to make sure to load it if we are removing space. This
+ * is because we need the unpinning stage to actually add the
+ * space back to the block group, otherwise we will leak space.
+ */
+ if (!alloc && cache->cached == BTRFS_CACHE_NO)
+ cache_block_group(cache, 1);
+
+ byte_in_group = bytenr - cache->key.objectid;
+ WARN_ON(byte_in_group > cache->key.offset);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+
+ if (btrfs_test_opt(root, SPACE_CACHE) &&
+ cache->disk_cache_state < BTRFS_DC_CLEAR)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+
+ old_val = btrfs_block_group_used(&cache->item);
+ num_bytes = min(total, cache->key.offset - byte_in_group);
+ if (alloc) {
+ old_val += num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ cache->space_info->bytes_used += num_bytes;
+ cache->space_info->disk_used += num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ cache->space_info->bytes_used -= num_bytes;
+ cache->space_info->disk_used -= num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ set_extent_dirty(info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1,
+ GFP_NOFS | __GFP_NOFAIL);
+ /*
+ * No longer have used bytes in this block group, queue
+ * it for deletion.
+ */
+ if (old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
+ }
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &trans->transaction->dirty_bgs);
+ trans->transaction->num_dirty_bgs++;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ btrfs_put_block_group(cache);
+ total -= num_bytes;
+ bytenr += num_bytes;
+ }
+ return 0;
+}
+
+static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 bytenr;
+
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ bytenr = root->fs_info->first_logical_byte;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+
+ if (bytenr < (u64)-1)
+ return bytenr;
+
+ cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
+ if (!cache)
+ return 0;
+
+ bytenr = cache->key.objectid;
+ btrfs_put_block_group(cache);
+
+ return bytenr;
+}
+
+static int pin_down_extent(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache,
+ u64 bytenr, u64 num_bytes, int reserved)
+{
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ if (reserved) {
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+ if (reserved)
+ trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
+ return 0;
+}
+
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ BUG_ON(!cache); /* Logic error */
+
+ pin_down_extent(root, cache, bytenr, num_bytes, reserved);
+
+ btrfs_put_block_group(cache);
+ return 0;
+}
+
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_block_group_cache *cache;
+ int ret;
+
+ cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ if (!cache)
+ return -EINVAL;
+
+ /*
+ * pull in the free space cache (if any) so that our pin
+ * removes the free space from the cache. We have load_only set
+ * to one because the slow code to read in the free extents does check
+ * the pinned extents.
+ */
+ cache_block_group(cache, 1);
+
+ pin_down_extent(root, cache, bytenr, num_bytes, 0);
+
+ /* remove us from the free space cache (if we're there at all) */
+ ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_caching_control *caching_ctl;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, start);
+ if (!block_group)
+ return -EINVAL;
+
+ cache_block_group(block_group, 0);
+ caching_ctl = get_caching_control(block_group);
+
+ if (!caching_ctl) {
+ /* Logic error */
+ BUG_ON(!block_group_cache_done(block_group));
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+ } else {
+ mutex_lock(&caching_ctl->mutex);
+
+ if (start >= caching_ctl->progress) {
+ ret = add_excluded_extent(root, start, num_bytes);
+ } else if (start + num_bytes <= caching_ctl->progress) {
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ } else {
+ num_bytes = caching_ctl->progress - start;
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ if (ret)
+ goto out_lock;
+
+ num_bytes = (start + num_bytes) -
+ caching_ctl->progress;
+ start = caching_ctl->progress;
+ ret = add_excluded_extent(root, start, num_bytes);
+ }
+out_lock:
+ mutex_unlock(&caching_ctl->mutex);
+ put_caching_control(caching_ctl);
+ }
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+int btrfs_exclude_logged_extents(struct btrfs_root *log,
+ struct extent_buffer *eb)
+{
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key key;
+ int found_type;
+ int i;
+
+ if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+ return 0;
+
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+ if (found_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+ continue;
+ key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ __exclude_logged_extent(log, key.objectid, key.offset);
+ }
+
+ return 0;
+}
+
+/**
+ * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @num_bytes: The number of bytes in question
+ * @reserve: One of the reservation enums
+ * @delalloc: The blocks are allocated for the delalloc write
+ *
+ * This is called by the allocator when it reserves space, or by somebody who is
+ * freeing space that was never actually used on disk. For example if you
+ * reserve some space for a new leaf in transaction A and before transaction A
+ * commits you free that leaf, you call this with reserve set to 0 in order to
+ * clear the reservation.
+ *
+ * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
+ * ENOSPC accounting. For data we handle the reservation through clearing the
+ * delalloc bits in the io_tree. We have to do this since we could end up
+ * allocating less disk space for the amount of data we have reserved in the
+ * case of compression.
+ *
+ * If this is a reservation and the block group has become read only we cannot
+ * make the reservation and return -EAGAIN, otherwise this function always
+ * succeeds.
+ */
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int delalloc)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve != RESERVE_FREE) {
+ if (cache->ro) {
+ ret = -EAGAIN;
+ } else {
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ if (reserve == RESERVE_ALLOC) {
+ trace_btrfs_space_reservation(cache->fs_info,
+ "space_info", space_info->flags,
+ num_bytes, 0);
+ space_info->bytes_may_use -= num_bytes;
+ }
+
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
+ }
+ } else {
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
+
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_caching_control *next;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_block_group_cache *cache;
+
+ down_write(&fs_info->commit_root_sem);
+
+ list_for_each_entry_safe(caching_ctl, next,
+ &fs_info->caching_block_groups, list) {
+ cache = caching_ctl->block_group;
+ if (block_group_cache_done(cache)) {
+ cache->last_byte_to_unpin = (u64)-1;
+ list_del_init(&caching_ctl->list);
+ put_caching_control(caching_ctl);
+ } else {
+ cache->last_byte_to_unpin = caching_ctl->progress;
+ }
+ }
+
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ fs_info->pinned_extents = &fs_info->freed_extents[1];
+ else
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
+
+ up_write(&fs_info->commit_root_sem);
+
+ update_global_block_rsv(fs_info);
+}
+
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
+ const bool return_free_space)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 len;
+ bool readonly;
+
+ while (start <= end) {
+ readonly = false;
+ if (!cache ||
+ start >= cache->key.objectid + cache->key.offset) {
+ if (cache)
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_block_group(fs_info, start);
+ BUG_ON(!cache); /* Logic error */
+ }
+
+ len = cache->key.objectid + cache->key.offset - start;
+ len = min(len, end + 1 - start);
+
+ if (start < cache->last_byte_to_unpin) {
+ len = min(len, cache->last_byte_to_unpin - start);
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
+ }
+
+ start += len;
+ space_info = cache->space_info;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned -= len;
+ space_info->bytes_pinned -= len;
+ percpu_counter_add(&space_info->total_bytes_pinned, -len);
+ if (cache->ro) {
+ space_info->bytes_readonly += len;
+ readonly = true;
+ }
+ spin_unlock(&cache->lock);
+ if (!readonly && global_rsv->space_info == space_info) {
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ len = min(len, global_rsv->size -
+ global_rsv->reserved);
+ global_rsv->reserved += len;
+ space_info->bytes_may_use += len;
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = 1;
+ }
+ spin_unlock(&global_rsv->lock);
+ }
+ spin_unlock(&space_info->lock);
+ }
+
+ if (cache)
+ btrfs_put_block_group(cache);
+ return 0;
+}
+
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *unpin;
+ u64 start;
+ u64 end;
+ int ret;
+
+ if (trans->aborted)
+ return 0;
+
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ unpin = &fs_info->freed_extents[1];
+ else
+ unpin = &fs_info->freed_extents[0];
+
+ while (1) {
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, NULL);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ break;
+ }
+
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_discard_extent(root, start,
+ end + 1 - start, NULL);
+
+ clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ unpin_extent_range(root, start, end, true);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
+ u64 owner, u64 root_objectid)
+{
+ struct btrfs_space_info *space_info;
+ u64 flags;
+
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ } else {
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ }
+
+ space_info = __find_space_info(fs_info, flags);
+ BUG_ON(!space_info); /* Logic bug */
+ percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
+
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner_objectid,
+ u64 owner_offset, int refs_to_drop,
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_root *extent_root = info->extent_root;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+ int is_data;
+ int extent_slot = 0;
+ int found_extent = 0;
+ int num_to_del = 1;
+ u32 item_size;
+ u64 refs;
+ int last_ref = 0;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
+
+ if (!info->quota_enabled || !is_fstree(root_objectid))
+ no_quota = 1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 1;
+ path->leave_spinning = 1;
+
+ is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
+ BUG_ON(!is_data && refs_to_drop != 1);
+
+ if (is_data)
+ skinny_metadata = 0;
+
+ ret = lookup_extent_backref(trans, extent_root, path, &iref,
+ bytenr, num_bytes, parent,
+ root_objectid, owner_objectid,
+ owner_offset);
+ if (ret == 0) {
+ extent_slot = path->slots[0];
+ while (extent_slot >= 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ extent_slot);
+ if (key.objectid != bytenr)
+ break;
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes) {
+ found_extent = 1;
+ break;
+ }
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ key.offset == owner_objectid) {
+ found_extent = 1;
+ break;
+ }
+ if (path->slots[0] - extent_slot > 5)
+ break;
+ extent_slot--;
+ }
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
+ if (found_extent && item_size < sizeof(*ei))
+ found_extent = 0;
+#endif
+ if (!found_extent) {
+ BUG_ON(iref);
+ ret = remove_extent_backref(trans, extent_root, path,
+ NULL, refs_to_drop,
+ is_data, &last_ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+ path->leave_spinning = 1;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ if (!is_data && skinny_metadata) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = owner_objectid;
+ }
+
+ ret = btrfs_search_slot(trans, extent_root,
+ &key, path, -1, 1);
+ if (ret > 0 && skinny_metadata && path->slots[0]) {
+ /*
+ * Couldn't find our skinny metadata item,
+ * see if we have ye olde extent item.
+ */
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == num_bytes)
+ ret = 0;
+ }
+
+ if (ret > 0 && skinny_metadata) {
+ skinny_metadata = false;
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, extent_root,
+ &key, path, -1, 1);
+ }
+
+ if (ret) {
+ btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+ ret, bytenr);
+ if (ret > 0)
+ btrfs_print_leaf(extent_root,
+ path->nodes[0]);
+ }
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ extent_slot = path->slots[0];
+ }
+ } else if (WARN_ON(ret == -ENOENT)) {
+ btrfs_print_leaf(extent_root, path->nodes[0]);
+ btrfs_err(info,
+ "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
+ bytenr, parent, root_objectid, owner_objectid,
+ owner_offset);
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ } else {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, extent_slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ BUG_ON(found_extent || extent_slot != path->slots[0]);
+ ret = convert_extent_item_v0(trans, extent_root, path,
+ owner_objectid, 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
+ btrfs_release_path(path);
+ path->leave_spinning = 1;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path,
+ -1, 1);
+ if (ret) {
+ btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+ ret, bytenr);
+ btrfs_print_leaf(extent_root, path->nodes[0]);
+ }
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
+ extent_slot = path->slots[0];
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, extent_slot);
+ }
+#endif
+ BUG_ON(item_size < sizeof(*ei));
+ ei = btrfs_item_ptr(leaf, extent_slot,
+ struct btrfs_extent_item);
+ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
+ key.type == BTRFS_EXTENT_ITEM_KEY) {
+ struct btrfs_tree_block_info *bi;
+ BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
+ }
+
+ refs = btrfs_extent_refs(leaf, ei);
+ if (refs < refs_to_drop) {
+ btrfs_err(info, "trying to drop %d refs but we only have %Lu "
+ "for bytenr %Lu", refs_to_drop, refs, bytenr);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ refs -= refs_to_drop;
+
+ if (refs > 0) {
+ type = BTRFS_QGROUP_OPER_SUB_SHARED;
+ if (extent_op)
+ __run_delayed_extent_op(extent_op, leaf, ei);
+ /*
+ * In the case of inline back ref, reference count will
+ * be updated by remove_extent_backref
+ */
+ if (iref) {
+ BUG_ON(!found_extent);
+ } else {
+ btrfs_set_extent_refs(leaf, ei, refs);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ if (found_extent) {
+ ret = remove_extent_backref(trans, extent_root, path,
+ iref, refs_to_drop,
+ is_data, &last_ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ }
+ add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+ root_objectid);
+ } else {
+ if (found_extent) {
+ BUG_ON(is_data && refs_to_drop !=
+ extent_data_ref_count(root, path, iref));
+ if (iref) {
+ BUG_ON(path->slots[0] != extent_slot);
+ } else {
+ BUG_ON(path->slots[0] != extent_slot + 1);
+ path->slots[0] = extent_slot;
+ num_to_del = 2;
+ }
+ }
+
+ last_ref = 1;
+ ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
+ num_to_del);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ if (is_data) {
+ ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ }
+
+ ret = update_block_group(trans, root, bytenr, num_bytes, 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+ }
+ btrfs_release_path(path);
+
+ /* Deal with the quota accounting */
+ if (!ret && last_ref && !no_quota) {
+ int mod_seq = 0;
+
+ if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+ type == BTRFS_QGROUP_OPER_SUB_SHARED)
+ mod_seq = 1;
+
+ ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
+ bytenr, num_bytes, type,
+ mod_seq);
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * when we free an block, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well. This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
+ */
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (!head)
+ goto out_delayed_unlock;
+
+ spin_lock(&head->lock);
+ if (rb_first(&head->ref_root))
+ goto out;
+
+ if (head->extent_op) {
+ if (!head->must_insert_reserved)
+ goto out;
+ btrfs_free_delayed_extent_op(head->extent_op);
+ head->extent_op = NULL;
+ }
+
+ /*
+ * waiting for the lock here would deadlock. If someone else has it
+ * locked they are already in the process of dropping it anyway
+ */
+ if (!mutex_trylock(&head->mutex))
+ goto out;
+
+ /*
+ * at this point we have a head with no other entries. Go
+ * ahead and process it.
+ */
+ head->node.in_tree = 0;
+ rb_erase(&head->href_node, &delayed_refs->href_root);
+
+ atomic_dec(&delayed_refs->num_entries);
+
+ /*
+ * we don't take a ref on the node because we're removing it from the
+ * tree, so we just steal the ref the tree was holding.
+ */
+ delayed_refs->num_heads--;
+ if (head->processing == 0)
+ delayed_refs->num_heads_ready--;
+ head->processing = 0;
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+
+ BUG_ON(head->extent_op);
+ if (head->must_insert_reserved)
+ ret = 1;
+
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ return ret;
+out:
+ spin_unlock(&head->lock);
+
+out_delayed_unlock:
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref)
+{
+ int pin = 1;
+ int ret;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ buf->start, buf->len,
+ parent, root->root_key.objectid,
+ btrfs_header_level(buf),
+ BTRFS_DROP_DELAYED_REF, NULL, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+
+ if (!last_ref)
+ return;
+
+ if (btrfs_header_generation(buf) == trans->transid) {
+ struct btrfs_block_group_cache *cache;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, root, buf->start);
+ if (!ret)
+ goto out;
+ }
+
+ cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(root, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
+ goto out;
+ }
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(cache, buf->start, buf->len);
+ btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+ btrfs_put_block_group(cache);
+ trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
+ pin = 0;
+ }
+out:
+ if (pin)
+ add_pinned_bytes(root->fs_info, buf->len,
+ btrfs_header_level(buf),
+ root->root_key.objectid);
+
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't matter
+ * anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+}
+
+/* Can return -ENOMEM */
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int no_quota)
+{
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (btrfs_test_is_dummy_root(root))
+ return 0;
+
+ add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
+
+ /*
+ * tree log blocks never actually go into the extent allocation
+ * tree, just update pinning info and exit early.
+ */
+ if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
+ WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
+ /* unlocks the pinned mutex */
+ btrfs_pin_extent(root, bytenr, num_bytes, 1);
+ ret = 0;
+ } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
+ parent, root_objectid, (int)owner,
+ BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+ } else {
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
+ parent, root_objectid, owner,
+ offset, BTRFS_DROP_DELAYED_REF,
+ NULL, no_quota);
+ }
+ return ret;
+}
+
+/*
+ * when we wait for progress in the block group caching, its because
+ * our allocation attempt failed at least once. So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes. Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
+ */
+static noinline void
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+ u64 num_bytes)
+{
+ struct btrfs_caching_control *caching_ctl;
+
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
+ return;
+
+ wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
+ (cache->free_space_ctl->free_space >= num_bytes));
+
+ put_caching_control(caching_ctl);
+}
+
+static noinline int
+wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *caching_ctl;
+ int ret = 0;
+
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
+ return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
+
+ wait_event(caching_ctl->wait, block_group_cache_done(cache));
+ if (cache->cached == BTRFS_CACHE_ERROR)
+ ret = -EIO;
+ put_caching_control(caching_ctl);
+ return ret;
+}
+
+int __get_raid_index(u64 flags)
+{
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
+ return BTRFS_RAID_RAID10;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+ return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return BTRFS_RAID_DUP;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
+
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
+int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+ return __get_raid_index(cache->flags);
+}
+
+static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = "raid10",
+ [BTRFS_RAID_RAID1] = "raid1",
+ [BTRFS_RAID_DUP] = "dup",
+ [BTRFS_RAID_RAID0] = "raid0",
+ [BTRFS_RAID_SINGLE] = "single",
+ [BTRFS_RAID_RAID5] = "raid5",
+ [BTRFS_RAID_RAID6] = "raid6",
+};
+
+static const char *get_raid_name(enum btrfs_raid_types type)
+{
+ if (type >= BTRFS_NR_RAID_TYPES)
+ return NULL;
+
+ return btrfs_raid_type_names[type];
+}
+
+enum btrfs_loop_type {
+ LOOP_CACHING_NOWAIT = 0,
+ LOOP_CACHING_WAIT = 1,
+ LOOP_ALLOC_CHUNK = 2,
+ LOOP_NO_EMPTY_SIZE = 3,
+};
+
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ btrfs_get_block_group(cache);
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ int delalloc)
+{
+ struct btrfs_block_group_cache *used_bg;
+ bool locked = false;
+again:
+ spin_lock(&cluster->refill_lock);
+ if (locked) {
+ if (used_bg == cluster->block_group)
+ return used_bg;
+
+ up_read(&used_bg->data_rwsem);
+ btrfs_put_block_group(used_bg);
+ }
+
+ used_bg = cluster->block_group;
+ if (!used_bg)
+ return NULL;
+
+ if (used_bg == block_group)
+ return used_bg;
+
+ btrfs_get_block_group(used_bg);
+
+ if (!delalloc)
+ return used_bg;
+
+ if (down_read_trylock(&used_bg->data_rwsem))
+ return used_bg;
+
+ spin_unlock(&cluster->refill_lock);
+ down_read(&used_bg->data_rwsem);
+ locked = true;
+ goto again;
+}
+
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ up_read(&cache->data_rwsem);
+ btrfs_put_block_group(cache);
+}
+
+/*
+ * walks the btree of allocated extents and find a hole of a given size.
+ * The key ins is changed to record the hole:
+ * ins->objectid == start position
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == the size of the hole.
+ * Any available blocks before search_start are skipped.
+ *
+ * If there is no suitable free space, we will record the max size of
+ * the free space extent currently.
+ */
+static noinline int find_free_extent(struct btrfs_root *orig_root,
+ u64 num_bytes, u64 empty_size,
+ u64 hint_byte, struct btrfs_key *ins,
+ u64 flags, int delalloc)
+{
+ int ret = 0;
+ struct btrfs_root *root = orig_root->fs_info->extent_root;
+ struct btrfs_free_cluster *last_ptr = NULL;
+ struct btrfs_block_group_cache *block_group = NULL;
+ u64 search_start = 0;
+ u64 max_extent_size = 0;
+ int empty_cluster = 2 * 1024 * 1024;
+ struct btrfs_space_info *space_info;
+ int loop = 0;
+ int index = __get_raid_index(flags);
+ int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
+ RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
+ bool failed_cluster_refill = false;
+ bool failed_alloc = false;
+ bool use_cluster = true;
+ bool have_caching_bg = false;
+
+ WARN_ON(num_bytes < root->sectorsize);
+ ins->type = BTRFS_EXTENT_ITEM_KEY;
+ ins->objectid = 0;
+ ins->offset = 0;
+
+ trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
+
+ space_info = __find_space_info(root->fs_info, flags);
+ if (!space_info) {
+ btrfs_err(root->fs_info, "No space info for %llu", flags);
+ return -ENOSPC;
+ }
+
+ /*
+ * If the space info is for both data and metadata it means we have a
+ * small filesystem and we can't use the clustering stuff.
+ */
+ if (btrfs_mixed_space_info(space_info))
+ use_cluster = false;
+
+ if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
+ last_ptr = &root->fs_info->meta_alloc_cluster;
+ if (!btrfs_test_opt(root, SSD))
+ empty_cluster = 64 * 1024;
+ }
+
+ if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+ btrfs_test_opt(root, SSD)) {
+ last_ptr = &root->fs_info->data_alloc_cluster;
+ }
+
+ if (last_ptr) {
+ spin_lock(&last_ptr->lock);
+ if (last_ptr->block_group)
+ hint_byte = last_ptr->window_start;
+ spin_unlock(&last_ptr->lock);
+ }
+
+ search_start = max(search_start, first_logical_byte(root, 0));
+ search_start = max(search_start, hint_byte);
+
+ if (!last_ptr)
+ empty_cluster = 0;
+
+ if (search_start == hint_byte) {
+ block_group = btrfs_lookup_block_group(root->fs_info,
+ search_start);
+ /*
+ * we don't want to use the block group if it doesn't match our
+ * allocation bits, or if its not cached.
+ *
+ * However if we are re-searching with an ideal block group
+ * picked out then we don't care that the block group is cached.
+ */
+ if (block_group && block_group_bits(block_group, flags) &&
+ block_group->cached != BTRFS_CACHE_NO) {
+ down_read(&space_info->groups_sem);
+ if (list_empty(&block_group->list) ||
+ block_group->ro) {
+ /*
+ * someone is removing this block group,
+ * we can't jump into the have_block_group
+ * target because our list pointers are not
+ * valid
+ */
+ btrfs_put_block_group(block_group);
+ up_read(&space_info->groups_sem);
+ } else {
+ index = get_block_group_index(block_group);
+ btrfs_lock_block_group(block_group, delalloc);
+ goto have_block_group;
+ }
+ } else if (block_group) {
+ btrfs_put_block_group(block_group);
+ }
+ }
+search:
+ have_caching_bg = false;
+ down_read(&space_info->groups_sem);
+ list_for_each_entry(block_group, &space_info->block_groups[index],
+ list) {
+ u64 offset;
+ int cached;
+
+ btrfs_grab_block_group(block_group, delalloc);
+ search_start = block_group->key.objectid;
+
+ /*
+ * this can happen if we end up cycling through all the
+ * raid types, but we want to make sure we only allocate
+ * for the proper type.
+ */
+ if (!block_group_bits(block_group, flags)) {
+ u64 extra = BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
+ BTRFS_BLOCK_GROUP_RAID10;
+
+ /*
+ * if they asked for extra copies and this block group
+ * doesn't provide them, bail. This does allow us to
+ * fill raid0 from raid1.
+ */
+ if ((flags & extra) && !(block_group->flags & extra))
+ goto loop;
+ }
+
+have_block_group:
+ cached = block_group_cache_done(block_group);
+ if (unlikely(!cached)) {
+ ret = cache_block_group(block_group, 0);
+ BUG_ON(ret < 0);
+ ret = 0;
+ }
+
+ if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+ goto loop;
+ if (unlikely(block_group->ro))
+ goto loop;
+
+ /*
+ * Ok we want to try and use the cluster allocator, so
+ * lets look there
+ */
+ if (last_ptr) {
+ struct btrfs_block_group_cache *used_block_group;
+ unsigned long aligned_cluster;
+ /*
+ * the refill lock keeps out other
+ * people trying to start a new cluster
+ */
+ used_block_group = btrfs_lock_cluster(block_group,
+ last_ptr,
+ delalloc);
+ if (!used_block_group)
+ goto refill_cluster;
+
+ if (used_block_group != block_group &&
+ (used_block_group->ro ||
+ !block_group_bits(used_block_group, flags)))
+ goto release_cluster;
+
+ offset = btrfs_alloc_from_cluster(used_block_group,
+ last_ptr,
+ num_bytes,
+ used_block_group->key.objectid,
+ &max_extent_size);
+ if (offset) {
+ /* we have a block, we're done */
+ spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ used_block_group,
+ search_start, num_bytes);
+ if (used_block_group != block_group) {
+ btrfs_release_block_group(block_group,
+ delalloc);
+ block_group = used_block_group;
+ }
+ goto checks;
+ }
+
+ WARN_ON(last_ptr->block_group != used_block_group);
+release_cluster:
+ /* If we are on LOOP_NO_EMPTY_SIZE, we can't
+ * set up a new clusters, so lets just skip it
+ * and let the allocator find whatever block
+ * it can find. If we reach this point, we
+ * will have tried the cluster allocator
+ * plenty of times and not have found
+ * anything, so we are likely way too
+ * fragmented for the clustering stuff to find
+ * anything.
+ *
+ * However, if the cluster is taken from the
+ * current block group, release the cluster
+ * first, so that we stand a better chance of
+ * succeeding in the unclustered
+ * allocation. */
+ if (loop >= LOOP_NO_EMPTY_SIZE &&
+ used_block_group != block_group) {
+ spin_unlock(&last_ptr->refill_lock);
+ btrfs_release_block_group(used_block_group,
+ delalloc);
+ goto unclustered_alloc;
+ }
+
+ /*
+ * this cluster didn't work out, free it and
+ * start over
+ */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+ if (used_block_group != block_group)
+ btrfs_release_block_group(used_block_group,
+ delalloc);
+refill_cluster:
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
+ aligned_cluster = max_t(unsigned long,
+ empty_cluster + empty_size,
+ block_group->full_stripe_len);
+
+ /* allocate a cluster in this block group */
+ ret = btrfs_find_space_cluster(root, block_group,
+ last_ptr, search_start,
+ num_bytes,
+ aligned_cluster);
+ if (ret == 0) {
+ /*
+ * now pull our allocation out of this
+ * cluster
+ */
+ offset = btrfs_alloc_from_cluster(block_group,
+ last_ptr,
+ num_bytes,
+ search_start,
+ &max_extent_size);
+ if (offset) {
+ /* we found one, proceed */
+ spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ block_group, search_start,
+ num_bytes);
+ goto checks;
+ }
+ } else if (!cached && loop > LOOP_CACHING_NOWAIT
+ && !failed_cluster_refill) {
+ spin_unlock(&last_ptr->refill_lock);
+
+ failed_cluster_refill = true;
+ wait_block_group_cache_progress(block_group,
+ num_bytes + empty_cluster + empty_size);
+ goto have_block_group;
+ }
+
+ /*
+ * at this point we either didn't find a cluster
+ * or we weren't able to allocate a block from our
+ * cluster. Free the cluster we've been trying
+ * to use, and go to the next block group
+ */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ spin_unlock(&last_ptr->refill_lock);
+ goto loop;
+ }
+
+unclustered_alloc:
+ spin_lock(&block_group->free_space_ctl->tree_lock);
+ if (cached &&
+ block_group->free_space_ctl->free_space <
+ num_bytes + empty_cluster + empty_size) {
+ if (block_group->free_space_ctl->free_space >
+ max_extent_size)
+ max_extent_size =
+ block_group->free_space_ctl->free_space;
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+ goto loop;
+ }
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+
+ offset = btrfs_find_space_for_alloc(block_group, search_start,
+ num_bytes, empty_size,
+ &max_extent_size);
+ /*
+ * If we didn't find a chunk, and we haven't failed on this
+ * block group before, and this block group is in the middle of
+ * caching and we are ok with waiting, then go ahead and wait
+ * for progress to be made, and set failed_alloc to true.
+ *
+ * If failed_alloc is true then we've already waited on this
+ * block group once and should move on to the next block group.
+ */
+ if (!offset && !failed_alloc && !cached &&
+ loop > LOOP_CACHING_NOWAIT) {
+ wait_block_group_cache_progress(block_group,
+ num_bytes + empty_size);
+ failed_alloc = true;
+ goto have_block_group;
+ } else if (!offset) {
+ if (!cached)
+ have_caching_bg = true;
+ goto loop;
+ }
+checks:
+ search_start = ALIGN(offset, root->stripesize);
+
+ /* move on to the next group */
+ if (search_start + num_bytes >
+ block_group->key.objectid + block_group->key.offset) {
+ btrfs_add_free_space(block_group, offset, num_bytes);
+ goto loop;
+ }
+
+ if (offset < search_start)
+ btrfs_add_free_space(block_group, offset,
+ search_start - offset);
+ BUG_ON(offset > search_start);
+
+ ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+ alloc_type, delalloc);
+ if (ret == -EAGAIN) {
+ btrfs_add_free_space(block_group, offset, num_bytes);
+ goto loop;
+ }
+
+ /* we are all good, lets return */
+ ins->objectid = search_start;
+ ins->offset = num_bytes;
+
+ trace_btrfs_reserve_extent(orig_root, block_group,
+ search_start, num_bytes);
+ btrfs_release_block_group(block_group, delalloc);
+ break;
+loop:
+ failed_cluster_refill = false;
+ failed_alloc = false;
+ BUG_ON(index != get_block_group_index(block_group));
+ btrfs_release_block_group(block_group, delalloc);
+ }
+ up_read(&space_info->groups_sem);
+
+ if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+ goto search;
+
+ if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+ goto search;
+
+ /*
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+ * caching kthreads as we move along
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+ * again
+ */
+ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
+ index = 0;
+ loop++;
+ if (loop == LOOP_ALLOC_CHUNK) {
+ struct btrfs_trans_handle *trans;
+ int exist = 0;
+
+ trans = current->journal_info;
+ if (trans)
+ exist = 1;
+ else
+ trans = btrfs_join_transaction(root);
+
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = do_chunk_alloc(trans, root, flags,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * Do not bail out on ENOSPC since we
+ * can do more things.
+ */
+ if (ret < 0 && ret != -ENOSPC)
+ btrfs_abort_transaction(trans,
+ root, ret);
+ else
+ ret = 0;
+ if (!exist)
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+ }
+
+ if (loop == LOOP_NO_EMPTY_SIZE) {
+ empty_size = 0;
+ empty_cluster = 0;
+ }
+
+ goto search;
+ } else if (!ins->objectid) {
+ ret = -ENOSPC;
+ } else if (ins->objectid) {
+ ret = 0;
+ }
+out:
+ if (ret == -ENOSPC)
+ ins->offset = max_extent_size;
+ return ret;
+}
+
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups)
+{
+ struct btrfs_block_group_cache *cache;
+ int index = 0;
+
+ spin_lock(&info->lock);
+ printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
+ info->flags,
+ info->total_bytes - info->bytes_used - info->bytes_pinned -
+ info->bytes_reserved - info->bytes_readonly,
+ (info->full) ? "" : "not ");
+ printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
+ "reserved=%llu, may_use=%llu, readonly=%llu\n",
+ info->total_bytes, info->bytes_used, info->bytes_pinned,
+ info->bytes_reserved, info->bytes_may_use,
+ info->bytes_readonly);
+ spin_unlock(&info->lock);
+
+ if (!dump_block_groups)
+ return;
+
+ down_read(&info->groups_sem);
+again:
+ list_for_each_entry(cache, &info->block_groups[index], list) {
+ spin_lock(&cache->lock);
+ printk(KERN_INFO "BTRFS: "
+ "block group %llu has %llu bytes, "
+ "%llu used %llu pinned %llu reserved %s\n",
+ cache->key.objectid, cache->key.offset,
+ btrfs_block_group_used(&cache->item), cache->pinned,
+ cache->reserved, cache->ro ? "[readonly]" : "");
+ btrfs_dump_free_space(cache, bytes);
+ spin_unlock(&cache->lock);
+ }
+ if (++index < BTRFS_NR_RAID_TYPES)
+ goto again;
+ up_read(&info->groups_sem);
+}
+
+int btrfs_reserve_extent(struct btrfs_root *root,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ struct btrfs_key *ins, int is_data, int delalloc)
+{
+ bool final_tried = false;
+ u64 flags;
+ int ret;
+
+ flags = btrfs_get_alloc_profile(root, is_data);
+again:
+ WARN_ON(num_bytes < root->sectorsize);
+ ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
+ flags, delalloc);
+
+ if (ret == -ENOSPC) {
+ if (!final_tried && ins->offset) {
+ num_bytes = min(num_bytes >> 1, ins->offset);
+ num_bytes = round_down(num_bytes, root->sectorsize);
+ num_bytes = max(num_bytes, min_alloc_size);
+ if (num_bytes == min_alloc_size)
+ final_tried = true;
+ goto again;
+ } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ struct btrfs_space_info *sinfo;
+
+ sinfo = __find_space_info(root->fs_info, flags);
+ btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
+ flags, num_bytes);
+ if (sinfo)
+ dump_space_info(sinfo, num_bytes, 1);
+ }
+ }
+
+ return ret;
+}
+
+static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len,
+ int pin, int delalloc)
+{
+ struct btrfs_block_group_cache *cache;
+ int ret = 0;
+
+ cache = btrfs_lookup_block_group(root->fs_info, start);
+ if (!cache) {
+ btrfs_err(root->fs_info, "Unable to find block group for %llu",
+ start);
+ return -ENOSPC;
+ }
+
+ if (pin)
+ pin_down_extent(root, cache, start, len, 1);
+ else {
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_discard_extent(root, start, len, NULL);
+ btrfs_add_free_space(cache, start, len);
+ btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+ }
+
+ btrfs_put_block_group(cache);
+
+ trace_btrfs_reserved_extent_free(root, start, len);
+
+ return ret;
+}
+
+int btrfs_free_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len, int delalloc)
+{
+ return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
+}
+
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ return __btrfs_free_reserved_extent(root, start, len, 1, 0);
+}
+
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ u64 flags, u64 owner, u64 offset,
+ struct btrfs_key *ins, int ref_mod)
+{
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_extent_item *extent_item;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int type;
+ u32 size;
+
+ if (parent > 0)
+ type = BTRFS_SHARED_DATA_REF_KEY;
+ else
+ type = BTRFS_EXTENT_DATA_REF_KEY;
+
+ size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+ ins, size);
+ if (ret) {
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ extent_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, extent_item, ref_mod);
+ btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+ btrfs_set_extent_flags(leaf, extent_item,
+ flags | BTRFS_EXTENT_FLAG_DATA);
+
+ iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ btrfs_set_extent_inline_ref_type(leaf, iref, type);
+ if (parent > 0) {
+ struct btrfs_shared_data_ref *ref;
+ ref = (struct btrfs_shared_data_ref *)(iref + 1);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
+ } else {
+ struct btrfs_extent_data_ref *ref;
+ ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
+ btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+ btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+ btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
+ }
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_free_path(path);
+
+ /* Always set parent to 0 here since its exclusive anyway. */
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, ins->offset,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+ if (ret) { /* -ENOENT, logic error */
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ ins->objectid, ins->offset);
+ BUG();
+ }
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
+ return ret;
+}
+
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ u64 flags, struct btrfs_disk_key *key,
+ int level, struct btrfs_key *ins,
+ int no_quota)
+{
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_extent_item *extent_item;
+ struct btrfs_tree_block_info *block_info;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ u32 size = sizeof(*extent_item) + sizeof(*iref);
+ u64 num_bytes = ins->offset;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
+
+ if (!skinny_metadata)
+ size += sizeof(*block_info);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+ root->nodesize);
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+ ins, size);
+ if (ret) {
+ btrfs_free_path(path);
+ btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+ root->nodesize);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ extent_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, extent_item, 1);
+ btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+ btrfs_set_extent_flags(leaf, extent_item,
+ flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
+
+ if (skinny_metadata) {
+ iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ num_bytes = root->nodesize;
+ } else {
+ block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+ btrfs_set_tree_block_key(leaf, block_info, key);
+ btrfs_set_tree_block_level(leaf, block_info, level);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ }
+
+ if (parent > 0) {
+ BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_SHARED_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_TREE_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, num_bytes,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+ }
+
+ ret = update_block_group(trans, root, ins->objectid, root->nodesize,
+ 1);
+ if (ret) { /* -ENOENT, logic error */
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ ins->objectid, ins->offset);
+ BUG();
+ }
+
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
+ return ret;
+}
+
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 root_objectid, u64 owner,
+ u64 offset, struct btrfs_key *ins)
+{
+ int ret;
+
+ BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+ ins->offset, 0,
+ root_objectid, owner, offset,
+ BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+ return ret;
+}
+
+/*
+ * this is used by the tree logging recovery code. It records that
+ * an extent has been allocated and makes sure to clear the free
+ * space cache bits as well
+ */
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 root_objectid, u64 owner, u64 offset,
+ struct btrfs_key *ins)
+{
+ int ret;
+ struct btrfs_block_group_cache *block_group;
+
+ /*
+ * Mixed block groups will exclude before processing the log so we only
+ * need to do the exlude dance if this fs isn't mixed.
+ */
+ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
+ ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
+ if (ret)
+ return ret;
+ }
+
+ block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+ if (!block_group)
+ return -EINVAL;
+
+ ret = btrfs_update_reserved_bytes(block_group, ins->offset,
+ RESERVE_ALLOC_NO_ACCOUNT, 0);
+ BUG_ON(ret); /* logic error */
+ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
+ 0, owner, offset, ins, 1);
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+static struct extent_buffer *
+btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 bytenr, int level)
+{
+ struct extent_buffer *buf;
+
+ buf = btrfs_find_create_tree_block(root, bytenr);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+ btrfs_set_header_generation(buf, trans->transid);
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
+ btrfs_tree_lock(buf);
+ clean_tree_block(trans, root->fs_info, buf);
+ clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
+
+ btrfs_set_lock_blocking(buf);
+ btrfs_set_buffer_uptodate(buf);
+
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ buf->log_index = root->log_transid % 2;
+ /*
+ * we allow two log transactions at a time, use different
+ * EXENT bit to differentiate dirty pages.
+ */
+ if (buf->log_index == 0)
+ set_extent_dirty(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ else
+ set_extent_new(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ } else {
+ buf->log_index = -1;
+ set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ }
+ trans->blocks_used++;
+ /* this returns a buffer locked for blocking */
+ return buf;
+}
+
+static struct btrfs_block_rsv *
+use_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u32 blocksize)
+{
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ int ret;
+ bool global_updated = false;
+
+ block_rsv = get_block_rsv(trans, root);
+
+ if (unlikely(block_rsv->size == 0))
+ goto try_reserve;
+again:
+ ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (!ret)
+ return block_rsv;
+
+ if (block_rsv->failfast)
+ return ERR_PTR(ret);
+
+ if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+ global_updated = true;
+ update_global_block_rsv(root->fs_info);
+ goto again;
+ }
+
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL * 10,
+ /*DEFAULT_RATELIMIT_BURST*/ 1);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG
+ "BTRFS: block rsv returned %d\n", ret);
+ }
+try_reserve:
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ return block_rsv;
+ /*
+ * If we couldn't reserve metadata bytes try and use some from
+ * the global reserve if its space type is the same as the global
+ * reservation.
+ */
+ if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+ block_rsv->space_info == global_rsv->space_info) {
+ ret = block_rsv_use_bytes(global_rsv, blocksize);
+ if (!ret)
+ return global_rsv;
+ }
+ return ERR_PTR(ret);
+}
+
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u32 blocksize)
+{
+ block_rsv_add_bytes(block_rsv, blocksize, 0);
+ block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
+}
+
+/*
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
+ * returns the tree buffer or an ERR_PTR on error.
+ */
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 parent, u64 root_objectid,
+ struct btrfs_disk_key *key, int level,
+ u64 hint, u64 empty_size)
+{
+ struct btrfs_key ins;
+ struct btrfs_block_rsv *block_rsv;
+ struct extent_buffer *buf;
+ struct btrfs_delayed_extent_op *extent_op;
+ u64 flags = 0;
+ int ret;
+ u32 blocksize = root->nodesize;
+ bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+ SKINNY_METADATA);
+
+ if (btrfs_test_is_dummy_root(root)) {
+ buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
+ level);
+ if (!IS_ERR(buf))
+ root->alloc_bytenr += blocksize;
+ return buf;
+ }
+
+ block_rsv = use_block_rsv(trans, root, blocksize);
+ if (IS_ERR(block_rsv))
+ return ERR_CAST(block_rsv);
+
+ ret = btrfs_reserve_extent(root, blocksize, blocksize,
+ empty_size, hint, &ins, 0, 0);
+ if (ret)
+ goto out_unuse;
+
+ buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_free_reserved;
+ }
+
+ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent == 0)
+ parent = ins.objectid;
+ flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ } else
+ BUG_ON(parent > 0);
+
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ extent_op = btrfs_alloc_delayed_extent_op();
+ if (!extent_op) {
+ ret = -ENOMEM;
+ goto out_free_buf;
+ }
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ if (skinny_metadata)
+ extent_op->update_key = 0;
+ else
+ extent_op->update_key = 1;
+ extent_op->update_flags = 1;
+ extent_op->is_data = 0;
+ extent_op->level = level;
+
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ ins.objectid, ins.offset,
+ parent, root_objectid, level,
+ BTRFS_ADD_DELAYED_EXTENT,
+ extent_op, 0);
+ if (ret)
+ goto out_free_delayed;
+ }
+ return buf;
+
+out_free_delayed:
+ btrfs_free_delayed_extent_op(extent_op);
+out_free_buf:
+ free_extent_buffer(buf);
+out_free_reserved:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
+out_unuse:
+ unuse_block_rsv(root->fs_info, block_rsv, blocksize);
+ return ERR_PTR(ret);
+}
+
+struct walk_control {
+ u64 refs[BTRFS_MAX_LEVEL];
+ u64 flags[BTRFS_MAX_LEVEL];
+ struct btrfs_key update_progress;
+ int stage;
+ int level;
+ int shared_level;
+ int update_ref;
+ int keep_locks;
+ int reada_slot;
+ int reada_count;
+ int for_reloc;
+};
+
+#define DROP_REFERENCE 1
+#define UPDATE_BACKREF 2
+
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct walk_control *wc,
+ struct btrfs_path *path)
+{
+ u64 bytenr;
+ u64 generation;
+ u64 refs;
+ u64 flags;
+ u32 nritems;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ int ret;
+ int slot;
+ int nread = 0;
+
+ if (path->slots[wc->level] < wc->reada_slot) {
+ wc->reada_count = wc->reada_count * 2 / 3;
+ wc->reada_count = max(wc->reada_count, 2);
+ } else {
+ wc->reada_count = wc->reada_count * 3 / 2;
+ wc->reada_count = min_t(int, wc->reada_count,
+ BTRFS_NODEPTRS_PER_BLOCK(root));
+ }
+
+ eb = path->nodes[wc->level];
+ nritems = btrfs_header_nritems(eb);
+ blocksize = root->nodesize;
+
+ for (slot = path->slots[wc->level]; slot < nritems; slot++) {
+ if (nread >= wc->reada_count)
+ break;
+
+ cond_resched();
+ bytenr = btrfs_node_blockptr(eb, slot);
+ generation = btrfs_node_ptr_generation(eb, slot);
+
+ if (slot == path->slots[wc->level])
+ goto reada;
+
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset)
+ continue;
+
+ /* We don't lock the tree block, it's OK to be racy here */
+ ret = btrfs_lookup_extent_info(trans, root, bytenr,
+ wc->level - 1, 1, &refs,
+ &flags);
+ /* We don't care about errors in readahead. */
+ if (ret < 0)
+ continue;
+ BUG_ON(refs == 0);
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (refs == 1)
+ goto reada;
+
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ continue;
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ ret = btrfs_comp_cpu_keys(&key,
+ &wc->update_progress);
+ if (ret < 0)
+ continue;
+ } else {
+ if (wc->level == 1 &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ continue;
+ }
+reada:
+ readahead_tree_block(root, bytenr);
+ nread++;
+ }
+ wc->reada_slot = slot;
+}
+
+static int account_leaf_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+ int nr = btrfs_header_nritems(eb);
+ int i, extent_type, ret;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ u64 bytenr, num_bytes;
+
+ for (i = 0; i < nr; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ /* filter out non qgroup-accountable extents */
+ extent_type = btrfs_file_extent_type(eb, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+
+ bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (!bytenr)
+ continue;
+
+ num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+ root->objectid,
+ bytenr, num_bytes,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have it's slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root nodes slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+ struct btrfs_path *path, int root_level)
+{
+ int level = 0;
+ int nr, slot;
+ struct extent_buffer *eb;
+
+ if (root_level == 0)
+ return 1;
+
+ while (level <= root_level) {
+ eb = path->nodes[level];
+ nr = btrfs_header_nritems(eb);
+ path->slots[level]++;
+ slot = path->slots[level];
+ if (slot >= nr || level == 0) {
+ /*
+ * Don't free the root - we will detect this
+ * condition after our loop and return a
+ * positive value for caller to stop walking the tree.
+ */
+ if (level != root_level) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+
+ free_extent_buffer(eb);
+ path->nodes[level] = NULL;
+ path->slots[level] = 0;
+ }
+ } else {
+ /*
+ * We have a valid slot to walk back down
+ * from. Stop here so caller can process these
+ * new nodes.
+ */
+ break;
+ }
+
+ level++;
+ }
+
+ eb = path->nodes[root_level];
+ if (path->slots[root_level] >= btrfs_header_nritems(eb))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * root_eb is the subtree root and is locked before this function is called.
+ */
+static int account_shared_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *root_eb,
+ u64 root_gen,
+ int root_level)
+{
+ int ret = 0;
+ int level;
+ struct extent_buffer *eb = root_eb;
+ struct btrfs_path *path = NULL;
+
+ BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+ BUG_ON(root_eb == NULL);
+
+ if (!root->fs_info->quota_enabled)
+ return 0;
+
+ if (!extent_buffer_uptodate(root_eb)) {
+ ret = btrfs_read_buffer(root_eb, root_gen);
+ if (ret)
+ goto out;
+ }
+
+ if (root_level == 0) {
+ ret = account_leaf_items(trans, root, root_eb);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Walk down the tree. Missing extent blocks are filled in as
+ * we go. Metadata is accounted every time we read a new
+ * extent block.
+ *
+ * When we reach a leaf, we account for file extent items in it,
+ * walk back up the tree (adjusting slot pointers as we go)
+ * and restart the search process.
+ */
+ extent_buffer_get(root_eb); /* For path */
+ path->nodes[root_level] = root_eb;
+ path->slots[root_level] = 0;
+ path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+ level = root_level;
+ while (level >= 0) {
+ if (path->nodes[level] == NULL) {
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /* We need to get child blockptr/gen from
+ * parent before we can read it. */
+ eb = path->nodes[level + 1];
+ parent_slot = path->slots[level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+ eb = read_tree_block(root, child_bytenr, child_gen);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ path->nodes[level] = eb;
+ path->slots[level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+ root->objectid,
+ child_bytenr,
+ root->nodesize,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE,
+ 0);
+ if (ret)
+ goto out;
+
+ }
+
+ if (level == 0) {
+ ret = account_leaf_items(trans, root, path->nodes[level]);
+ if (ret)
+ goto out;
+
+ /* Nonzero return here means we completed our search */
+ ret = adjust_slots_upwards(root, path, root_level);
+ if (ret)
+ break;
+
+ /* Restart search with new slots */
+ goto walk_down;
+ }
+
+ level--;
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+/*
+ * helper to process tree block while walking down the tree.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function updates
+ * back refs for pointers in the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int lookup_info)
+{
+ int level = wc->level;
+ struct extent_buffer *eb = path->nodes[level];
+ u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ int ret;
+
+ if (wc->stage == UPDATE_BACKREF &&
+ btrfs_header_owner(eb) != root->root_key.objectid)
+ return 1;
+
+ /*
+ * when reference count of tree block is 1, it won't increase
+ * again. once full backref flag is set, we never clear it.
+ */
+ if (lookup_info &&
+ ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
+ BUG_ON(!path->locks[level]);
+ ret = btrfs_lookup_extent_info(trans, root,
+ eb->start, level, 1,
+ &wc->refs[level],
+ &wc->flags[level]);
+ BUG_ON(ret == -ENOMEM);
+ if (ret)
+ return ret;
+ BUG_ON(wc->refs[level] == 0);
+ }
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level] > 1)
+ return 1;
+
+ if (path->locks[level] && !wc->keep_locks) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+ }
+ return 0;
+ }
+
+ /* wc->stage == UPDATE_BACKREF */
+ if (!(wc->flags[level] & flag)) {
+ BUG_ON(!path->locks[level]);
+ ret = btrfs_inc_ref(trans, root, eb, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_dec_ref(trans, root, eb, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
+ eb->len, flag,
+ btrfs_header_level(eb), 0);
+ BUG_ON(ret); /* -ENOMEM */
+ wc->flags[level] |= flag;
+ }
+
+ /*
+ * the block is shared by multiple trees, so it's not good to
+ * keep the tree lock
+ */
+ if (path->locks[level] && level > 0) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+ }
+ return 0;
+}
+
+/*
+ * helper to process tree block pointer.
+ *
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block pointed to. if the block
+ * is shared and we need update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back, this function drops the reference
+ * to the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int *lookup_info)
+{
+ u64 bytenr;
+ u64 generation;
+ u64 parent;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *next;
+ int level = wc->level;
+ int reada = 0;
+ int ret = 0;
+ bool need_account = false;
+
+ generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+ /*
+ * if the lower level block was created before the snapshot
+ * was created, we know there is no need to update back refs
+ * for the subtree
+ */
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset) {
+ *lookup_info = 1;
+ return 1;
+ }
+
+ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ blocksize = root->nodesize;
+
+ next = btrfs_find_tree_block(root->fs_info, bytenr);
+ if (!next) {
+ next = btrfs_find_create_tree_block(root, bytenr);
+ if (!next)
+ return -ENOMEM;
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
+ level - 1);
+ reada = 1;
+ }
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
+ &wc->refs[level - 1],
+ &wc->flags[level - 1]);
+ if (ret < 0) {
+ btrfs_tree_unlock(next);
+ return ret;
+ }
+
+ if (unlikely(wc->refs[level - 1] == 0)) {
+ btrfs_err(root->fs_info, "Missing references.");
+ BUG();
+ }
+ *lookup_info = 0;
+
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level - 1] > 1) {
+ need_account = true;
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ goto skip;
+
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
+ if (ret < 0)
+ goto skip;
+
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
+ }
+ } else {
+ if (level == 1 &&
+ (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ goto skip;
+ }
+
+ if (!btrfs_buffer_uptodate(next, generation, 0)) {
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ next = NULL;
+ *lookup_info = 1;
+ }
+
+ if (!next) {
+ if (reada && level == 1)
+ reada_walk_down(trans, root, wc, path);
+ next = read_tree_block(root, bytenr, generation);
+ if (!next || !extent_buffer_uptodate(next)) {
+ free_extent_buffer(next);
+ return -EIO;
+ }
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ }
+
+ level--;
+ BUG_ON(level != btrfs_header_level(next));
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ wc->level = level;
+ if (wc->level == 1)
+ wc->reada_slot = 0;
+ return 0;
+skip:
+ wc->refs[level - 1] = 0;
+ wc->flags[level - 1] = 0;
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ parent = path->nodes[level]->start;
+ } else {
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level]));
+ parent = 0;
+ }
+
+ if (need_account) {
+ ret = account_shared_subtree(trans, root, next,
+ generation, level - 1);
+ if (ret) {
+ printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ "%d accounting shared subtree. Quota "
+ "is out of sync, rescan required.\n",
+ root->fs_info->sb->s_id, ret);
+ }
+ }
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+ root->root_key.objectid, level - 1, 0, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ *lookup_info = 1;
+ return 1;
+}
+
+/*
+ * helper to process tree block while walking up the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function drops
+ * reference count on the block.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function changes
+ * wc->stage back to DROP_REFERENCE if we changed wc->stage
+ * to UPDATE_BACKREF previously while processing the block.
+ *
+ * NOTE: return value 1 means we should stop walking up.
+ */
+static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc)
+{
+ int ret;
+ int level = wc->level;
+ struct extent_buffer *eb = path->nodes[level];
+ u64 parent = 0;
+
+ if (wc->stage == UPDATE_BACKREF) {
+ BUG_ON(wc->shared_level < level);
+ if (level < wc->shared_level)
+ goto out;
+
+ ret = find_next_key(path, level + 1, &wc->update_progress);
+ if (ret > 0)
+ wc->update_ref = 0;
+
+ wc->stage = DROP_REFERENCE;
+ wc->shared_level = -1;
+ path->slots[level] = 0;
+
+ /*
+ * check reference count again if the block isn't locked.
+ * we should start walking down the tree again if reference
+ * count is one.
+ */
+ if (!path->locks[level]) {
+ BUG_ON(level == 0);
+ btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+
+ ret = btrfs_lookup_extent_info(trans, root,
+ eb->start, level, 1,
+ &wc->refs[level],
+ &wc->flags[level]);
+ if (ret < 0) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+ return ret;
+ }
+ BUG_ON(wc->refs[level] == 0);
+ if (wc->refs[level] == 1) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+ return 1;
+ }
+ }
+ }
+
+ /* wc->stage == DROP_REFERENCE */
+ BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
+
+ if (wc->refs[level] == 1) {
+ if (level == 0) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ ret = btrfs_dec_ref(trans, root, eb, 1);
+ else
+ ret = btrfs_dec_ref(trans, root, eb, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ ret = account_leaf_items(trans, root, eb);
+ if (ret) {
+ printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ "%d accounting leaf items. Quota "
+ "is out of sync, rescan required.\n",
+ root->fs_info->sb->s_id, ret);
+ }
+ }
+ /* make block locked assertion in clean_tree_block happy */
+ if (!path->locks[level] &&
+ btrfs_header_generation(eb) == trans->transid) {
+ btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ }
+ clean_tree_block(trans, root->fs_info, eb);
+ }
+
+ if (eb == root->node) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ parent = eb->start;
+ else
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(eb));
+ } else {
+ if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ parent = path->nodes[level + 1]->start;
+ else
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]));
+ }
+
+ btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+out:
+ wc->refs[level] = 0;
+ wc->flags[level] = 0;
+ return 0;
+}
+
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc)
+{
+ int level = wc->level;
+ int lookup_info = 1;
+ int ret;
+
+ while (level >= 0) {
+ ret = walk_down_proc(trans, root, path, wc, lookup_info);
+ if (ret > 0)
+ break;
+
+ if (level == 0)
+ break;
+
+ if (path->slots[level] >=
+ btrfs_header_nritems(path->nodes[level]))
+ break;
+
+ ret = do_walk_down(trans, root, path, wc, &lookup_info);
+ if (ret > 0) {
+ path->slots[level]++;
+ continue;
+ } else if (ret < 0)
+ return ret;
+ level = wc->level;
+ }
+ return 0;
+}
+
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc, int max_level)
+{
+ int level = wc->level;
+ int ret;
+
+ path->slots[level] = btrfs_header_nritems(path->nodes[level]);
+ while (level < max_level && path->nodes[level]) {
+ wc->level = level;
+ if (path->slots[level] + 1 <
+ btrfs_header_nritems(path->nodes[level])) {
+ path->slots[level]++;
+ return 0;
+ } else {
+ ret = walk_up_proc(trans, root, path, wc);
+ if (ret > 0)
+ return 0;
+
+ if (path->locks[level]) {
+ btrfs_tree_unlock_rw(path->nodes[level],
+ path->locks[level]);
+ path->locks[level] = 0;
+ }
+ free_extent_buffer(path->nodes[level]);
+ path->nodes[level] = NULL;
+ level++;
+ }
+ }
+ return 1;
+}
+
+/*
+ * drop a subvolume tree.
+ *
+ * this function traverses the tree freeing any blocks that only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found. this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also make sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
+ *
+ * If called with for_reloc == 0, may exit early with -EAGAIN
+ */
+int btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref,
+ int for_reloc)
+{
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = root->fs_info->tree_root;
+ struct btrfs_root_item *root_item = &root->root_item;
+ struct walk_control *wc;
+ struct btrfs_key key;
+ int err = 0;
+ int ret;
+ int level;
+ bool root_dropped = false;
+
+ btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ wc = kzalloc(sizeof(*wc), GFP_NOFS);
+ if (!wc) {
+ btrfs_free_path(path);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+
+ if (block_rsv)
+ trans->block_rsv = block_rsv;
+
+ if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+ level = btrfs_header_level(root->node);
+ path->nodes[level] = btrfs_lock_root_node(root);
+ btrfs_set_lock_blocking(path->nodes[level]);
+ path->slots[level] = 0;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ memset(&wc->update_progress, 0,
+ sizeof(wc->update_progress));
+ } else {
+ btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+ memcpy(&wc->update_progress, &key,
+ sizeof(wc->update_progress));
+
+ level = root_item->drop_level;
+ BUG_ON(level == 0);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ err = ret;
+ goto out_end_trans;
+ }
+ WARN_ON(ret > 0);
+
+ /*
+ * unlock our path, this is safe because only this
+ * function is allowed to delete this snapshot
+ */
+ btrfs_unlock_up_safe(path, 0);
+
+ level = btrfs_header_level(root->node);
+ while (1) {
+ btrfs_tree_lock(path->nodes[level]);
+ btrfs_set_lock_blocking(path->nodes[level]);
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+
+ ret = btrfs_lookup_extent_info(trans, root,
+ path->nodes[level]->start,
+ level, 1, &wc->refs[level],
+ &wc->flags[level]);
+ if (ret < 0) {
+ err = ret;
+ goto out_end_trans;
+ }
+ BUG_ON(wc->refs[level] == 0);
+
+ if (level == root_item->drop_level)
+ break;
+
+ btrfs_tree_unlock(path->nodes[level]);
+ path->locks[level] = 0;
+ WARN_ON(wc->refs[level] != 1);
+ level--;
+ }
+ }
+
+ wc->level = level;
+ wc->shared_level = -1;
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = update_ref;
+ wc->keep_locks = 0;
+ wc->for_reloc = for_reloc;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+
+ while (1) {
+
+ ret = walk_down_tree(trans, root, path, wc);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ if (ret > 0) {
+ BUG_ON(wc->stage != DROP_REFERENCE);
+ break;
+ }
+
+ if (wc->stage == DROP_REFERENCE) {
+ level = wc->level;
+ btrfs_node_key(path->nodes[level],
+ &root_item->drop_progress,
+ path->slots[level]);
+ root_item->drop_level = level;
+ }
+
+ BUG_ON(wc->level == 0);
+ if (btrfs_should_end_transaction(trans, tree_root) ||
+ (!for_reloc && btrfs_need_cleaner_sleep(root))) {
+ ret = btrfs_update_root(trans, tree_root,
+ &root->root_key,
+ root_item);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+
+ /*
+ * Qgroup update accounting is run from
+ * delayed ref handling. This usually works
+ * out because delayed refs are normally the
+ * only way qgroup updates are added. However,
+ * we may have added updates during our tree
+ * walk so run qgroups here to make sure we
+ * don't lose any updates.
+ */
+ ret = btrfs_delayed_qgroup_accounting(trans,
+ root->fs_info);
+ if (ret)
+ printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+ "running qgroup updates "
+ "during snapshot delete. "
+ "Quota is out of sync, "
+ "rescan required.\n", ret);
+
+ btrfs_end_transaction_throttle(trans, tree_root);
+ if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+ pr_debug("BTRFS: drop snapshot early exit\n");
+ err = -EAGAIN;
+ goto out_free;
+ }
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+ if (block_rsv)
+ trans->block_rsv = block_rsv;
+ }
+ }
+ btrfs_release_path(path);
+ if (err)
+ goto out_end_trans;
+
+ ret = btrfs_del_root(trans, tree_root, &root->root_key);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ goto out_end_trans;
+ }
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_find_root(tree_root, &root->root_key, path,
+ NULL, NULL);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ err = ret;
+ goto out_end_trans;
+ } else if (ret > 0) {
+ /* if we fail to delete the orphan item this time
+ * around, it'll get picked up the next time.
+ *
+ * The most common failure here is just -ENOENT.
+ */
+ btrfs_del_orphan_item(trans, tree_root,
+ root->root_key.objectid);
+ }
+ }
+
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
+ btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
+ } else {
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ btrfs_put_fs_root(root);
+ }
+ root_dropped = true;
+out_end_trans:
+ ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
+ if (ret)
+ printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+ "running qgroup updates "
+ "during snapshot delete. "
+ "Quota is out of sync, "
+ "rescan required.\n", ret);
+
+ btrfs_end_transaction_throttle(trans, tree_root);
+out_free:
+ kfree(wc);
+ btrfs_free_path(path);
+out:
+ /*
+ * So if we need to stop dropping the snapshot for whatever reason we
+ * need to make sure to add it back to the dead root list so that we
+ * keep trying to do the work later. This also cleans up roots if we
+ * don't have it in the radix (like when we recover after a power fail
+ * or unmount) so we don't leak memory.
+ */
+ if (!for_reloc && root_dropped == false)
+ btrfs_add_dead_root(root);
+ if (err && err != -EAGAIN)
+ btrfs_std_error(root->fs_info, err);
+ return err;
+}
+
+/*
+ * drop subtree rooted at tree block 'node'.
+ *
+ * NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
+ */
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent)
+{
+ struct btrfs_path *path;
+ struct walk_control *wc;
+ int level;
+ int parent_level;
+ int ret = 0;
+ int wret;
+
+ BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ wc = kzalloc(sizeof(*wc), GFP_NOFS);
+ if (!wc) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ btrfs_assert_tree_locked(parent);
+ parent_level = btrfs_header_level(parent);
+ extent_buffer_get(parent);
+ path->nodes[parent_level] = parent;
+ path->slots[parent_level] = btrfs_header_nritems(parent);
+
+ btrfs_assert_tree_locked(node);
+ level = btrfs_header_level(node);
+ path->nodes[level] = node;
+ path->slots[level] = 0;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+
+ wc->refs[parent_level] = 1;
+ wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ wc->level = level;
+ wc->shared_level = -1;
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = 0;
+ wc->keep_locks = 1;
+ wc->for_reloc = 1;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+
+ while (1) {
+ wret = walk_down_tree(trans, root, path, wc);
+ if (wret < 0) {
+ ret = wret;
+ break;
+ }
+
+ wret = walk_up_tree(trans, root, path, wc, parent_level);
+ if (wret < 0)
+ ret = wret;
+ if (wret != 0)
+ break;
+ }
+
+ kfree(wc);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
+{
+ u64 num_devices;
+ u64 stripped;
+
+ /*
+ * if restripe for this chunk_type is on pick target profile and
+ * return, otherwise do the usual balance
+ */
+ stripped = get_restripe_target(root->fs_info, flags);
+ if (stripped)
+ return extended_to_chunk(stripped);
+
+ num_devices = root->fs_info->fs_devices->rw_devices;
+
+ stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
+ if (num_devices == 1) {
+ stripped |= BTRFS_BLOCK_GROUP_DUP;
+ stripped = flags & ~stripped;
+
+ /* turn raid0 into single device chunks */
+ if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return stripped;
+
+ /* turn mirroring into duplication */
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ return stripped | BTRFS_BLOCK_GROUP_DUP;
+ } else {
+ /* they already had raid on here, just return */
+ if (flags & stripped)
+ return flags;
+
+ stripped |= BTRFS_BLOCK_GROUP_DUP;
+ stripped = flags & ~stripped;
+
+ /* switch duplicated blocks with raid1 */
+ if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return stripped | BTRFS_BLOCK_GROUP_RAID1;
+
+ /* this is drive concat, leave it alone */
+ }
+
+ return flags;
+}
+
+static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+{
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+ u64 min_allocable_bytes;
+ int ret = -ENOSPC;
+
+
+ /*
+ * We need some metadata space and system metadata space for
+ * allocating chunks in some corner cases until we force to set
+ * it to be readonly.
+ */
+ if ((sinfo->flags &
+ (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
+ !force)
+ min_allocable_bytes = 1 * 1024 * 1024;
+ else
+ min_allocable_bytes = 0;
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+
+ if (cache->ro) {
+ ret = 0;
+ goto out;
+ }
+
+ num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ cache->bytes_super - btrfs_block_group_used(&cache->item);
+
+ if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
+ sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
+ min_allocable_bytes <= sinfo->total_bytes) {
+ sinfo->bytes_readonly += num_bytes;
+ cache->ro = 1;
+ list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
+ ret = 0;
+ }
+out:
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
+ return ret;
+}
+
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+
+{
+ struct btrfs_trans_handle *trans;
+ u64 alloc_flags;
+ int ret;
+
+ BUG_ON(cache->ro);
+
+again:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * we're not allowed to set block groups readonly after the dirty
+ * block groups cache has started writing. If it already started,
+ * back off and let this transaction commit
+ */
+ mutex_lock(&root->fs_info->ro_block_group_mutex);
+ if (trans->transaction->dirty_bg_run) {
+ u64 transid = trans->transid;
+
+ mutex_unlock(&root->fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans, root);
+
+ ret = btrfs_wait_for_commit(root, transid);
+ if (ret)
+ return ret;
+ goto again;
+ }
+
+ /*
+ * if we are changing raid levels, try to allocate a corresponding
+ * block group with the new raid level.
+ */
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ if (alloc_flags != cache->flags) {
+ ret = do_chunk_alloc(trans, root, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * ENOSPC is allowed here, we may have enough space
+ * already allocated at the new raid level to
+ * carry on
+ */
+ if (ret == -ENOSPC)
+ ret = 0;
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = set_block_group_ro(cache, 0);
+ if (!ret)
+ goto out;
+ alloc_flags = get_alloc_profile(root, cache->space_info->flags);
+ ret = do_chunk_alloc(trans, root, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ if (ret < 0)
+ goto out;
+ ret = set_block_group_ro(cache, 0);
+out:
+ if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ lock_chunks(root->fs_info->chunk_root);
+ check_system_chunk(trans, root, alloc_flags);
+ unlock_chunks(root->fs_info->chunk_root);
+ }
+ mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 type)
+{
+ u64 alloc_flags = get_alloc_profile(root, type);
+ return do_chunk_alloc(trans, root, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+}
+
+/*
+ * helper to account the unused space of all the readonly block group in the
+ * space_info. takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+ struct btrfs_block_group_cache *block_group;
+ u64 free_bytes = 0;
+ int factor;
+
+ /* It's df, we don't care if it's racey */
+ if (list_empty(&sinfo->ro_bgs))
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
+ spin_lock(&block_group->lock);
+
+ if (!block_group->ro) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP))
+ factor = 2;
+ else
+ factor = 1;
+
+ free_bytes += (block_group->key.offset -
+ btrfs_block_group_used(&block_group->item)) *
+ factor;
+
+ spin_unlock(&block_group->lock);
+ }
+ spin_unlock(&sinfo->lock);
+
+ return free_bytes;
+}
+
+void btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+
+ BUG_ON(!cache->ro);
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+ num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ cache->bytes_super - btrfs_block_group_used(&cache->item);
+ sinfo->bytes_readonly -= num_bytes;
+ cache->ro = 0;
+ list_del_init(&cache->ro_list);
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
+}
+
+/*
+ * checks to see if its even possible to relocate this block group.
+ *
+ * @return - -1 if it's not a good idea to relocate this block group, 0 if its
+ * ok to go ahead and try.
+ */
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_device *device;
+ struct btrfs_trans_handle *trans;
+ u64 min_free;
+ u64 dev_min = 1;
+ u64 dev_nr = 0;
+ u64 target;
+ int index;
+ int full = 0;
+ int ret = 0;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+
+ /* odd, couldn't find the block group, leave it alone */
+ if (!block_group)
+ return -1;
+
+ min_free = btrfs_block_group_used(&block_group->item);
+
+ /* no bytes used, we're good */
+ if (!min_free)
+ goto out;
+
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
+
+ full = space_info->full;
+
+ /*
+ * if this is the last block group we have in this space, we can't
+ * relocate it unless we're able to allocate a new chunk below.
+ *
+ * Otherwise, we need to make sure we have room in the space to handle
+ * all of the extents from this block group. If we can, we're good
+ */
+ if ((space_info->total_bytes != block_group->key.offset) &&
+ (space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ min_free < space_info->total_bytes)) {
+ spin_unlock(&space_info->lock);
+ goto out;
+ }
+ spin_unlock(&space_info->lock);
+
+ /*
+ * ok we don't have enough space, but maybe we have free space on our
+ * devices to allocate new chunks for relocation, so loop through our
+ * alloc devices and guess if we have enough space. if this block
+ * group is going to be restriped, run checks against the target
+ * profile instead of the current one.
+ */
+ ret = -1;
+
+ /*
+ * index:
+ * 0: raid10
+ * 1: raid1
+ * 2: dup
+ * 3: raid0
+ * 4: single
+ */
+ target = get_restripe_target(root->fs_info, block_group->flags);
+ if (target) {
+ index = __get_raid_index(extended_to_chunk(target));
+ } else {
+ /*
+ * this is just a balance, so if we were marked as full
+ * we know there is no space for a new chunk
+ */
+ if (full)
+ goto out;
+
+ index = get_block_group_index(block_group);
+ }
+
+ if (index == BTRFS_RAID_RAID10) {
+ dev_min = 4;
+ /* Divide by 2 */
+ min_free >>= 1;
+ } else if (index == BTRFS_RAID_RAID1) {
+ dev_min = 2;
+ } else if (index == BTRFS_RAID_DUP) {
+ /* Multiply by 2 */
+ min_free <<= 1;
+ } else if (index == BTRFS_RAID_RAID0) {
+ dev_min = fs_devices->rw_devices;
+ min_free = div64_u64(min_free, dev_min);
+ }
+
+ /* We need to do this so that we can look at pending chunks */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ mutex_lock(&root->fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ u64 dev_offset;
+
+ /*
+ * check to make sure we can actually find a chunk with enough
+ * space to fit our block group in.
+ */
+ if (device->total_bytes > device->bytes_used + min_free &&
+ !device->is_tgtdev_for_dev_replace) {
+ ret = find_free_dev_extent(trans, device, min_free,
+ &dev_offset, NULL);
+ if (!ret)
+ dev_nr++;
+
+ if (dev_nr >= dev_min)
+ break;
+
+ ret = -1;
+ }
+ }
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ btrfs_end_transaction(trans, root);
+out:
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+static int find_first_block_group(struct btrfs_root *root,
+ struct btrfs_path *path, struct btrfs_key *key)
+{
+ int ret = 0;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ int slot;
+
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+ break;
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid >= key->objectid &&
+ found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[0]++;
+ }
+out:
+ return ret;
+}
+
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+ struct btrfs_block_group_cache *block_group;
+ u64 last = 0;
+
+ while (1) {
+ struct inode *inode;
+
+ block_group = btrfs_lookup_first_block_group(info, last);
+ while (block_group) {
+ spin_lock(&block_group->lock);
+ if (block_group->iref)
+ break;
+ spin_unlock(&block_group->lock);
+ block_group = next_block_group(info->tree_root,
+ block_group);
+ }
+ if (!block_group) {
+ if (last == 0)
+ break;
+ last = 0;
+ continue;
+ }
+
+ inode = block_group->inode;
+ block_group->iref = 0;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ last = block_group->key.objectid + block_group->key.offset;
+ btrfs_put_block_group(block_group);
+ }
+}
+
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct rb_node *n;
+
+ down_write(&info->commit_root_sem);
+ while (!list_empty(&info->caching_block_groups)) {
+ caching_ctl = list_entry(info->caching_block_groups.next,
+ struct btrfs_caching_control, list);
+ list_del(&caching_ctl->list);
+ put_caching_control(caching_ctl);
+ }
+ up_write(&info->commit_root_sem);
+
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->unused_bgs)) {
+ block_group = list_first_entry(&info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
+ spin_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ block_group = rb_entry(n, struct btrfs_block_group_cache,
+ cache_node);
+ rb_erase(&block_group->cache_node,
+ &info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
+ spin_unlock(&info->block_group_cache_lock);
+
+ down_write(&block_group->space_info->groups_sem);
+ list_del(&block_group->list);
+ up_write(&block_group->space_info->groups_sem);
+
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ wait_block_group_cache_done(block_group);
+
+ /*
+ * We haven't cached this block group, which means we could
+ * possibly have excluded extents on this block group.
+ */
+ if (block_group->cached == BTRFS_CACHE_NO ||
+ block_group->cached == BTRFS_CACHE_ERROR)
+ free_excluded_extents(info->extent_root, block_group);
+
+ btrfs_remove_free_space_cache(block_group);
+ btrfs_put_block_group(block_group);
+
+ spin_lock(&info->block_group_cache_lock);
+ }
+ spin_unlock(&info->block_group_cache_lock);
+
+ /* now that all the block groups are freed, go through and
+ * free all the space_info structs. This is only called during
+ * the final stages of unmount, and so we know nobody is
+ * using them. We call synchronize_rcu() once before we start,
+ * just to be on the safe side.
+ */
+ synchronize_rcu();
+
+ release_global_block_rsv(info);
+
+ while (!list_empty(&info->space_info)) {
+ int i;
+
+ space_info = list_entry(info->space_info.next,
+ struct btrfs_space_info,
+ list);
+ if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+ if (WARN_ON(space_info->bytes_pinned > 0 ||
+ space_info->bytes_reserved > 0 ||
+ space_info->bytes_may_use > 0)) {
+ dump_space_info(space_info, 0, 0);
+ }
+ }
+ list_del(&space_info->list);
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ struct kobject *kobj;
+ kobj = space_info->block_group_kobjs[i];
+ space_info->block_group_kobjs[i] = NULL;
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+ }
+ kobject_del(&space_info->kobj);
+ kobject_put(&space_info->kobj);
+ }
+ return 0;
+}
+
+static void __link_block_group(struct btrfs_space_info *space_info,
+ struct btrfs_block_group_cache *cache)
+{
+ int index = get_block_group_index(cache);
+ bool first = false;
+
+ down_write(&space_info->groups_sem);
+ if (list_empty(&space_info->block_groups[index]))
+ first = true;
+ list_add_tail(&cache->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
+
+ if (first) {
+ struct raid_kobject *rkobj;
+ int ret;
+
+ rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj)
+ goto out_err;
+ rkobj->raid_type = index;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+ "%s", get_raid_name(index));
+ if (ret) {
+ kobject_put(&rkobj->kobj);
+ goto out_err;
+ }
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+
+ return;
+out_err:
+ pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
+}
+
+static struct btrfs_block_group_cache *
+btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->key.objectid = start;
+ cache->key.offset = size;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+
+ cache->sectorsize = root->sectorsize;
+ cache->fs_info = root->fs_info;
+ cache->full_stripe_len = btrfs_full_stripe_len(root,
+ &root->fs_info->mapping_tree,
+ start);
+ atomic_set(&cache->count, 1);
+ spin_lock_init(&cache->lock);
+ init_rwsem(&cache->data_rwsem);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+ INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->dirty_list);
+ INIT_LIST_HEAD(&cache->io_list);
+ btrfs_init_free_space_ctl(cache);
+ atomic_set(&cache->trimming, 0);
+
+ return cache;
+}
+
+int btrfs_read_block_groups(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *space_info;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ int need_clear = 0;
+ u64 cache_gen;
+
+ root = info->extent_root;
+ key.objectid = 0;
+ key.offset = 0;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 1;
+
+ cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+ if (btrfs_test_opt(root, SPACE_CACHE) &&
+ btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
+ need_clear = 1;
+ if (btrfs_test_opt(root, CLEAR_CACHE))
+ need_clear = 1;
+
+ while (1) {
+ ret = find_first_block_group(root, path, &key);
+ if (ret > 0)
+ break;
+ if (ret != 0)
+ goto error;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ cache = btrfs_create_block_group_cache(root, found_key.objectid,
+ found_key.offset);
+ if (!cache) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (need_clear) {
+ /*
+ * When we mount with old space cache, we need to
+ * set BTRFS_DC_CLEAR and set dirty flag.
+ *
+ * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+ * truncate the old free space cache inode and
+ * setup a new one.
+ * b) Setting 'dirty flag' makes sure that we flush
+ * the new space cache info onto disk.
+ */
+ if (btrfs_test_opt(root, SPACE_CACHE))
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+ }
+
+ read_extent_buffer(leaf, &cache->item,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(cache->item));
+ cache->flags = btrfs_block_group_flags(&cache->item);
+
+ key.objectid = found_key.objectid + found_key.offset;
+ btrfs_release_path(path);
+
+ /*
+ * We need to exclude the super stripes now so that the space
+ * info has super bytes accounted for, otherwise we'll think
+ * we have more space than we actually do.
+ */
+ ret = exclude_super_stripes(root, cache);
+ if (ret) {
+ /*
+ * We may have excluded something, so call this just in
+ * case.
+ */
+ free_excluded_extents(root, cache);
+ btrfs_put_block_group(cache);
+ goto error;
+ }
+
+ /*
+ * check for two cases, either we are full, and therefore
+ * don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all
+ * the space in and be done with it. This saves us _alot_ of
+ * time, particularly in the full case.
+ */
+ if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ free_excluded_extents(root, cache);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ add_new_free_space(cache, root->fs_info,
+ found_key.objectid,
+ found_key.objectid +
+ found_key.offset);
+ free_excluded_extents(root, cache);
+ }
+
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ goto error;
+ }
+
+ ret = update_space_info(info, cache->flags, found_key.offset,
+ btrfs_block_group_used(&cache->item),
+ &space_info);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ spin_lock(&info->block_group_cache_lock);
+ rb_erase(&cache->cache_node,
+ &info->block_group_cache_tree);
+ RB_CLEAR_NODE(&cache->cache_node);
+ spin_unlock(&info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ goto error;
+ }
+
+ cache->space_info = space_info;
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_readonly += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
+ __link_block_group(space_info, cache);
+
+ set_avail_alloc_bits(root->fs_info, cache->flags);
+ if (btrfs_chunk_readonly(root, cache->key.objectid)) {
+ set_block_group_ro(cache, 1);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ /* Should always be true but just in case. */
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
+ }
+
+ list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+ if (!(get_alloc_profile(root, space_info->flags) &
+ (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
+ BTRFS_BLOCK_GROUP_DUP)))
+ continue;
+ /*
+ * avoid allocating from un-mirrored block group if there are
+ * mirrored block groups.
+ */
+ list_for_each_entry(cache,
+ &space_info->block_groups[BTRFS_RAID_RAID0],
+ list)
+ set_block_group_ro(cache, 1);
+ list_for_each_entry(cache,
+ &space_info->block_groups[BTRFS_RAID_SINGLE],
+ list)
+ set_block_group_ro(cache, 1);
+ }
+
+ init_global_block_rsv(info);
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_block_group_item item;
+ struct btrfs_key key;
+ int ret = 0;
+
+ list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ if (ret)
+ goto next;
+
+ spin_lock(&block_group->lock);
+ memcpy(&item, &block_group->item, sizeof(item));
+ memcpy(&key, &block_group->key, sizeof(key));
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_insert_item(trans, extent_root, &key, &item,
+ sizeof(item));
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
+ ret = btrfs_finish_chunk_alloc(trans, extent_root,
+ key.objectid, key.offset);
+ if (ret)
+ btrfs_abort_transaction(trans, extent_root, ret);
+next:
+ list_del_init(&block_group->bg_list);
+ }
+}
+
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytes_used,
+ u64 type, u64 chunk_objectid, u64 chunk_offset,
+ u64 size)
+{
+ int ret;
+ struct btrfs_root *extent_root;
+ struct btrfs_block_group_cache *cache;
+
+ extent_root = root->fs_info->extent_root;
+
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
+ cache = btrfs_create_block_group_cache(root, chunk_offset, size);
+ if (!cache)
+ return -ENOMEM;
+
+ btrfs_set_block_group_used(&cache->item, bytes_used);
+ btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
+ btrfs_set_block_group_flags(&cache->item, type);
+
+ cache->flags = type;
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ ret = exclude_super_stripes(root, cache);
+ if (ret) {
+ /*
+ * We may have excluded something, so call this just in
+ * case.
+ */
+ free_excluded_extents(root, cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
+ add_new_free_space(cache, root->fs_info, chunk_offset,
+ chunk_offset + size);
+
+ free_excluded_extents(root, cache);
+
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
+ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
+ &cache->space_info);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ rb_erase(&cache->cache_node,
+ &root->fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&cache->cache_node);
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+ update_global_block_rsv(root->fs_info);
+
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_readonly += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
+ __link_block_group(cache->space_info, cache);
+
+ list_add_tail(&cache->bg_list, &trans->new_bgs);
+
+ set_avail_alloc_bits(extent_root->fs_info, type);
+
+ return 0;
+}
+
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = chunk_to_extended(flags) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ write_seqlock(&fs_info->profiles_lock);
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
+}
+
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 group_start,
+ struct extent_map *em)
+{
+ struct btrfs_path *path;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_free_cluster *cluster;
+ struct btrfs_root *tree_root = root->fs_info->tree_root;
+ struct btrfs_key key;
+ struct inode *inode;
+ struct kobject *kobj = NULL;
+ int ret;
+ int index;
+ int factor;
+ struct btrfs_caching_control *caching_ctl = NULL;
+ bool remove_em;
+
+ root = root->fs_info->extent_root;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, group_start);
+ BUG_ON(!block_group);
+ BUG_ON(!block_group->ro);
+
+ /*
+ * Free the reserved super bytes from this block group before
+ * remove it.
+ */
+ free_excluded_extents(root, block_group);
+
+ memcpy(&key, &block_group->key, sizeof(key));
+ index = get_block_group_index(block_group);
+ if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
+
+ /* make sure this block group isn't part of an allocation cluster */
+ cluster = &root->fs_info->data_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ /*
+ * make sure this block group isn't part of a metadata
+ * allocation cluster
+ */
+ cluster = &root->fs_info->meta_alloc_cluster;
+ spin_lock(&cluster->refill_lock);
+ btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&cluster->refill_lock);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * get the inode first so any iput calls done for the io_list
+ * aren't the final iput (no unlinks allowed now)
+ */
+ inode = lookup_free_space_inode(tree_root, block_group, path);
+
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ /*
+ * make sure our free spache cache IO is done before remove the
+ * free space inode
+ */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ btrfs_wait_cache_io(root, trans, block_group,
+ &block_group->io_ctl, path,
+ block_group->key.objectid);
+ btrfs_put_block_group(block_group);
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ }
+
+ if (!list_empty(&block_group->dirty_list)) {
+ list_del_init(&block_group->dirty_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+ if (!IS_ERR(inode)) {
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
+ clear_nlink(inode);
+ /* One for the block groups ref */
+ spin_lock(&block_group->lock);
+ if (block_group->iref) {
+ block_group->iref = 0;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ /* One for our lookup ref */
+ btrfs_add_delayed_iput(inode);
+ }
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = block_group->key.objectid;
+ key.type = 0;
+
+ ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ btrfs_release_path(path);
+ if (ret == 0) {
+ ret = btrfs_del_item(trans, tree_root, path);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ rb_erase(&block_group->cache_node,
+ &root->fs_info->block_group_cache_tree);
+ RB_CLEAR_NODE(&block_group->cache_node);
+
+ if (root->fs_info->first_logical_byte == block_group->key.objectid)
+ root->fs_info->first_logical_byte = (u64)-1;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+
+ down_write(&block_group->space_info->groups_sem);
+ /*
+ * we must use list_del_init so people can check to see if they
+ * are still on the list after taking the semaphore
+ */
+ list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index])) {
+ kobj = block_group->space_info->block_group_kobjs[index];
+ block_group->space_info->block_group_kobjs[index] = NULL;
+ clear_avail_alloc_bits(root->fs_info, block_group->flags);
+ }
+ up_write(&block_group->space_info->groups_sem);
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+
+ if (block_group->has_caching_ctl)
+ caching_ctl = get_caching_control(block_group);
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ wait_block_group_cache_done(block_group);
+ if (block_group->has_caching_ctl) {
+ down_write(&root->fs_info->commit_root_sem);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl,
+ &root->fs_info->caching_block_groups, list)
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ atomic_inc(&caching_ctl->count);
+ break;
+ }
+ }
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ up_write(&root->fs_info->commit_root_sem);
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ put_caching_control(caching_ctl);
+ put_caching_control(caching_ctl);
+ }
+ }
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->dirty_list)) {
+ WARN_ON(1);
+ }
+ if (!list_empty(&block_group->io_list)) {
+ WARN_ON(1);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ btrfs_remove_free_space_cache(block_group);
+
+ spin_lock(&block_group->space_info->lock);
+ list_del_init(&block_group->ro_list);
+
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ WARN_ON(block_group->space_info->total_bytes
+ < block_group->key.offset);
+ WARN_ON(block_group->space_info->bytes_readonly
+ < block_group->key.offset);
+ WARN_ON(block_group->space_info->disk_total
+ < block_group->key.offset * factor);
+ }
+ block_group->space_info->total_bytes -= block_group->key.offset;
+ block_group->space_info->bytes_readonly -= block_group->key.offset;
+ block_group->space_info->disk_total -= block_group->key.offset * factor;
+
+ spin_unlock(&block_group->space_info->lock);
+
+ memcpy(&key, &block_group->key, sizeof(key));
+
+ lock_chunks(root);
+ if (!list_empty(&em->list)) {
+ /* We're in the transaction->pending_chunks list. */
+ free_extent_map(em);
+ }
+ spin_lock(&block_group->lock);
+ block_group->removed = 1;
+ /*
+ * At this point trimming can't start on this block group, because we
+ * removed the block group from the tree fs_info->block_group_cache_tree
+ * so no one can't find it anymore and even if someone already got this
+ * block group before we removed it from the rbtree, they have already
+ * incremented block_group->trimming - if they didn't, they won't find
+ * any free space entries because we already removed them all when we
+ * called btrfs_remove_free_space_cache().
+ *
+ * And we must not remove the extent map from the fs_info->mapping_tree
+ * to prevent the same logical address range and physical device space
+ * ranges from being reused for a new block group. This is because our
+ * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * completely transactionless, so while it is trimming a range the
+ * currently running transaction might finish and a new one start,
+ * allowing for new block groups to be created that can reuse the same
+ * physical device locations unless we take this special care.
+ */
+ remove_em = (atomic_read(&block_group->trimming) == 0);
+ /*
+ * Make sure a trimmer task always sees the em in the pinned_chunks list
+ * if it sees block_group->removed == 1 (needs to lock block_group->lock
+ * before checking block_group->removed).
+ */
+ if (!remove_em) {
+ /*
+ * Our em might be in trans->transaction->pending_chunks which
+ * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+ * and so is the fs_info->pinned_chunks list.
+ *
+ * So at this point we must be holding the chunk_mutex to avoid
+ * any races with chunk allocation (more specifically at
+ * volumes.c:contains_pending_extent()), to ensure it always
+ * sees the em, either in the pending_chunks list or in the
+ * pinned_chunks list.
+ */
+ list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+ }
+ spin_unlock(&block_group->lock);
+
+ if (remove_em) {
+ struct extent_map_tree *em_tree;
+
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ /*
+ * The em might be in the pending_chunks list, so make sure the
+ * chunk mutex is locked, since remove_extent_mapping() will
+ * delete us from that list.
+ */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+
+ unlock_chunks(root);
+
+ btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -EIO;
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!fs_info->open)
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
+ u64 start, end;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ space_info = block_group->space_info;
+ list_del_init(&block_group->bg_list);
+ if (ret || btrfs_mixed_space_info(space_info)) {
+ btrfs_put_block_group(block_group);
+ continue;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't want to race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ btrfs_block_group_used(&block_group->item) ||
+ block_group->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&block_group->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&block_group->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = set_block_group_ro(block_group, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0) {
+ ret = 0;
+ goto next;
+ }
+
+ /*
+ * Want to do this before we do anything else so we can recover
+ * properly if we fail to join the transaction.
+ */
+ /* 1 for btrfs_orphan_reserve_metadata() */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_set_block_group_rw(root, block_group);
+ ret = PTR_ERR(trans);
+ goto next;
+ }
+
+ /*
+ * We could have pending pinned extents for this block group,
+ * just delete them, we don't care about them anymore.
+ */
+ start = block_group->key.objectid;
+ end = start + block_group->key.offset - 1;
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N,
+ * another task might be running finish_extent_commit() for the
+ * previous transaction N - 1, and have seen a range belonging
+ * to the block group in freed_extents[] before we were able to
+ * clear the whole block group range from freed_extents[]. This
+ * means that task can lookup for the block group after we
+ * unpinned it from freed_extents[] and removed it, leading to
+ * a BUG_ON() at btrfs_unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_set_block_group_rw(root, block_group);
+ goto end_trans;
+ }
+ ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
+ EXTENT_DIRTY, GFP_NOFS);
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_set_block_group_rw(root, block_group);
+ goto end_trans;
+ }
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+
+ /* Reset pinned so btrfs_put_block_group doesn't complain */
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+
+ space_info->bytes_pinned -= block_group->pinned;
+ space_info->bytes_readonly += block_group->pinned;
+ percpu_counter_add(&space_info->total_bytes_pinned,
+ -block_group->pinned);
+ block_group->pinned = 0;
+
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ /*
+ * Btrfs_remove_chunk will abort the transaction if things go
+ * horribly wrong.
+ */
+ ret = btrfs_remove_chunk(trans, root,
+ block_group->key.objectid);
+end_trans:
+ btrfs_end_transaction(trans, root);
+next:
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+ struct btrfs_super_block *disk_super;
+ u64 features;
+ u64 flags;
+ int mixed = 0;
+ int ret;
+
+ disk_super = fs_info->super_copy;
+ if (!btrfs_super_root(disk_super))
+ return 1;
+
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ if (ret)
+ goto out;
+
+ if (mixed) {
+ flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ } else {
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ if (ret)
+ goto out;
+
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ }
+out:
+ return ret;
+}
+
+int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+{
+ return unpin_extent_range(root, start, end, false);
+}
+
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 group_trimmed;
+ u64 start;
+ u64 end;
+ u64 trimmed = 0;
+ u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ int ret = 0;
+
+ /*
+ * try to trim all FS space, our block group may start from non-zero.
+ */
+ if (range->len == total_bytes)
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ else
+ cache = btrfs_lookup_block_group(fs_info, range->start);
+
+ while (cache) {
+ if (cache->key.objectid >= (range->start + range->len)) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+
+ start = max(range->start, cache->key.objectid);
+ end = min(range->start + range->len,
+ cache->key.objectid + cache->key.offset);
+
+ if (end - start >= range->minlen) {
+ if (!block_group_cache_done(cache)) {
+ ret = cache_block_group(cache, 0);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ ret = wait_block_group_cache_done(cache);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+ ret = btrfs_trim_block_group(cache,
+ &group_trimmed,
+ start,
+ end,
+ range->minlen);
+
+ trimmed += group_trimmed;
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+
+ cache = next_block_group(fs_info->tree_root, cache);
+ }
+
+ range->len = trimmed;
+ return ret;
+}
+
+/*
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data into disk after the snapshot creation, or to prevent
+ * operations while snapshoting is ongoing and that cause the snapshot to be
+ * inconsistent (writes followed by expanding truncates for example).
+ */
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
+{
+ percpu_counter_dec(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we wake up
+ * waiters.
+ */
+ smp_mb();
+ if (waitqueue_active(&root->subv_writers->wait))
+ wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
+{
+ if (atomic_read(&root->will_be_snapshoted))
+ return 0;
+
+ percpu_counter_inc(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we check for snapshot creation.
+ */
+ smp_mb();
+ if (atomic_read(&root->will_be_snapshoted)) {
+ btrfs_end_write_no_snapshoting(root);
+ return 0;
+ }
+ return 1;
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000..c32d226bf
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,5632 @@
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/prefetch.h>
+#include <linux/cleancache.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "check-integrity.h"
+#include "locking.h"
+#include "rcu-string.h"
+#include "backref.h"
+
+static struct kmem_cache *extent_state_cache;
+static struct kmem_cache *extent_buffer_cache;
+static struct bio_set *btrfs_bioset;
+
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static LIST_HEAD(buffers);
+static LIST_HEAD(states);
+
+static DEFINE_SPINLOCK(leak_lock);
+
+static inline
+void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(new, head);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_del(struct list_head *entry)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(entry);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_check(void)
+{
+ struct extent_state *state;
+ struct extent_buffer *eb;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
+ atomic_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+ }
+
+ while (!list_empty(&buffers)) {
+ eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+ printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
+ "refs %d\n",
+ eb->start, eb->len, atomic_read(&eb->refs));
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct extent_io_tree *tree, u64 start, u64 end)
+{
+ struct inode *inode;
+ u64 isize;
+
+ if (!tree->mapping)
+ return;
+
+ inode = tree->mapping->host;
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ printk_ratelimited(KERN_DEBUG
+ "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ caller, btrfs_ino(inode), isize, start, end);
+ }
+}
+#else
+#define btrfs_leak_debug_add(new, head) do {} while (0)
+#define btrfs_leak_debug_del(entry) do {} while (0)
+#define btrfs_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#endif
+
+#define BUFFER_LRU_MAX 64
+
+struct tree_entry {
+ u64 start;
+ u64 end;
+ struct rb_node rb_node;
+};
+
+struct extent_page_data {
+ struct bio *bio;
+ struct extent_io_tree *tree;
+ get_extent_t *get_extent;
+ unsigned long bio_flags;
+
+ /* tells writepage not to lock the state bits for this range
+ * it still does the unlocking
+ */
+ unsigned int extent_locked:1;
+
+ /* tells the submit_bio code to use a WRITE_SYNC */
+ unsigned int sync_io:1;
+};
+
+static noinline void flush_write_bio(void *data);
+static inline struct btrfs_fs_info *
+tree_fs_info(struct extent_io_tree *tree)
+{
+ if (!tree->mapping)
+ return NULL;
+ return btrfs_sb(tree->mapping->host->i_sb);
+}
+
+int __init extent_io_init(void)
+{
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
+ sizeof(struct extent_state), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!extent_state_cache)
+ return -ENOMEM;
+
+ extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
+ sizeof(struct extent_buffer), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!extent_buffer_cache)
+ goto free_state_cache;
+
+ btrfs_bioset = bioset_create(BIO_POOL_SIZE,
+ offsetof(struct btrfs_io_bio, bio));
+ if (!btrfs_bioset)
+ goto free_buffer_cache;
+
+ if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
+ goto free_bioset;
+
+ return 0;
+
+free_bioset:
+ bioset_free(btrfs_bioset);
+ btrfs_bioset = NULL;
+
+free_buffer_cache:
+ kmem_cache_destroy(extent_buffer_cache);
+ extent_buffer_cache = NULL;
+
+free_state_cache:
+ kmem_cache_destroy(extent_state_cache);
+ extent_state_cache = NULL;
+ return -ENOMEM;
+}
+
+void extent_io_exit(void)
+{
+ btrfs_leak_debug_check();
+
+ /*
+ * Make sure all delayed rcu free are flushed before we
+ * destroy caches.
+ */
+ rcu_barrier();
+ if (extent_state_cache)
+ kmem_cache_destroy(extent_state_cache);
+ if (extent_buffer_cache)
+ kmem_cache_destroy(extent_buffer_cache);
+ if (btrfs_bioset)
+ bioset_free(btrfs_bioset);
+}
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+ struct address_space *mapping)
+{
+ tree->state = RB_ROOT;
+ tree->ops = NULL;
+ tree->dirty_bytes = 0;
+ spin_lock_init(&tree->lock);
+ tree->mapping = mapping;
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+ struct extent_state *state;
+
+ state = kmem_cache_alloc(extent_state_cache, mask);
+ if (!state)
+ return state;
+ state->state = 0;
+ state->private = 0;
+ RB_CLEAR_NODE(&state->rb_node);
+ btrfs_leak_debug_add(&state->leak_list, &states);
+ atomic_set(&state->refs, 1);
+ init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
+ return state;
+}
+
+void free_extent_state(struct extent_state *state)
+{
+ if (!state)
+ return;
+ if (atomic_dec_and_test(&state->refs)) {
+ WARN_ON(extent_state_in_tree(state));
+ btrfs_leak_debug_del(&state->leak_list);
+ trace_free_extent_state(state, _RET_IP_);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+static struct rb_node *tree_insert(struct rb_root *root,
+ struct rb_node *search_start,
+ u64 offset,
+ struct rb_node *node,
+ struct rb_node ***p_in,
+ struct rb_node **parent_in)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct tree_entry *entry;
+
+ if (p_in && parent_in) {
+ p = *p_in;
+ parent = *parent_in;
+ goto do_insert;
+ }
+
+ p = search_start ? &search_start : &root->rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (offset < entry->start)
+ p = &(*p)->rb_left;
+ else if (offset > entry->end)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+do_insert:
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret,
+ struct rb_node ***p_ret,
+ struct rb_node **parent_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **n = &root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev = NULL;
+ struct tree_entry *entry;
+ struct tree_entry *prev_entry = NULL;
+
+ while (*n) {
+ prev = *n;
+ entry = rb_entry(prev, struct tree_entry, rb_node);
+ prev_entry = entry;
+
+ if (offset < entry->start)
+ n = &(*n)->rb_left;
+ else if (offset > entry->end)
+ n = &(*n)->rb_right;
+ else
+ return *n;
+ }
+
+ if (p_ret)
+ *p_ret = n;
+ if (parent_ret)
+ *parent_ret = prev;
+
+ if (prev_ret) {
+ orig_prev = prev;
+ while (prev && offset > prev_entry->end) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *prev_ret = prev;
+ prev = orig_prev;
+ }
+
+ if (next_ret) {
+ prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
+ prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *next_ret = prev;
+ }
+ return NULL;
+}
+
+static inline struct rb_node *
+tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***p_ret,
+ struct rb_node **parent_ret)
+{
+ struct rb_node *prev = NULL;
+ struct rb_node *ret;
+
+ ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
+ if (!ret)
+ return prev;
+ return ret;
+}
+
+static inline struct rb_node *tree_search(struct extent_io_tree *tree,
+ u64 offset)
+{
+ return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
+static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
+ struct extent_state *other)
+{
+ if (tree->ops && tree->ops->merge_extent_hook)
+ tree->ops->merge_extent_hook(tree->mapping->host, new,
+ other);
+}
+
+/*
+ * utility function to look for merge candidates inside a given range.
+ * Any extents with matching state are merged together into a single
+ * extent in the tree. Extents with EXTENT_IO in their state field
+ * are not merged because the end_io handlers need to be able to do
+ * operations on them without sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static void merge_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+{
+ struct extent_state *other;
+ struct rb_node *other_node;
+
+ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ return;
+
+ other_node = rb_prev(&state->rb_node);
+ if (other_node) {
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->end == state->start - 1 &&
+ other->state == state->state) {
+ merge_cb(tree, state, other);
+ state->start = other->start;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+ }
+ other_node = rb_next(&state->rb_node);
+ if (other_node) {
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->start == state->end + 1 &&
+ other->state == state->state) {
+ merge_cb(tree, state, other);
+ state->end = other->end;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+ }
+}
+
+static void set_state_cb(struct extent_io_tree *tree,
+ struct extent_state *state, unsigned *bits)
+{
+ if (tree->ops && tree->ops->set_bit_hook)
+ tree->ops->set_bit_hook(tree->mapping->host, state, bits);
+}
+
+static void clear_state_cb(struct extent_io_tree *tree,
+ struct extent_state *state, unsigned *bits)
+{
+ if (tree->ops && tree->ops->clear_bit_hook)
+ tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state, unsigned *bits);
+
+/*
+ * insert an extent_state struct into the tree. 'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally. This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+ struct extent_state *state, u64 start, u64 end,
+ struct rb_node ***p,
+ struct rb_node **parent,
+ unsigned *bits)
+{
+ struct rb_node *node;
+
+ if (end < start)
+ WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
+ end, start);
+ state->start = start;
+ state->end = end;
+
+ set_state_bits(tree, state, bits);
+
+ node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
+ if (node) {
+ struct extent_state *found;
+ found = rb_entry(node, struct extent_state, rb_node);
+ printk(KERN_ERR "BTRFS: found node %llu %llu on insert of "
+ "%llu %llu\n",
+ found->start, found->end, start, end);
+ return -EEXIST;
+ }
+ merge_state(tree, state);
+ return 0;
+}
+
+static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+ u64 split)
+{
+ if (tree->ops && tree->ops->split_extent_hook)
+ tree->ops->split_extent_hook(tree->mapping->host, orig, split);
+}
+
+/*
+ * split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end]. After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+{
+ struct rb_node *node;
+
+ split_cb(tree, orig, split);
+
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+ orig->start = split;
+
+ node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+ &prealloc->rb_node, NULL, NULL);
+ if (node) {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
+ return 0;
+}
+
+static struct extent_state *next_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_next(&state->rb_node);
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up any one waiting on this state (wake == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state,
+ unsigned *bits, int wake)
+{
+ struct extent_state *next;
+ unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
+
+ if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+ WARN_ON(range > tree->dirty_bytes);
+ tree->dirty_bytes -= range;
+ }
+ clear_state_cb(tree, state, bits);
+ state->state &= ~bits_to_clear;
+ if (wake)
+ wake_up(&state->wq);
+ if (state->state == 0) {
+ next = next_state(state);
+ if (extent_state_in_tree(state)) {
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ free_extent_state(state);
+ } else {
+ WARN_ON(1);
+ }
+ } else {
+ merge_state(tree, state);
+ next = next_state(state);
+ }
+ return next;
+}
+
+static struct extent_state *
+alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+ "Extent tree was modified by another "
+ "thread while locked.");
+}
+
+/*
+ * clear some bits on a range in the tree. This may require splitting
+ * or inserting elements in the tree, so the gfp mask is used to
+ * indicate which allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
+ */
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *cached;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ u64 last_end;
+ int err;
+ int clear = 0;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+ bits |= EXTENT_FIRST_DELALLOC;
+
+ if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ clear = 1;
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+
+ if (clear) {
+ *cached_state = NULL;
+ cached_state = NULL;
+ }
+
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
+ if (clear)
+ atomic_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ if (clear)
+ free_extent_state(cached);
+ }
+ /*
+ * this search will find the extents that end after
+ * our range starts
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ goto out;
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+ last_end = state->end;
+
+ /* the state doesn't have the wanted bits, go ahead */
+ if (!(state->state & bits)) {
+ state = next_state(state);
+ goto next;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state | or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip
+ * bits on second half.
+ *
+ * If the extent we found extends past our range, we
+ * just split and search again. It'll get split again
+ * the next time though.
+ *
+ * If the extent we found is inside our range, we clear
+ * the desired bit on it.
+ */
+
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ state = clear_state_bit(tree, state, &bits, wake);
+ goto next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and clear the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ if (wake)
+ wake_up(&state->wq);
+
+ clear_state_bit(tree, prealloc, &bits, wake);
+
+ prealloc = NULL;
+ goto out;
+ }
+
+ state = clear_state_bit(tree, state, &bits, wake);
+next:
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start <= end && state && !need_resched())
+ goto hit_next;
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return 0;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+ __releases(tree->lock)
+ __acquires(tree->lock)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
+}
+
+/*
+ * waits for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned long bits)
+{
+ struct extent_state *state;
+ struct rb_node *node;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ spin_lock(&tree->lock);
+again:
+ while (1) {
+ /*
+ * this search will find all the extents that end after
+ * our range starts
+ */
+ node = tree_search(tree, start);
+process_node:
+ if (!node)
+ break;
+
+ state = rb_entry(node, struct extent_state, rb_node);
+
+ if (state->start > end)
+ goto out;
+
+ if (state->state & bits) {
+ start = state->start;
+ atomic_inc(&state->refs);
+ wait_on_state(tree, state);
+ free_extent_state(state);
+ goto again;
+ }
+ start = state->end + 1;
+
+ if (start > end)
+ break;
+
+ if (!cond_resched_lock(&tree->lock)) {
+ node = rb_next(node);
+ goto process_node;
+ }
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ unsigned *bits)
+{
+ unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
+
+ set_state_cb(tree, state, bits);
+ if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+ tree->dirty_bytes += range;
+ }
+ state->state |= bits_to_set;
+}
+
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ unsigned flags)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (!flags || (state->state & flags)) {
+ *cached_ptr = state;
+ atomic_inc(&state->refs);
+ }
+ }
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_IOBITS | EXTENT_BOUNDARY);
+}
+
+/*
+ * set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive This takes the tree lock.
+ */
+
+static int __must_check
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, unsigned exclusive_bits,
+ u64 *failed_start, struct extent_state **cached_state,
+ gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ bits |= EXTENT_FIRST_DELALLOC;
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ BUG_ON(!prealloc);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state)) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search_for_insert(tree, start, &p, &parent);
+ if (!node) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = insert_state(tree, prealloc, start, end,
+ &p, &parent, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on
+ * second half.
+ *
+ * If the extent we found extends past our
+ * range, we just split and search again. It'll get split
+ * again the next time though.
+ *
+ * If the extent we found is inside our range, we set the
+ * desired bit on it.
+ */
+ if (state->start < start) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and
+ * ignore the extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with
+ * the later extent.
+ */
+ err = insert_state(tree, prealloc, start, this_end,
+ NULL, NULL, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and set the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, &bits);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, u64 * failed_start,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+ cached_state, mask);
+}
+
+
+/**
+ * convert_extent_bit - convert all bits in a given range from one bit to
+ * another
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
+ * @mask: the allocation mask
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, unsigned clear_bits,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ bool first_iteration = true;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc && !first_iteration)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state)) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
+
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search_for_insert(tree, start, &p, &parent);
+ if (!node) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = insert_state(tree, prealloc, start, end,
+ &p, &parent, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on
+ * second half.
+ *
+ * If the extent we found extends past our
+ * range, we just split and search again. It'll get split
+ * again the next time though.
+ *
+ * If the extent we found is inside our range, we set the
+ * desired bit on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, &bits);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and
+ * ignore the extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with
+ * the later extent.
+ */
+ err = insert_state(tree, prealloc, start, this_end,
+ NULL, NULL, &bits);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and set the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, &bits);
+ cache_state(prealloc, cached_state);
+ clear_state_bit(tree, prealloc, &clear_bits, 0);
+ prealloc = NULL;
+ goto out;
+ }
+
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ first_iteration = false;
+ goto again;
+}
+
+/* wrappers around set/clear extent bit */
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
+ NULL, mask);
+}
+
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, bits, NULL,
+ NULL, mask);
+}
+
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+}
+
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE,
+ NULL, cached_state, mask);
+}
+
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, mask);
+}
+
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+}
+
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
+ NULL, mask);
+}
+
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+ cached_state, mask);
+}
+
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ cached_state, mask);
+}
+
+/*
+ * either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, struct extent_state **cached_state)
+{
+ int err;
+ u64 failed_start;
+
+ while (1) {
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, GFP_NOFS);
+ if (err == -EEXIST) {
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ start = failed_start;
+ } else
+ break;
+ WARN_ON(start > end);
+ }
+ return err;
+}
+
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return lock_extent_bits(tree, start, end, 0, NULL);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, GFP_NOFS);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+ return 0;
+ }
+ return 1;
+}
+
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ mask);
+}
+
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+ GFP_NOFS);
+}
+
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(inode->i_mapping, index);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
+ clear_page_dirty_for_io(page);
+ page_cache_release(page);
+ index++;
+ }
+ return 0;
+}
+
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(inode->i_mapping, index);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
+ __set_page_dirty_nobuffers(page);
+ account_page_redirty(page);
+ page_cache_release(page);
+ index++;
+ }
+ return 0;
+}
+
+/*
+ * helper function to set both pages and extents in the tree writeback
+ */
+static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(tree->mapping, index);
+ BUG_ON(!page); /* Pages should be in the extent_io_tree */
+ set_page_writeback(page);
+ page_cache_release(page);
+ index++;
+ }
+ return 0;
+}
+
+/* find the first state struct with 'bits' set after 'start', and
+ * return it. tree->lock must be held. NULL will returned if
+ * nothing was found after 'start'
+ */
+static struct extent_state *
+find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, unsigned bits)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ goto out;
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->end >= start && (state->state & bits))
+ return state;
+
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ return NULL;
+}
+
+/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned. If found something, return 0.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ struct rb_node *n;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
+ n = rb_next(&state->rb_node);
+ while (n) {
+ state = rb_entry(n, struct extent_state,
+ rb_node);
+ if (state->state & bits)
+ goto got_it;
+ n = rb_next(n);
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
+ state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+ if (state) {
+ cache_state_if_flags(state, cached_state, 0);
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+ u64 *start, u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 found = 0;
+ u64 total_bytes = 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, cur_start);
+ if (!node) {
+ if (!found)
+ *end = (u64)-1;
+ goto out;
+ }
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (found && (state->start != cur_start ||
+ (state->state & EXTENT_BOUNDARY))) {
+ goto out;
+ }
+ if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
+ goto out;
+ }
+ if (!found) {
+ *start = state->start;
+ *cached_state = state;
+ atomic_inc(&state->refs);
+ }
+ found++;
+ *end = state->end;
+ cur_start = state->end + 1;
+ node = rb_next(node);
+ total_bytes += state->end - state->start + 1;
+ if (total_bytes >= max_bytes)
+ break;
+ if (!node)
+ break;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return found;
+}
+
+static noinline void __unlock_for_delalloc(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+
+ if (index == locked_page->index && end_index == index)
+ return;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long, nr_pages,
+ ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+ if (pages[i] != locked_page)
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ }
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+ struct page *locked_page,
+ u64 delalloc_start,
+ u64 delalloc_end)
+{
+ unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+ unsigned long start_index = index;
+ unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+ unsigned long pages_locked = 0;
+ struct page *pages[16];
+ unsigned long nrpages;
+ int ret;
+ int i;
+
+ /* the caller is responsible for locking the start index */
+ if (index == locked_page->index && index == end_index)
+ return 0;
+
+ /* skip the page at the start index */
+ nrpages = end_index - index + 1;
+ while (nrpages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long,
+ nrpages, ARRAY_SIZE(pages)), pages);
+ if (ret == 0) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ /* now we have an array of pages, lock them all */
+ for (i = 0; i < ret; i++) {
+ /*
+ * the caller is taking responsibility for
+ * locked_page
+ */
+ if (pages[i] != locked_page) {
+ lock_page(pages[i]);
+ if (!PageDirty(pages[i]) ||
+ pages[i]->mapping != inode->i_mapping) {
+ ret = -EAGAIN;
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ goto done;
+ }
+ }
+ page_cache_release(pages[i]);
+ pages_locked++;
+ }
+ nrpages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ ret = 0;
+done:
+ if (ret && pages_locked) {
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start,
+ ((u64)(start_index + pages_locked - 1)) <<
+ PAGE_CACHE_SHIFT);
+ }
+ return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+STATIC u64 find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes)
+{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 found;
+ struct extent_state *cached_state = NULL;
+ int ret;
+ int loops = 0;
+
+again:
+ /* step one, find a bunch of delalloc bytes starting at start */
+ delalloc_start = *start;
+ delalloc_end = 0;
+ found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ max_bytes, &cached_state);
+ if (!found || delalloc_end <= *start) {
+ *start = delalloc_start;
+ *end = delalloc_end;
+ free_extent_state(cached_state);
+ return 0;
+ }
+
+ /*
+ * start comes from the offset of locked_page. We have to lock
+ * pages in order, so we can't process delalloc bytes before
+ * locked_page
+ */
+ if (delalloc_start < *start)
+ delalloc_start = *start;
+
+ /*
+ * make sure to limit the number of pages we try to lock down
+ */
+ if (delalloc_end + 1 - delalloc_start > max_bytes)
+ delalloc_end = delalloc_start + max_bytes - 1;
+
+ /* step two, lock all the pages after the page that has start */
+ ret = lock_delalloc_pages(inode, locked_page,
+ delalloc_start, delalloc_end);
+ if (ret == -EAGAIN) {
+ /* some of the pages are gone, lets avoid looping by
+ * shortening the size of the delalloc range we're searching
+ */
+ free_extent_state(cached_state);
+ cached_state = NULL;
+ if (!loops) {
+ max_bytes = PAGE_CACHE_SIZE;
+ loops = 1;
+ goto again;
+ } else {
+ found = 0;
+ goto out_failed;
+ }
+ }
+ BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
+
+ /* step three, lock the state bits for the whole range */
+ lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
+
+ /* then test to make sure it is all still delalloc */
+ ret = test_range_bit(tree, delalloc_start, delalloc_end,
+ EXTENT_DELALLOC, 1, cached_state);
+ if (!ret) {
+ unlock_extent_cached(tree, delalloc_start, delalloc_end,
+ &cached_state, GFP_NOFS);
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start, delalloc_end);
+ cond_resched();
+ goto again;
+ }
+ free_extent_state(cached_state);
+ *start = delalloc_start;
+ *end = delalloc_end;
+out_failed:
+ return found;
+}
+
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+ struct page *locked_page,
+ unsigned clear_bits,
+ unsigned long page_ops)
+{
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+
+ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
+ if (page_ops == 0)
+ return 0;
+
+ if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+ mapping_set_error(inode->i_mapping, -EIO);
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long,
+ nr_pages, ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+
+ if (page_ops & PAGE_SET_PRIVATE2)
+ SetPagePrivate2(pages[i]);
+
+ if (pages[i] == locked_page) {
+ page_cache_release(pages[i]);
+ continue;
+ }
+ if (page_ops & PAGE_CLEAR_DIRTY)
+ clear_page_dirty_for_io(pages[i]);
+ if (page_ops & PAGE_SET_WRITEBACK)
+ set_page_writeback(pages[i]);
+ if (page_ops & PAGE_SET_ERROR)
+ SetPageError(pages[i]);
+ if (page_ops & PAGE_END_WRITEBACK)
+ end_page_writeback(pages[i]);
+ if (page_ops & PAGE_UNLOCK)
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ return 0;
+}
+
+/*
+ * count the number of bytes in the tree that have a given bit(s)
+ * set. This can be fairly slow, except for EXTENT_DIRTY which is
+ * cached. The total number found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ unsigned bits, int contig)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ u64 last = 0;
+ int found = 0;
+
+ if (WARN_ON(search_end <= cur_start))
+ return 0;
+
+ spin_lock(&tree->lock);
+ if (cur_start == 0 && bits == EXTENT_DIRTY) {
+ total_bytes = tree->dirty_bytes;
+ goto out;
+ }
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, cur_start);
+ if (!node)
+ goto out;
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start > search_end)
+ break;
+ if (contig && found && state->start > last + 1)
+ break;
+ if (state->end >= cur_start && (state->state & bits) == bits) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = max(cur_start, state->start);
+ found = 1;
+ }
+ last = state->end;
+ } else if (contig && found) {
+ break;
+ }
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return total_bytes;
+}
+
+/*
+ * set the private field for a given byte offset in the tree. If there isn't
+ * an extent_state there already, this does nothing.
+ */
+static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ int ret = 0;
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ ret = -ENOENT;
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start != start) {
+ ret = -ENOENT;
+ goto out;
+ }
+ state->private = private;
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ int ret = 0;
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ ret = -ENOENT;
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start != start) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *private = state->private;
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * searches a range in the state tree for a given mask.
+ * If 'filled' == 1, this returns 1 only if every extent in the tree
+ * has the bits set. Otherwise, 1 is returned if any bit in the
+ * range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int filled, struct extent_state *cached)
+{
+ struct extent_state *state = NULL;
+ struct rb_node *node;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
+ cached->end > start)
+ node = &cached->rb_node;
+ else
+ node = tree_search(tree, start);
+ while (node && start <= end) {
+ state = rb_entry(node, struct extent_state, rb_node);
+
+ if (filled && state->start > start) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->start > end)
+ break;
+
+ if (state->state & bits) {
+ bitset = 1;
+ if (!filled)
+ break;
+ } else if (filled) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->end == (u64)-1)
+ break;
+
+ start = state->end + 1;
+ if (start > end)
+ break;
+ node = rb_next(node);
+ if (!node) {
+ if (filled)
+ bitset = 0;
+ break;
+ }
+ }
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/*
+ * helper function to set a given page up to date if all the
+ * extents in the tree for that page are up to date
+ */
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
+{
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
+ SetPageUptodate(page);
+}
+
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
+{
+ int ret;
+ int err = 0;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+ set_state_private(failure_tree, rec->start, 0);
+ ret = clear_extent_bits(failure_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ if (ret)
+ err = ret;
+
+ ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+ rec->start + rec->len - 1,
+ EXTENT_DAMAGED, GFP_NOFS);
+ if (ret && !err)
+ err = ret;
+
+ kfree(rec);
+ return err;
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchronization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+ struct page *page, unsigned int pg_offset, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct bio *bio;
+ struct btrfs_device *dev;
+ u64 map_length = 0;
+ u64 sector;
+ struct btrfs_bio *bbio = NULL;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ int ret;
+
+ ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
+ BUG_ON(!mirror_num);
+
+ /* we can't repair anything in raid56 yet */
+ if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+ return 0;
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio)
+ return -EIO;
+ bio->bi_iter.bi_size = 0;
+ map_length = length;
+
+ ret = btrfs_map_block(fs_info, WRITE, logical,
+ &map_length, &bbio, mirror_num);
+ if (ret) {
+ bio_put(bio);
+ return -EIO;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
+ sector = bbio->stripes[mirror_num-1].physical >> 9;
+ bio->bi_iter.bi_sector = sector;
+ dev = bbio->stripes[mirror_num-1].dev;
+ btrfs_put_bbio(bbio);
+ if (!dev || !dev->bdev || !dev->writeable) {
+ bio_put(bio);
+ return -EIO;
+ }
+ bio->bi_bdev = dev->bdev;
+ bio_add_page(bio, page, length, pg_offset);
+
+ if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
+ /* try to remap that extent elsewhere? */
+ bio_put(bio);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ return -EIO;
+ }
+
+ printk_ratelimited_in_rcu(KERN_INFO
+ "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+ btrfs_ino(inode), start,
+ rcu_str_deref(dev->name), sector);
+ bio_put(bio);
+ return 0;
+}
+
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num)
+{
+ u64 start = eb->start;
+ unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+ int ret = 0;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ ret = repair_io_failure(root->fs_info->btree_inode, start,
+ PAGE_CACHE_SIZE, start, p,
+ start - page_offset(p), mirror_num);
+ if (ret)
+ break;
+ start += PAGE_CACHE_SIZE;
+ }
+
+ return ret;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+ unsigned int pg_offset)
+{
+ u64 private;
+ u64 private_failure;
+ struct io_failure_record *failrec;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct extent_state *state;
+ int num_copies;
+ int ret;
+
+ private = 0;
+ ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+ (u64)-1, 1, EXTENT_DIRTY, 0);
+ if (!ret)
+ return 0;
+
+ ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+ &private_failure);
+ if (ret)
+ return 0;
+
+ failrec = (struct io_failure_record *)(unsigned long) private_failure;
+ BUG_ON(!failrec->this_mirror);
+
+ if (failrec->in_validation) {
+ /* there was no real error, just free the record */
+ pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+ failrec->start);
+ goto out;
+ }
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ goto out;
+
+ spin_lock(&BTRFS_I(inode)->io_tree.lock);
+ state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+ failrec->start,
+ EXTENT_LOCKED);
+ spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+ if (state && state->start <= failrec->start &&
+ state->end >= failrec->start + failrec->len - 1) {
+ num_copies = btrfs_num_copies(fs_info, failrec->logical,
+ failrec->len);
+ if (num_copies > 1) {
+ repair_io_failure(inode, start, failrec->len,
+ failrec->logical, page,
+ pg_offset, failrec->failed_mirror);
+ }
+ }
+
+out:
+ free_io_failure(inode, failrec);
+
+ return 0;
+}
+
+/*
+ * Can be called when
+ * - hold extent lock
+ * - under ordered extent
+ * - the inode is freeing
+ */
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
+{
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct io_failure_record *failrec;
+ struct extent_state *state, *next;
+
+ if (RB_EMPTY_ROOT(&failure_tree->state))
+ return;
+
+ spin_lock(&failure_tree->lock);
+ state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
+ while (state) {
+ if (state->start > end)
+ break;
+
+ ASSERT(state->end <= end);
+
+ next = next_state(state);
+
+ failrec = (struct io_failure_record *)(unsigned long)state->private;
+ free_extent_state(state);
+ kfree(failrec);
+
+ state = next;
+ }
+ spin_unlock(&failure_tree->lock);
+}
+
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret)
+{
+ struct io_failure_record *failrec;
+ u64 private;
+ struct extent_map *em;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ int ret;
+ u64 logical;
+
+ ret = get_state_private(failure_tree, start, &private);
+ if (ret) {
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return -ENOMEM;
+
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->this_mirror = 0;
+ failrec->bio_flags = 0;
+ failrec->in_validation = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (!em) {
+ read_unlock(&em_tree->lock);
+ kfree(failrec);
+ return -EIO;
+ }
+
+ if (em->start > start || em->start + em->len <= start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ kfree(failrec);
+ return -EIO;
+ }
+
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags,
+ em->compress_type);
+ }
+
+ pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
+ logical, start, failrec->len);
+
+ failrec->logical = logical;
+ free_extent_map(em);
+
+ /* set the bits in the private failure tree */
+ ret = set_extent_bits(failure_tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ if (ret >= 0)
+ ret = set_state_private(failure_tree, start,
+ (u64)(unsigned long)failrec);
+ /* set the bits in the inode's tree */
+ if (ret >= 0)
+ ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+ GFP_NOFS);
+ if (ret < 0) {
+ kfree(failrec);
+ return ret;
+ }
+ } else {
+ failrec = (struct io_failure_record *)(unsigned long)private;
+ pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
+ failrec->logical, failrec->start, failrec->len,
+ failrec->in_validation);
+ /*
+ * when data can be on disk more than twice, add to failrec here
+ * (e.g. with a list for failed_mirror) to make
+ * clean_io_failure() clean all those errors at once.
+ */
+ }
+
+ *failrec_ret = failrec;
+
+ return 0;
+}
+
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec, int failed_mirror)
+{
+ int num_copies;
+
+ num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+ failrec->logical, failrec->len);
+ if (num_copies == 1) {
+ /*
+ * we only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. no
+ * matter what the error is, it is very likely to persist.
+ */
+ pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ /*
+ * there are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ */
+ if (failed_bio->bi_vcnt > 1) {
+ /*
+ * to fulfill b), we need to know the exact failing sectors, as
+ * we don't want to rewrite any more than the failed ones. thus,
+ * we need separate read requests for the failed bio
+ *
+ * if the following BUG_ON triggers, our validation request got
+ * merged. we need separate requests for our algorithm to work.
+ */
+ BUG_ON(failrec->in_validation);
+ failrec->in_validation = 1;
+ failrec->this_mirror = failed_mirror;
+ } else {
+ /*
+ * we're ready to fulfill a) and b) alongside. get a good copy
+ * of the failed sector and if we succeed, we have setup
+ * everything for repair_io_failure to do the rest for us.
+ */
+ if (failrec->in_validation) {
+ BUG_ON(failrec->this_mirror != failed_mirror);
+ failrec->in_validation = 0;
+ failrec->this_mirror = 0;
+ }
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
+ failrec->this_mirror++;
+ }
+
+ if (failrec->this_mirror > num_copies) {
+ pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ return 1;
+}
+
+
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ struct page *page, int pg_offset, int icsum,
+ bio_end_io_t *endio_func, void *data)
+{
+ struct bio *bio;
+ struct btrfs_io_bio *btrfs_failed_bio;
+ struct btrfs_io_bio *btrfs_bio;
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio)
+ return NULL;
+
+ bio->bi_end_io = endio_func;
+ bio->bi_iter.bi_sector = failrec->logical >> 9;
+ bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ bio->bi_iter.bi_size = 0;
+ bio->bi_private = data;
+
+ btrfs_failed_bio = btrfs_io_bio(failed_bio);
+ if (btrfs_failed_bio->csum) {
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+ btrfs_bio = btrfs_io_bio(bio);
+ btrfs_bio->csum = btrfs_bio->csum_inline;
+ icsum *= csum_size;
+ memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
+ csum_size);
+ }
+
+ bio_add_page(bio, page, failrec->len, pg_offset);
+
+ return bio;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int failed_mirror)
+{
+ struct io_failure_record *failrec;
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct bio *bio;
+ int read_mode;
+ int ret;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+ if (ret)
+ return ret;
+
+ ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
+ if (!ret) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ if (failed_bio->bi_vcnt > 1)
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ else
+ read_mode = READ_SYNC;
+
+ phy_offset >>= inode->i_sb->s_blocksize_bits;
+ bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+ start - page_offset(page),
+ (int)phy_offset, failed_bio->bi_end_io,
+ NULL);
+ if (!bio) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
+ read_mode, failrec->this_mirror, failrec->in_validation);
+
+ ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
+ failrec->this_mirror,
+ failrec->bio_flags, 0);
+ if (ret) {
+ free_io_failure(inode, failrec);
+ bio_put(bio);
+ }
+
+ return ret;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+{
+ int uptodate = (err == 0);
+ struct extent_io_tree *tree;
+ int ret = 0;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ if (tree->ops && tree->ops->writepage_end_io_hook) {
+ ret = tree->ops->writepage_end_io_hook(page, start,
+ end, NULL, uptodate);
+ if (ret)
+ uptodate = 0;
+ }
+
+ if (!uptodate) {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ ret = ret < 0 ? ret : -EIO;
+ mapping_set_error(page->mapping, ret);
+ }
+ return 0;
+}
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_writepage(struct bio *bio, int err)
+{
+ struct bio_vec *bvec;
+ u64 start;
+ u64 end;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+
+ /* We always issue full-page reads, but if some block
+ * in a page fails to read, blk_update_request() will
+ * advance bv_offset and adjust bv_len to compensate.
+ * Print a warning for nonzero offsets, and an error
+ * if they don't add up to a full page. */
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
+ "partial page write in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else
+ btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
+ "incomplete page write in btrfs with offset %u and "
+ "length %u",
+ bvec->bv_offset, bvec->bv_len);
+ }
+
+ start = page_offset(page);
+ end = start + bvec->bv_offset + bvec->bv_len - 1;
+
+ if (end_extent_writepage(page, err, start, end))
+ continue;
+
+ end_page_writeback(page);
+ }
+
+ bio_put(bio);
+}
+
+static void
+endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
+ int uptodate)
+{
+ struct extent_state *cached = NULL;
+ u64 end = start + len - 1;
+
+ if (uptodate && tree->track_uptodate)
+ set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
+ unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_readpage(struct bio *bio, int err)
+{
+ struct bio_vec *bvec;
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct extent_io_tree *tree;
+ u64 offset = 0;
+ u64 start;
+ u64 end;
+ u64 len;
+ u64 extent_start = 0;
+ u64 extent_len = 0;
+ int mirror;
+ int ret;
+ int i;
+
+ if (err)
+ uptodate = 0;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+ struct inode *inode = page->mapping->host;
+
+ pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+ "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
+ io_bio->mirror_num);
+ tree = &BTRFS_I(inode)->io_tree;
+
+ /* We always issue full-page reads, but if some block
+ * in a page fails to read, blk_update_request() will
+ * advance bv_offset and adjust bv_len to compensate.
+ * Print a warning for nonzero offsets, and an error
+ * if they don't add up to a full page. */
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
+ "partial page read in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else
+ btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
+ "incomplete page read in btrfs with offset %u and "
+ "length %u",
+ bvec->bv_offset, bvec->bv_len);
+ }
+
+ start = page_offset(page);
+ end = start + bvec->bv_offset + bvec->bv_len - 1;
+ len = bvec->bv_len;
+
+ mirror = io_bio->mirror_num;
+ if (likely(uptodate && tree->ops &&
+ tree->ops->readpage_end_io_hook)) {
+ ret = tree->ops->readpage_end_io_hook(io_bio, offset,
+ page, start, end,
+ mirror);
+ if (ret)
+ uptodate = 0;
+ else
+ clean_io_failure(inode, start, page, 0);
+ }
+
+ if (likely(uptodate))
+ goto readpage_ok;
+
+ if (tree->ops && tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(page, mirror);
+ if (!ret && !err &&
+ test_bit(BIO_UPTODATE, &bio->bi_flags))
+ uptodate = 1;
+ } else {
+ /*
+ * The generic bio_readpage_error handles errors the
+ * following way: If possible, new read requests are
+ * created and submitted and will end up in
+ * end_bio_extent_readpage as well (if we're lucky, not
+ * in the !uptodate case). In that case it returns 0 and
+ * we just go on with the next page in our bio. If it
+ * can't handle the error it will return -EIO and we
+ * remain responsible for that page.
+ */
+ ret = bio_readpage_error(bio, offset, page, start, end,
+ mirror);
+ if (ret == 0) {
+ uptodate =
+ test_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (err)
+ uptodate = 0;
+ offset += len;
+ continue;
+ }
+ }
+readpage_ok:
+ if (likely(uptodate)) {
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned off;
+
+ /* Zero out the end if this page straddles i_size */
+ off = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index == end_index && off)
+ zero_user_segment(page, off, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ offset += len;
+
+ if (unlikely(!uptodate)) {
+ if (extent_len) {
+ endio_readpage_release_extent(tree,
+ extent_start,
+ extent_len, 1);
+ extent_start = 0;
+ extent_len = 0;
+ }
+ endio_readpage_release_extent(tree, start,
+ end - start + 1, 0);
+ } else if (!extent_len) {
+ extent_start = start;
+ extent_len = end + 1 - start;
+ } else if (extent_start + extent_len == start) {
+ extent_len += end + 1 - start;
+ } else {
+ endio_readpage_release_extent(tree, extent_start,
+ extent_len, uptodate);
+ extent_start = start;
+ extent_len = end + 1 - start;
+ }
+ }
+
+ if (extent_len)
+ endio_readpage_release_extent(tree, extent_start, extent_len,
+ uptodate);
+ if (io_bio->end_io)
+ io_bio->end_io(io_bio, err);
+ bio_put(bio);
+}
+
+/*
+ * this allocates from the btrfs_bioset. We're returning a bio right now
+ * but you can call btrfs_io_bio for the appropriate container_of magic
+ */
+struct bio *
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+ gfp_t gfp_flags)
+{
+ struct btrfs_io_bio *btrfs_bio;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
+
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2)) {
+ bio = bio_alloc_bioset(gfp_flags,
+ nr_vecs, btrfs_bioset);
+ }
+ }
+
+ if (bio) {
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = first_sector;
+ btrfs_bio = btrfs_io_bio(bio);
+ btrfs_bio->csum = NULL;
+ btrfs_bio->csum_allocated = NULL;
+ btrfs_bio->end_io = NULL;
+ }
+ return bio;
+}
+
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+ struct btrfs_io_bio *btrfs_bio;
+ struct bio *new;
+
+ new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+ if (new) {
+ btrfs_bio = btrfs_io_bio(new);
+ btrfs_bio->csum = NULL;
+ btrfs_bio->csum_allocated = NULL;
+ btrfs_bio->end_io = NULL;
+ }
+ return new;
+}
+
+/* this also allocates from the btrfs_bioset */
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+ struct btrfs_io_bio *btrfs_bio;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+ if (bio) {
+ btrfs_bio = btrfs_io_bio(bio);
+ btrfs_bio->csum = NULL;
+ btrfs_bio->csum_allocated = NULL;
+ btrfs_bio->end_io = NULL;
+ }
+ return bio;
+}
+
+
+static int __must_check submit_one_bio(int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ int ret = 0;
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct page *page = bvec->bv_page;
+ struct extent_io_tree *tree = bio->bi_private;
+ u64 start;
+
+ start = page_offset(page) + bvec->bv_offset;
+
+ bio->bi_private = NULL;
+
+ bio_get(bio);
+
+ if (tree->ops && tree->ops->submit_bio_hook)
+ ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+ mirror_num, bio_flags, start);
+ else
+ btrfsic_submit_bio(rw, bio);
+
+ if (bio_flagged(bio, BIO_EOPNOTSUPP))
+ ret = -EOPNOTSUPP;
+ bio_put(bio);
+ return ret;
+}
+
+static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
+ unsigned long offset, size_t size, struct bio *bio,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ if (tree->ops && tree->ops->merge_bio_hook)
+ ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
+ bio_flags);
+ BUG_ON(ret < 0);
+ return ret;
+
+}
+
+static int submit_extent_page(int rw, struct extent_io_tree *tree,
+ struct page *page, sector_t sector,
+ size_t size, unsigned long offset,
+ struct block_device *bdev,
+ struct bio **bio_ret,
+ unsigned long max_pages,
+ bio_end_io_t end_io_func,
+ int mirror_num,
+ unsigned long prev_bio_flags,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ struct bio *bio;
+ int nr;
+ int contig = 0;
+ int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+ int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+ size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
+
+ if (bio_ret && *bio_ret) {
+ bio = *bio_ret;
+ if (old_compressed)
+ contig = bio->bi_iter.bi_sector == sector;
+ else
+ contig = bio_end_sector(bio) == sector;
+
+ if (prev_bio_flags != bio_flags || !contig ||
+ merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
+ bio_add_page(bio, page, page_size, offset) < page_size) {
+ ret = submit_one_bio(rw, bio, mirror_num,
+ prev_bio_flags);
+ if (ret < 0) {
+ *bio_ret = NULL;
+ return ret;
+ }
+ bio = NULL;
+ } else {
+ return 0;
+ }
+ }
+ if (this_compressed)
+ nr = BIO_MAX_PAGES;
+ else
+ nr = bio_get_nr_vecs(bdev);
+
+ bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+ if (!bio)
+ return -ENOMEM;
+
+ bio_add_page(bio, page, page_size, offset);
+ bio->bi_end_io = end_io_func;
+ bio->bi_private = tree;
+
+ if (bio_ret)
+ *bio_ret = bio;
+ else
+ ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
+
+ return ret;
+}
+
+static void attach_extent_buffer_page(struct extent_buffer *eb,
+ struct page *page)
+{
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, (unsigned long)eb);
+ } else {
+ WARN_ON(page->private != (unsigned long)eb);
+ }
+}
+
+void set_page_extent_mapped(struct page *page)
+{
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, EXTENT_PAGE_PRIVATE);
+ }
+}
+
+static struct extent_map *
+__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
+ u64 start, u64 len, get_extent_t *get_extent,
+ struct extent_map **em_cached)
+{
+ struct extent_map *em;
+
+ if (em_cached && *em_cached) {
+ em = *em_cached;
+ if (extent_map_in_tree(em) && start >= em->start &&
+ start < extent_map_end(em)) {
+ atomic_inc(&em->refs);
+ return em;
+ }
+
+ free_extent_map(em);
+ *em_cached = NULL;
+ }
+
+ em = get_extent(inode, page, pg_offset, start, len, 0);
+ if (em_cached && !IS_ERR_OR_NULL(em)) {
+ BUG_ON(*em_cached);
+ atomic_inc(&em->refs);
+ *em_cached = em;
+ }
+ return em;
+}
+/*
+ * basic readpage implementation. Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
+ */
+static int __do_readpage(struct extent_io_tree *tree,
+ struct page *page,
+ get_extent_t *get_extent,
+ struct extent_map **em_cached,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags, int rw)
+{
+ struct inode *inode = page->mapping->host;
+ u64 start = page_offset(page);
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 end;
+ u64 cur = start;
+ u64 extent_offset;
+ u64 last_byte = i_size_read(inode);
+ u64 block_start;
+ u64 cur_end;
+ sector_t sector;
+ struct extent_map *em;
+ struct block_device *bdev;
+ int ret;
+ int nr = 0;
+ int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+ size_t pg_offset = 0;
+ size_t iosize;
+ size_t disk_io_size;
+ size_t blocksize = inode->i_sb->s_blocksize;
+ unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+
+ set_page_extent_mapped(page);
+
+ end = page_end;
+ if (!PageUptodate(page)) {
+ if (cleancache_get_page(page) == 0) {
+ BUG_ON(blocksize != PAGE_SIZE);
+ unlock_extent(tree, start, end);
+ goto out;
+ }
+ }
+
+ if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+ char *userpage;
+ size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+ if (zero_offset) {
+ iosize = PAGE_CACHE_SIZE - zero_offset;
+ userpage = kmap_atomic(page);
+ memset(userpage + zero_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage);
+ }
+ }
+ while (cur <= end) {
+ unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+
+ if (cur >= last_byte) {
+ char *userpage;
+ struct extent_state *cached = NULL;
+
+ iosize = PAGE_CACHE_SIZE - pg_offset;
+ userpage = kmap_atomic(page);
+ memset(userpage + pg_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage);
+ set_extent_uptodate(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
+ if (!parent_locked)
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
+ break;
+ }
+ em = __get_extent_map(inode, page, pg_offset, cur,
+ end - cur + 1, get_extent, em_cached);
+ if (IS_ERR_OR_NULL(em)) {
+ SetPageError(page);
+ if (!parent_locked)
+ unlock_extent(tree, cur, end);
+ break;
+ }
+ extent_offset = cur - em->start;
+ BUG_ON(extent_map_end(em) <= cur);
+ BUG_ON(end < cur);
+
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ this_bio_flag |= EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&this_bio_flag,
+ em->compress_type);
+ }
+
+ iosize = min(extent_map_end(em) - cur, end - cur + 1);
+ cur_end = min(extent_map_end(em) - 1, end);
+ iosize = ALIGN(iosize, blocksize);
+ if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+ disk_io_size = em->block_len;
+ sector = em->block_start >> 9;
+ } else {
+ sector = (em->block_start + extent_offset) >> 9;
+ disk_io_size = iosize;
+ }
+ bdev = em->bdev;
+ block_start = em->block_start;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ block_start = EXTENT_MAP_HOLE;
+ free_extent_map(em);
+ em = NULL;
+
+ /* we've found a hole, just zero and go on */
+ if (block_start == EXTENT_MAP_HOLE) {
+ char *userpage;
+ struct extent_state *cached = NULL;
+
+ userpage = kmap_atomic(page);
+ memset(userpage + pg_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage);
+
+ set_extent_uptodate(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
+ cur = cur + iosize;
+ pg_offset += iosize;
+ continue;
+ }
+ /* the get_extent function already copied into the page */
+ if (test_range_bit(tree, cur, cur_end,
+ EXTENT_UPTODATE, 1, NULL)) {
+ check_page_uptodate(tree, page);
+ if (!parent_locked)
+ unlock_extent(tree, cur, cur + iosize - 1);
+ cur = cur + iosize;
+ pg_offset += iosize;
+ continue;
+ }
+ /* we have an inline extent but it didn't get marked up
+ * to date. Error out
+ */
+ if (block_start == EXTENT_MAP_INLINE) {
+ SetPageError(page);
+ if (!parent_locked)
+ unlock_extent(tree, cur, cur + iosize - 1);
+ cur = cur + iosize;
+ pg_offset += iosize;
+ continue;
+ }
+
+ pnr -= page->index;
+ ret = submit_extent_page(rw, tree, page,
+ sector, disk_io_size, pg_offset,
+ bdev, bio, pnr,
+ end_bio_extent_readpage, mirror_num,
+ *bio_flags,
+ this_bio_flag);
+ if (!ret) {
+ nr++;
+ *bio_flags = this_bio_flag;
+ } else {
+ SetPageError(page);
+ if (!parent_locked)
+ unlock_extent(tree, cur, cur + iosize - 1);
+ }
+ cur = cur + iosize;
+ pg_offset += iosize;
+ }
+out:
+ if (!nr) {
+ if (!PageError(page))
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ return 0;
+}
+
+static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
+ struct page *pages[], int nr_pages,
+ u64 start, u64 end,
+ get_extent_t *get_extent,
+ struct extent_map **em_cached,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags, int rw)
+{
+ struct inode *inode;
+ struct btrfs_ordered_extent *ordered;
+ int index;
+
+ inode = pages[0]->mapping->host;
+ while (1) {
+ lock_extent(tree, start, end);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ end - start + 1);
+ if (!ordered)
+ break;
+ unlock_extent(tree, start, end);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ for (index = 0; index < nr_pages; index++) {
+ __do_readpage(tree, pages[index], get_extent, em_cached, bio,
+ mirror_num, bio_flags, rw);
+ page_cache_release(pages[index]);
+ }
+}
+
+static void __extent_readpages(struct extent_io_tree *tree,
+ struct page *pages[],
+ int nr_pages, get_extent_t *get_extent,
+ struct extent_map **em_cached,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags, int rw)
+{
+ u64 start = 0;
+ u64 end = 0;
+ u64 page_start;
+ int index;
+ int first_index = 0;
+
+ for (index = 0; index < nr_pages; index++) {
+ page_start = page_offset(pages[index]);
+ if (!end) {
+ start = page_start;
+ end = start + PAGE_CACHE_SIZE - 1;
+ first_index = index;
+ } else if (end + 1 == page_start) {
+ end += PAGE_CACHE_SIZE;
+ } else {
+ __do_contiguous_readpages(tree, &pages[first_index],
+ index - first_index, start,
+ end, get_extent, em_cached,
+ bio, mirror_num, bio_flags,
+ rw);
+ start = page_start;
+ end = start + PAGE_CACHE_SIZE - 1;
+ first_index = index;
+ }
+ }
+
+ if (end)
+ __do_contiguous_readpages(tree, &pages[first_index],
+ index - first_index, start,
+ end, get_extent, em_cached, bio,
+ mirror_num, bio_flags, rw);
+}
+
+static int __extent_read_full_page(struct extent_io_tree *tree,
+ struct page *page,
+ get_extent_t *get_extent,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags, int rw)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_ordered_extent *ordered;
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ int ret;
+
+ while (1) {
+ lock_extent(tree, start, end);
+ ordered = btrfs_lookup_ordered_extent(inode, start);
+ if (!ordered)
+ break;
+ unlock_extent(tree, start, end);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
+ bio_flags, rw);
+ return ret;
+}
+
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent, int mirror_num)
+{
+ struct bio *bio = NULL;
+ unsigned long bio_flags = 0;
+ int ret;
+
+ ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
+ &bio_flags, READ);
+ if (bio)
+ ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
+ return ret;
+}
+
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent, int mirror_num)
+{
+ struct bio *bio = NULL;
+ unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
+ int ret;
+
+ ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
+ &bio_flags, READ);
+ if (bio)
+ ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
+ return ret;
+}
+
+static noinline void update_nr_written(struct page *page,
+ struct writeback_control *wbc,
+ unsigned long nr_written)
+{
+ wbc->nr_to_write -= nr_written;
+ if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+ wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+ page->mapping->writeback_index = page->index + nr_written;
+}
+
+/*
+ * helper for __extent_writepage, doing all of the delayed allocation setup.
+ *
+ * This returns 1 if our fill_delalloc function did all the work required
+ * to write the page (copy into inline extent). In this case the IO has
+ * been started and the page is already unlocked.
+ *
+ * This returns 0 if all went well (page still locked)
+ * This returns < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int writepage_delalloc(struct inode *inode,
+ struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ u64 delalloc_start,
+ unsigned long *nr_written)
+{
+ struct extent_io_tree *tree = epd->tree;
+ u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+ u64 nr_delalloc;
+ u64 delalloc_to_write = 0;
+ u64 delalloc_end = 0;
+ int ret;
+ int page_started = 0;
+
+ if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
+ return 0;
+
+ while (delalloc_end < page_end) {
+ nr_delalloc = find_lock_delalloc_range(inode, tree,
+ page,
+ &delalloc_start,
+ &delalloc_end,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (nr_delalloc == 0) {
+ delalloc_start = delalloc_end + 1;
+ continue;
+ }
+ ret = tree->ops->fill_delalloc(inode, page,
+ delalloc_start,
+ delalloc_end,
+ &page_started,
+ nr_written);
+ /* File system has been set read-only */
+ if (ret) {
+ SetPageError(page);
+ /* fill_delalloc should be return < 0 for error
+ * but just in case, we use > 0 here meaning the
+ * IO is started, so we don't want to return > 0
+ * unless things are going well.
+ */
+ ret = ret < 0 ? ret : -EIO;
+ goto done;
+ }
+ /*
+ * delalloc_end is already one less than the total
+ * length, so we don't subtract one from
+ * PAGE_CACHE_SIZE
+ */
+ delalloc_to_write += (delalloc_end - delalloc_start +
+ PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+ delalloc_start = delalloc_end + 1;
+ }
+ if (wbc->nr_to_write < delalloc_to_write) {
+ int thresh = 8192;
+
+ if (delalloc_to_write < thresh * 2)
+ thresh = delalloc_to_write;
+ wbc->nr_to_write = min_t(u64, delalloc_to_write,
+ thresh);
+ }
+
+ /* did the fill delalloc function already unlock and start
+ * the IO?
+ */
+ if (page_started) {
+ /*
+ * we've unlocked the page, so we can't update
+ * the mapping's writeback index, just update
+ * nr_to_write.
+ */
+ wbc->nr_to_write -= *nr_written;
+ return 1;
+ }
+
+ ret = 0;
+
+done:
+ return ret;
+}
+
+/*
+ * helper for __extent_writepage. This calls the writepage start hooks,
+ * and does the loop to map the page into extents and bios.
+ *
+ * We return 1 if the IO is started and the page is unlocked,
+ * 0 if all went well (page still locked)
+ * < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+ struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ loff_t i_size,
+ unsigned long nr_written,
+ int write_flags, int *nr_ret)
+{
+ struct extent_io_tree *tree = epd->tree;
+ u64 start = page_offset(page);
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 end;
+ u64 cur = start;
+ u64 extent_offset;
+ u64 block_start;
+ u64 iosize;
+ sector_t sector;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em;
+ struct block_device *bdev;
+ size_t pg_offset = 0;
+ size_t blocksize;
+ int ret = 0;
+ int nr = 0;
+ bool compressed;
+
+ if (tree->ops && tree->ops->writepage_start_hook) {
+ ret = tree->ops->writepage_start_hook(page, start,
+ page_end);
+ if (ret) {
+ /* Fixup worker will requeue */
+ if (ret == -EBUSY)
+ wbc->pages_skipped++;
+ else
+ redirty_page_for_writepage(wbc, page);
+
+ update_nr_written(page, wbc, nr_written);
+ unlock_page(page);
+ ret = 1;
+ goto done_unlocked;
+ }
+ }
+
+ /*
+ * we don't want to touch the inode after unlocking the page,
+ * so we update the mapping writeback index now
+ */
+ update_nr_written(page, wbc, nr_written + 1);
+
+ end = page_end;
+ if (i_size <= start) {
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, start,
+ page_end, NULL, 1);
+ goto done;
+ }
+
+ blocksize = inode->i_sb->s_blocksize;
+
+ while (cur <= end) {
+ u64 em_end;
+ if (cur >= i_size) {
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, cur,
+ page_end, NULL, 1);
+ break;
+ }
+ em = epd->get_extent(inode, page, pg_offset, cur,
+ end - cur + 1, 1);
+ if (IS_ERR_OR_NULL(em)) {
+ SetPageError(page);
+ ret = PTR_ERR_OR_ZERO(em);
+ break;
+ }
+
+ extent_offset = cur - em->start;
+ em_end = extent_map_end(em);
+ BUG_ON(em_end <= cur);
+ BUG_ON(end < cur);
+ iosize = min(em_end - cur, end - cur + 1);
+ iosize = ALIGN(iosize, blocksize);
+ sector = (em->block_start + extent_offset) >> 9;
+ bdev = em->bdev;
+ block_start = em->block_start;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ free_extent_map(em);
+ em = NULL;
+
+ /*
+ * compressed and inline extents are written through other
+ * paths in the FS
+ */
+ if (compressed || block_start == EXTENT_MAP_HOLE ||
+ block_start == EXTENT_MAP_INLINE) {
+ /*
+ * end_io notification does not happen here for
+ * compressed extents
+ */
+ if (!compressed && tree->ops &&
+ tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, cur,
+ cur + iosize - 1,
+ NULL, 1);
+ else if (compressed) {
+ /* we don't want to end_page_writeback on
+ * a compressed extent. this happens
+ * elsewhere
+ */
+ nr++;
+ }
+
+ cur += iosize;
+ pg_offset += iosize;
+ continue;
+ }
+
+ if (tree->ops && tree->ops->writepage_io_hook) {
+ ret = tree->ops->writepage_io_hook(page, cur,
+ cur + iosize - 1);
+ } else {
+ ret = 0;
+ }
+ if (ret) {
+ SetPageError(page);
+ } else {
+ unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
+
+ set_range_writeback(tree, cur, cur + iosize - 1);
+ if (!PageWriteback(page)) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "page %lu not writeback, cur %llu end %llu",
+ page->index, cur, end);
+ }
+
+ ret = submit_extent_page(write_flags, tree, page,
+ sector, iosize, pg_offset,
+ bdev, &epd->bio, max_nr,
+ end_bio_extent_writepage,
+ 0, 0, 0);
+ if (ret)
+ SetPageError(page);
+ }
+ cur = cur + iosize;
+ pg_offset += iosize;
+ nr++;
+ }
+done:
+ *nr_ret = nr;
+
+done_unlocked:
+
+ /* drop our reference on any cached states */
+ free_extent_state(cached_state);
+ return ret;
+}
+
+/*
+ * the writepage semantics are similar to regular writepage. extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback. Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct inode *inode = page->mapping->host;
+ struct extent_page_data *epd = data;
+ u64 start = page_offset(page);
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ int ret;
+ int nr = 0;
+ size_t pg_offset = 0;
+ loff_t i_size = i_size_read(inode);
+ unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+ int write_flags;
+ unsigned long nr_written = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ write_flags = WRITE_SYNC;
+ else
+ write_flags = WRITE;
+
+ trace___extent_writepage(page, inode, wbc);
+
+ WARN_ON(!PageLocked(page));
+
+ ClearPageError(page);
+
+ pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+ if (page->index > end_index ||
+ (page->index == end_index && !pg_offset)) {
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (page->index == end_index) {
+ char *userpage;
+
+ userpage = kmap_atomic(page);
+ memset(userpage + pg_offset, 0,
+ PAGE_CACHE_SIZE - pg_offset);
+ kunmap_atomic(userpage);
+ flush_dcache_page(page);
+ }
+
+ pg_offset = 0;
+
+ set_page_extent_mapped(page);
+
+ ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
+ if (ret == 1)
+ goto done_unlocked;
+ if (ret)
+ goto done;
+
+ ret = __extent_writepage_io(inode, page, wbc, epd,
+ i_size, nr_written, write_flags, &nr);
+ if (ret == 1)
+ goto done_unlocked;
+
+done:
+ if (nr == 0) {
+ /* make sure the mapping tag for page dirty gets cleared */
+ set_page_writeback(page);
+ end_page_writeback(page);
+ }
+ if (PageError(page)) {
+ ret = ret < 0 ? ret : -EIO;
+ end_extent_writepage(page, ret, start, page_end);
+ }
+ unlock_page(page);
+ return ret;
+
+done_unlocked:
+ return 0;
+}
+
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static noinline_for_stack int
+lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct extent_page_data *epd)
+{
+ unsigned long i, num_pages;
+ int flush = 0;
+ int ret = 0;
+
+ if (!btrfs_try_tree_write_lock(eb)) {
+ flush = 1;
+ flush_write_bio(epd);
+ btrfs_tree_lock(eb);
+ }
+
+ if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+ btrfs_tree_unlock(eb);
+ if (!epd->sync_io)
+ return 0;
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
+ while (1) {
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_lock(eb);
+ if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+ break;
+ btrfs_tree_unlock(eb);
+ }
+ }
+
+ /*
+ * We need to do this to prevent races in people who check if the eb is
+ * under IO since we can end up having no IO bits set for a short period
+ * of time.
+ */
+ spin_lock(&eb->refs_lock);
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ spin_unlock(&eb->refs_lock);
+ btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+ __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+ -eb->len,
+ fs_info->dirty_metadata_batch);
+ ret = 1;
+ } else {
+ spin_unlock(&eb->refs_lock);
+ }
+
+ btrfs_tree_unlock(eb);
+
+ if (!ret)
+ return ret;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ if (!trylock_page(p)) {
+ if (!flush) {
+ flush_write_bio(epd);
+ flush = 1;
+ }
+ lock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void set_btree_ioerr(struct page *page)
+{
+ struct extent_buffer *eb = (struct extent_buffer *)page->private;
+ struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
+
+ SetPageError(page);
+ if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+ return;
+
+ /*
+ * If writeback for a btree extent that doesn't belong to a log tree
+ * failed, increment the counter transaction->eb_write_errors.
+ * We do this because while the transaction is running and before it's
+ * committing (when we call filemap_fdata[write|wait]_range against
+ * the btree inode), we might have
+ * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+ * returns an error or an error happens during writeback, when we're
+ * committing the transaction we wouldn't know about it, since the pages
+ * can be no longer dirty nor marked anymore for writeback (if a
+ * subsequent modification to the extent buffer didn't happen before the
+ * transaction commit), which makes filemap_fdata[write|wait]_range not
+ * able to find the pages tagged with SetPageError at transaction
+ * commit time. So if this happens we must abort the transaction,
+ * otherwise we commit a super block with btree roots that point to
+ * btree nodes/leafs whose content on disk is invalid - either garbage
+ * or the content of some node/leaf from a past generation that got
+ * cowed or deleted and is no longer valid.
+ *
+ * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
+ * not be enough - we need to distinguish between log tree extents vs
+ * non-log tree extents, and the next filemap_fdatawait_range() call
+ * will catch and clear such errors in the mapping - and that call might
+ * be from a log sync and not from a transaction commit. Also, checking
+ * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
+ * not done and would not be reliable - the eb might have been released
+ * from memory and reading it back again means that flag would not be
+ * set (since it's a runtime flag, not persisted on disk).
+ *
+ * Using the flags below in the btree inode also makes us achieve the
+ * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
+ * writeback for all dirty pages and before filemap_fdatawait_range()
+ * is called, the writeback for all dirty pages had already finished
+ * with errors - because we were not using AS_EIO/AS_ENOSPC,
+ * filemap_fdatawait_range() would return success, as it could not know
+ * that writeback errors happened (the pages were no longer tagged for
+ * writeback).
+ */
+ switch (eb->log_index) {
+ case -1:
+ set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
+ break;
+ case 0:
+ set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+ break;
+ case 1:
+ set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+ break;
+ default:
+ BUG(); /* unexpected, logic error */
+ }
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+ struct bio_vec *bvec;
+ struct extent_buffer *eb;
+ int i, done;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ done = atomic_dec_and_test(&eb->io_pages);
+
+ if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+ ClearPageUptodate(page);
+ set_btree_ioerr(page);
+ }
+
+ end_page_writeback(page);
+
+ if (!done)
+ continue;
+
+ end_extent_buffer_writeback(eb);
+ }
+
+ bio_put(bio);
+}
+
+static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+ struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+ u64 offset = eb->start;
+ unsigned long i, num_pages;
+ unsigned long bio_flags = 0;
+ int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
+ int ret = 0;
+
+ clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ atomic_set(&eb->io_pages, num_pages);
+ if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+ bio_flags = EXTENT_BIO_TREE_LOG;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ clear_page_dirty_for_io(p);
+ set_page_writeback(p);
+ ret = submit_extent_page(rw, tree, p, offset >> 9,
+ PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+ -1, end_bio_extent_buffer_writepage,
+ 0, epd->bio_flags, bio_flags);
+ epd->bio_flags = bio_flags;
+ if (ret) {
+ set_btree_ioerr(p);
+ end_page_writeback(p);
+ if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ ret = -EIO;
+ break;
+ }
+ offset += PAGE_CACHE_SIZE;
+ update_nr_written(p, wbc, 1);
+ unlock_page(p);
+ }
+
+ if (unlikely(ret)) {
+ for (; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+ clear_page_dirty_for_io(p);
+ unlock_page(p);
+ }
+ }
+
+ return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+ struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
+ };
+ int ret = 0;
+ int done = 0;
+ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int tag;
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ scanned = 1;
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
+ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (!PagePrivate(page))
+ continue;
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ break;
+ }
+
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ continue;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON
+ * but no sense in crashing the users box for something
+ * we can survive anyway.
+ */
+ if (WARN_ON(!eb)) {
+ spin_unlock(&mapping->private_lock);
+ continue;
+ }
+
+ if (eb == prev_eb) {
+ spin_unlock(&mapping->private_lock);
+ continue;
+ }
+
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
+ continue;
+
+ prev_eb = eb;
+ ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+ if (!ret) {
+ free_extent_buffer(eb);
+ continue;
+ }
+
+ ret = write_one_eb(eb, fs_info, wbc, &epd);
+ if (ret) {
+ done = 1;
+ free_extent_buffer(eb);
+ break;
+ }
+ free_extent_buffer(eb);
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ flush_write_bio(&epd);
+ return ret;
+}
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them. If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+static int extent_write_cache_pages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ struct writeback_control *wbc,
+ writepage_t writepage, void *data,
+ void (*flush_fn)(void *))
+{
+ struct inode *inode = mapping->host;
+ int ret = 0;
+ int done = 0;
+ int err = 0;
+ int nr_to_write_done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int tag;
+
+ /*
+ * We have to hold onto the inode so that ordered extents can do their
+ * work when the IO finishes. The alternative to this is failing to add
+ * an ordered extent if the igrab() fails there and that is a huge pain
+ * to deal with, so instead just hold onto the inode throughout the
+ * writepages operation. If it fails here we are freeing up the inode
+ * anyway and we'd rather not waste our time writing out stuff that is
+ * going to be truncated anyway.
+ */
+ if (!igrab(inode))
+ return 0;
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ scanned = 1;
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
+ while (!done && !nr_to_write_done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point we hold neither mapping->tree_lock nor
+ * lock on the page itself: the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or even
+ * swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
+ }
+
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ unlock_page(page);
+ continue;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (PageWriteback(page))
+ flush_fn(data);
+ wait_on_page_writeback(page);
+ }
+
+ if (PageWriteback(page) ||
+ !clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ret = (*writepage)(page, wbc, data);
+
+ if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+ unlock_page(page);
+ ret = 0;
+ }
+ if (!err && ret < 0)
+ err = ret;
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done && !err) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ btrfs_add_delayed_iput(inode);
+ return err;
+}
+
+static void flush_epd_write_bio(struct extent_page_data *epd)
+{
+ if (epd->bio) {
+ int rw = WRITE;
+ int ret;
+
+ if (epd->sync_io)
+ rw = WRITE_SYNC;
+
+ ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
+ BUG_ON(ret < 0); /* -ENOMEM */
+ epd->bio = NULL;
+ }
+}
+
+static noinline void flush_write_bio(void *data)
+{
+ struct extent_page_data *epd = data;
+ flush_epd_write_bio(epd);
+}
+
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc)
+{
+ int ret;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
+ };
+
+ ret = __extent_writepage(page, wbc, &epd);
+
+ flush_epd_write_bio(&epd);
+ return ret;
+}
+
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+ u64 start, u64 end, get_extent_t *get_extent,
+ int mode)
+{
+ int ret = 0;
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ .extent_locked = 1,
+ .sync_io = mode == WB_SYNC_ALL,
+ .bio_flags = 0,
+ };
+ struct writeback_control wbc_writepages = {
+ .sync_mode = mode,
+ .nr_to_write = nr_pages * 2,
+ .range_start = start,
+ .range_end = end + 1,
+ };
+
+ while (start <= end) {
+ page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ if (clear_page_dirty_for_io(page))
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ else {
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, start,
+ start + PAGE_CACHE_SIZE - 1,
+ NULL, 1);
+ unlock_page(page);
+ }
+ page_cache_release(page);
+ start += PAGE_CACHE_SIZE;
+ }
+
+ flush_epd_write_bio(&epd);
+ return ret;
+}
+
+int extent_writepages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ .extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+ .bio_flags = 0,
+ };
+
+ ret = extent_write_cache_pages(tree, mapping, wbc,
+ __extent_writepage, &epd,
+ flush_write_bio);
+ flush_epd_write_bio(&epd);
+ return ret;
+}
+
+int extent_readpages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages,
+ get_extent_t get_extent)
+{
+ struct bio *bio = NULL;
+ unsigned page_idx;
+ unsigned long bio_flags = 0;
+ struct page *pagepool[16];
+ struct page *page;
+ struct extent_map *em_cached = NULL;
+ int nr = 0;
+
+ for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ page = list_entry(pages->prev, struct page, lru);
+
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping,
+ page->index, GFP_NOFS)) {
+ page_cache_release(page);
+ continue;
+ }
+
+ pagepool[nr++] = page;
+ if (nr < ARRAY_SIZE(pagepool))
+ continue;
+ __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+ &bio, 0, &bio_flags, READ);
+ nr = 0;
+ }
+ if (nr)
+ __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+ &bio, 0, &bio_flags, READ);
+
+ if (em_cached)
+ free_extent_map(em_cached);
+
+ BUG_ON(!list_empty(pages));
+ if (bio)
+ return submit_one_bio(READ, bio, 0, bio_flags);
+ return 0;
+}
+
+/*
+ * basic invalidatepage code, this waits on any locked or writeback
+ * ranges corresponding to the page, and then deletes any extent state
+ * records from the tree
+ */
+int extent_invalidatepage(struct extent_io_tree *tree,
+ struct page *page, unsigned long offset)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+
+ start += ALIGN(offset, blocksize);
+ if (start > end)
+ return 0;
+
+ lock_extent_bits(tree, start, end, 0, &cached_state);
+ wait_on_page_writeback(page);
+ clear_extent_bit(tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING,
+ 1, 1, &cached_state, GFP_NOFS);
+ return 0;
+}
+
+/*
+ * a helper for releasepage, this tests for areas of the page that
+ * are locked or under IO and drops the related state bits if it is safe
+ * to drop the page.
+ */
+static int try_release_extent_state(struct extent_map_tree *map,
+ struct extent_io_tree *tree,
+ struct page *page, gfp_t mask)
+{
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ int ret = 1;
+
+ if (test_range_bit(tree, start, end,
+ EXTENT_IOBITS, 0, NULL))
+ ret = 0;
+ else {
+ if ((mask & GFP_NOFS) == GFP_NOFS)
+ mask = GFP_NOFS;
+ /*
+ * at this point we can safely clear everything except the
+ * locked bit and the nodatasum bit
+ */
+ ret = clear_extent_bit(tree, start, end,
+ ~(EXTENT_LOCKED | EXTENT_NODATASUM),
+ 0, 0, NULL, mask);
+
+ /* if clear_extent_bit failed for enomem reasons,
+ * we can't allow the release to continue.
+ */
+ if (ret < 0)
+ ret = 0;
+ else
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * a helper for releasepage. As long as there are no locked extents
+ * in the range corresponding to the page, both state records and extent
+ * map records are removed
+ */
+int try_release_extent_mapping(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask)
+{
+ struct extent_map *em;
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+
+ if ((mask & __GFP_WAIT) &&
+ page->mapping->host->i_size > 16 * 1024 * 1024) {
+ u64 len;
+ while (start <= end) {
+ len = end - start + 1;
+ write_lock(&map->lock);
+ em = lookup_extent_mapping(map, start, len);
+ if (!em) {
+ write_unlock(&map->lock);
+ break;
+ }
+ if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+ em->start != start) {
+ write_unlock(&map->lock);
+ free_extent_map(em);
+ break;
+ }
+ if (!test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED | EXTENT_WRITEBACK,
+ 0, NULL)) {
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
+ }
+ start = extent_map_end(em);
+ write_unlock(&map->lock);
+
+ /* once for us */
+ free_extent_map(em);
+ }
+ }
+ return try_release_extent_state(map, tree, page, mask);
+}
+
+/*
+ * helper function for fiemap, which doesn't want to see any holes.
+ * This maps until we find something past 'last'
+ */
+static struct extent_map *get_extent_skip_holes(struct inode *inode,
+ u64 offset,
+ u64 last,
+ get_extent_t *get_extent)
+{
+ u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
+ struct extent_map *em;
+ u64 len;
+
+ if (offset >= last)
+ return NULL;
+
+ while (1) {
+ len = last - offset;
+ if (len == 0)
+ break;
+ len = ALIGN(len, sectorsize);
+ em = get_extent(inode, NULL, 0, offset, len, 0);
+ if (IS_ERR_OR_NULL(em))
+ return em;
+
+ /* if this isn't a hole return it */
+ if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
+ em->block_start != EXTENT_MAP_HOLE) {
+ return em;
+ }
+
+ /* this is a hole, advance to the next extent */
+ offset = extent_map_end(em);
+ free_extent_map(em);
+ if (offset >= last)
+ break;
+ }
+ return NULL;
+}
+
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len, get_extent_t *get_extent)
+{
+ int ret = 0;
+ u64 off = start;
+ u64 max = start + len;
+ u32 flags = 0;
+ u32 found_type;
+ u64 last;
+ u64 last_for_get_extent = 0;
+ u64 disko = 0;
+ u64 isize = i_size_read(inode);
+ struct btrfs_key found_key;
+ struct extent_map *em = NULL;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int end = 0;
+ u64 em_start = 0;
+ u64 em_len = 0;
+ u64 em_end = 0;
+
+ if (len == 0)
+ return -EINVAL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ start = round_down(start, BTRFS_I(inode)->root->sectorsize);
+ len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
+
+ /*
+ * lookup the last file extent. We're not using i_size here
+ * because there might be preallocation past i_size
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
+ if (ret < 0) {
+ btrfs_free_path(path);
+ return ret;
+ }
+ WARN_ON(!ret);
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+ found_type = found_key.type;
+
+ /* No extents, but there might be delalloc bits */
+ if (found_key.objectid != btrfs_ino(inode) ||
+ found_type != BTRFS_EXTENT_DATA_KEY) {
+ /* have to trust i_size as the end */
+ last = (u64)-1;
+ last_for_get_extent = isize;
+ } else {
+ /*
+ * remember the start of the last extent. There are a
+ * bunch of different factors that go into the length of the
+ * extent, so its much less complex to remember where it started
+ */
+ last = found_key.offset;
+ last_for_get_extent = last + 1;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * we might have some extents allocated but more delalloc past those
+ * extents. so, we trust isize unless the start of the last extent is
+ * beyond isize
+ */
+ if (last < isize) {
+ last = (u64)-1;
+ last_for_get_extent = isize;
+ }
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
+ &cached_state);
+
+ em = get_extent_skip_holes(inode, start, last_for_get_extent,
+ get_extent);
+ if (!em)
+ goto out;
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+
+ while (!end) {
+ u64 offset_in_extent = 0;
+
+ /* break if the extent we found is outside the range */
+ if (em->start >= max || extent_map_end(em) < off)
+ break;
+
+ /*
+ * get_extent may return an extent that starts before our
+ * requested range. We have to make sure the ranges
+ * we return to fiemap always move forward and don't
+ * overlap, so adjust the offsets here
+ */
+ em_start = max(em->start, off);
+
+ /*
+ * record the offset from the start of the extent
+ * for adjusting the disk offset below. Only do this if the
+ * extent isn't compressed since our in ram offset may be past
+ * what we have actually allocated on disk.
+ */
+ if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ offset_in_extent = em_start - em->start;
+ em_end = extent_map_end(em);
+ em_len = em_end - em_start;
+ disko = 0;
+ flags = 0;
+
+ /*
+ * bump off for our next call to get_extent
+ */
+ off = extent_map_end(em);
+ if (off >= max)
+ end = 1;
+
+ if (em->block_start == EXTENT_MAP_LAST_BYTE) {
+ end = 1;
+ flags |= FIEMAP_EXTENT_LAST;
+ } else if (em->block_start == EXTENT_MAP_INLINE) {
+ flags |= (FIEMAP_EXTENT_DATA_INLINE |
+ FIEMAP_EXTENT_NOT_ALIGNED);
+ } else if (em->block_start == EXTENT_MAP_DELALLOC) {
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ } else if (fieinfo->fi_extents_max) {
+ u64 bytenr = em->block_start -
+ (em->start - em->orig_start);
+
+ disko = em->block_start + offset_in_extent;
+
+ /*
+ * As btrfs supports shared space, this information
+ * can be exported to userspace tools via
+ * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
+ * then we're just getting a count and we can skip the
+ * lookup stuff.
+ */
+ ret = btrfs_check_shared(NULL, root->fs_info,
+ root->objectid,
+ btrfs_ino(inode), bytenr);
+ if (ret < 0)
+ goto out_free;
+ if (ret)
+ flags |= FIEMAP_EXTENT_SHARED;
+ ret = 0;
+ }
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ flags |= FIEMAP_EXTENT_ENCODED;
+
+ free_extent_map(em);
+ em = NULL;
+ if ((em_start >= last) || em_len == (u64)-1 ||
+ (last == (u64)-1 && isize <= em_end)) {
+ flags |= FIEMAP_EXTENT_LAST;
+ end = 1;
+ }
+
+ /* now scan forward to see if this is really the last extent. */
+ em = get_extent_skip_holes(inode, off, last_for_get_extent,
+ get_extent);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ if (!em) {
+ flags |= FIEMAP_EXTENT_LAST;
+ end = 1;
+ }
+ ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+ em_len, flags);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto out_free;
+ }
+ }
+out_free:
+ free_extent_map(em);
+out:
+ btrfs_free_path(path);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ &cached_state, GFP_NOFS);
+ return ret;
+}
+
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+ btrfs_leak_debug_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+}
+
+int extent_buffer_under_io(struct extent_buffer *eb)
+{
+ return (atomic_read(&eb->io_pages) ||
+ test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+ test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+}
+
+/*
+ * Helper for releasing extent buffer page.
+ */
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
+{
+ unsigned long index;
+ struct page *page;
+ int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+ BUG_ON(extent_buffer_under_io(eb));
+
+ index = num_extent_pages(eb->start, eb->len);
+ if (index == 0)
+ return;
+
+ do {
+ index--;
+ page = eb->pages[index];
+ if (!page)
+ continue;
+ if (mapped)
+ spin_lock(&page->mapping->private_lock);
+ /*
+ * We do this since we'll remove the pages after we've
+ * removed the eb from the radix tree, so we could race
+ * and have this page now attached to the new eb. So
+ * only clear page_private if it's still connected to
+ * this eb.
+ */
+ if (PagePrivate(page) &&
+ page->private == (unsigned long)eb) {
+ BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(PageDirty(page));
+ BUG_ON(PageWriteback(page));
+ /*
+ * We need to make sure we haven't be attached
+ * to a new eb.
+ */
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ /* One for the page private */
+ page_cache_release(page);
+ }
+
+ if (mapped)
+ spin_unlock(&page->mapping->private_lock);
+
+ /* One for when we alloced the page */
+ page_cache_release(page);
+ } while (index != 0);
+}
+
+/*
+ * Helper for releasing the extent buffer.
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+{
+ btrfs_release_extent_buffer_page(eb);
+ __free_extent_buffer(eb);
+}
+
+static struct extent_buffer *
+__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
+ unsigned long len)
+{
+ struct extent_buffer *eb = NULL;
+
+ eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
+ if (eb == NULL)
+ return NULL;
+ eb->start = start;
+ eb->len = len;
+ eb->fs_info = fs_info;
+ eb->bflags = 0;
+ rwlock_init(&eb->lock);
+ atomic_set(&eb->write_locks, 0);
+ atomic_set(&eb->read_locks, 0);
+ atomic_set(&eb->blocking_readers, 0);
+ atomic_set(&eb->blocking_writers, 0);
+ atomic_set(&eb->spinning_readers, 0);
+ atomic_set(&eb->spinning_writers, 0);
+ eb->lock_nested = 0;
+ init_waitqueue_head(&eb->write_lock_wq);
+ init_waitqueue_head(&eb->read_lock_wq);
+
+ btrfs_leak_debug_add(&eb->leak_list, &buffers);
+
+ spin_lock_init(&eb->refs_lock);
+ atomic_set(&eb->refs, 1);
+ atomic_set(&eb->io_pages, 0);
+
+ /*
+ * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
+ */
+ BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
+ > MAX_INLINE_EXTENT_BUFFER_SIZE);
+ BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
+
+ return eb;
+}
+
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+{
+ unsigned long i;
+ struct page *p;
+ struct extent_buffer *new;
+ unsigned long num_pages = num_extent_pages(src->start, src->len);
+
+ new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+ if (new == NULL)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ p = alloc_page(GFP_NOFS);
+ if (!p) {
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
+ attach_extent_buffer_page(new, p);
+ WARN_ON(PageDirty(p));
+ SetPageUptodate(p);
+ new->pages[i] = p;
+ }
+
+ copy_extent_buffer(new, src, 0, 0, src->len);
+ set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+ set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
+
+ return new;
+}
+
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ struct extent_buffer *eb;
+ unsigned long len;
+ unsigned long num_pages;
+ unsigned long i;
+
+ if (!fs_info) {
+ /*
+ * Called only from tests that don't always have a fs_info
+ * available, but we know that nodesize is 4096
+ */
+ len = 4096;
+ } else {
+ len = fs_info->tree_root->nodesize;
+ }
+ num_pages = num_extent_pages(0, len);
+
+ eb = __alloc_extent_buffer(fs_info, start, len);
+ if (!eb)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ eb->pages[i] = alloc_page(GFP_NOFS);
+ if (!eb->pages[i])
+ goto err;
+ }
+ set_extent_buffer_uptodate(eb);
+ btrfs_set_header_nritems(eb, 0);
+ set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+ return eb;
+err:
+ for (; i > 0; i--)
+ __free_page(eb->pages[i - 1]);
+ __free_extent_buffer(eb);
+ return NULL;
+}
+
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+ int refs;
+ /* the ref bit is tricky. We have to make sure it is set
+ * if we have the buffer dirty. Otherwise the
+ * code to free a buffer can end up dropping a dirty
+ * page
+ *
+ * Once the ref bit is set, it won't go away while the
+ * buffer is dirty or in writeback, and it also won't
+ * go away while we have the reference count on the
+ * eb bumped.
+ *
+ * We can't just set the ref bit without bumping the
+ * ref on the eb because free_extent_buffer might
+ * see the ref bit and try to clear it. If this happens
+ * free_extent_buffer might end up dropping our original
+ * ref by mistake and freeing the page before we are able
+ * to add one more ref.
+ *
+ * So bump the ref count first, then set the bit. If someone
+ * beat us to it, drop the ref we added.
+ */
+ refs = atomic_read(&eb->refs);
+ if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ return;
+
+ spin_lock(&eb->refs_lock);
+ if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_inc(&eb->refs);
+ spin_unlock(&eb->refs_lock);
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+ struct page *accessed)
+{
+ unsigned long num_pages, i;
+
+ check_buffer_tree_ref(eb);
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = eb->pages[i];
+
+ if (p != accessed)
+ mark_page_accessed(p);
+ }
+}
+
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ /*
+ * Lock our eb's refs_lock to avoid races with
+ * free_extent_buffer. When we get our eb it might be flagged
+ * with EXTENT_BUFFER_STALE and another task running
+ * free_extent_buffer might have seen that flag set,
+ * eb->refs == 2, that the buffer isn't under IO (dirty and
+ * writeback flags not set) and it's still in the tree (flag
+ * EXTENT_BUFFER_TREE_REF set), therefore being in the process
+ * of decrementing the extent buffer's reference count twice.
+ * So here we could race and increment the eb's reference count,
+ * clear its stale flag, mark it as dirty and drop our reference
+ * before the other task finishes executing free_extent_buffer,
+ * which would later result in an attempt to free an extent
+ * buffer that is dirty.
+ */
+ if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+ spin_lock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
+ }
+ mark_extent_buffer_accessed(eb, NULL);
+ return eb;
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ struct extent_buffer *eb, *exists = NULL;
+ int ret;
+
+ eb = find_extent_buffer(fs_info, start);
+ if (eb)
+ return eb;
+ eb = alloc_dummy_extent_buffer(fs_info, start);
+ if (!eb)
+ return NULL;
+ eb->fs_info = fs_info;
+again:
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto free_eb;
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ else
+ goto again;
+ }
+ check_buffer_tree_ref(eb);
+ set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+ /*
+ * We will free dummy extent buffer's if they come into
+ * free_extent_buffer with a ref count of 2, but if we are using this we
+ * want the buffers to stay in memory until we're done with them, so
+ * bump the ref count again.
+ */
+ atomic_inc(&eb->refs);
+ return eb;
+free_eb:
+ btrfs_release_extent_buffer(eb);
+ return exists;
+}
+#endif
+
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ unsigned long len = fs_info->tree_root->nodesize;
+ unsigned long num_pages = num_extent_pages(start, len);
+ unsigned long i;
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ struct extent_buffer *eb;
+ struct extent_buffer *exists = NULL;
+ struct page *p;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ int uptodate = 1;
+ int ret;
+
+ eb = find_extent_buffer(fs_info, start);
+ if (eb)
+ return eb;
+
+ eb = __alloc_extent_buffer(fs_info, start, len);
+ if (!eb)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++, index++) {
+ p = find_or_create_page(mapping, index, GFP_NOFS);
+ if (!p)
+ goto free_eb;
+
+ spin_lock(&mapping->private_lock);
+ if (PagePrivate(p)) {
+ /*
+ * We could have already allocated an eb for this page
+ * and attached one so lets see if we can get a ref on
+ * the existing eb, and if we can we know it's good and
+ * we can just return that one, else we know we can just
+ * overwrite page->private.
+ */
+ exists = (struct extent_buffer *)p->private;
+ if (atomic_inc_not_zero(&exists->refs)) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
+ page_cache_release(p);
+ mark_extent_buffer_accessed(exists, p);
+ goto free_eb;
+ }
+ exists = NULL;
+
+ /*
+ * Do this so attach doesn't complain and we need to
+ * drop the ref the old guy had.
+ */
+ ClearPagePrivate(p);
+ WARN_ON(PageDirty(p));
+ page_cache_release(p);
+ }
+ attach_extent_buffer_page(eb, p);
+ spin_unlock(&mapping->private_lock);
+ WARN_ON(PageDirty(p));
+ eb->pages[i] = p;
+ if (!PageUptodate(p))
+ uptodate = 0;
+
+ /*
+ * see below about how we avoid a nasty race with release page
+ * and why we unlock later
+ */
+ }
+ if (uptodate)
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+again:
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto free_eb;
+
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ else
+ goto again;
+ }
+ /* add one reference for the tree */
+ check_buffer_tree_ref(eb);
+ set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+ /*
+ * there is a race where release page may have
+ * tried to find this extent buffer in the radix
+ * but failed. It will tell the VM it is safe to
+ * reclaim the, and it will clear the page private bit.
+ * We must make sure to set the page private bit properly
+ * after the extent buffer is in the radix tree so
+ * it doesn't get lost
+ */
+ SetPageChecked(eb->pages[0]);
+ for (i = 1; i < num_pages; i++) {
+ p = eb->pages[i];
+ ClearPageChecked(p);
+ unlock_page(p);
+ }
+ unlock_page(eb->pages[0]);
+ return eb;
+
+free_eb:
+ WARN_ON(!atomic_dec_and_test(&eb->refs));
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i])
+ unlock_page(eb->pages[i]);
+ }
+
+ btrfs_release_extent_buffer(eb);
+ return exists;
+}
+
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+ struct extent_buffer *eb =
+ container_of(head, struct extent_buffer, rcu_head);
+
+ __free_extent_buffer(eb);
+}
+
+/* Expects to have eb->eb_lock already held */
+static int release_extent_buffer(struct extent_buffer *eb)
+{
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ if (atomic_dec_and_test(&eb->refs)) {
+ if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ spin_unlock(&eb->refs_lock);
+
+ spin_lock(&fs_info->buffer_lock);
+ radix_tree_delete(&fs_info->buffer_radix,
+ eb->start >> PAGE_CACHE_SHIFT);
+ spin_unlock(&fs_info->buffer_lock);
+ } else {
+ spin_unlock(&eb->refs_lock);
+ }
+
+ /* Should be safe to release our pages at this point */
+ btrfs_release_extent_buffer_page(eb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
+ __free_extent_buffer(eb);
+ return 1;
+ }
+#endif
+ call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+ return 1;
+ }
+ spin_unlock(&eb->refs_lock);
+
+ return 0;
+}
+
+void free_extent_buffer(struct extent_buffer *eb)
+{
+ int refs;
+ int old;
+ if (!eb)
+ return;
+
+ while (1) {
+ refs = atomic_read(&eb->refs);
+ if (refs <= 3)
+ break;
+ old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
+ if (old == refs)
+ return;
+ }
+
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+ !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ /*
+ * I know this is terrible, but it's temporary until we stop tracking
+ * the uptodate bits and such for the extent buffers.
+ */
+ release_extent_buffer(eb);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+ if (!eb)
+ return;
+
+ spin_lock(&eb->refs_lock);
+ set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+ if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+ atomic_dec(&eb->refs);
+ release_extent_buffer(eb);
+}
+
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
+{
+ unsigned long i;
+ unsigned long num_pages;
+ struct page *page;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+ if (!PageDirty(page))
+ continue;
+
+ lock_page(page);
+ WARN_ON(!PagePrivate(page));
+
+ clear_page_dirty_for_io(page);
+ spin_lock_irq(&page->mapping->tree_lock);
+ if (!PageDirty(page)) {
+ radix_tree_tag_clear(&page->mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ }
+ spin_unlock_irq(&page->mapping->tree_lock);
+ ClearPageError(page);
+ unlock_page(page);
+ }
+ WARN_ON(atomic_read(&eb->refs) == 0);
+}
+
+int set_extent_buffer_dirty(struct extent_buffer *eb)
+{
+ unsigned long i;
+ unsigned long num_pages;
+ int was_dirty = 0;
+
+ check_buffer_tree_ref(eb);
+
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
+ for (i = 0; i < num_pages; i++)
+ set_page_dirty(eb->pages[i]);
+ return was_dirty;
+}
+
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
+{
+ unsigned long i;
+ struct page *page;
+ unsigned long num_pages;
+
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+ if (page)
+ ClearPageUptodate(page);
+ }
+ return 0;
+}
+
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
+{
+ unsigned long i;
+ struct page *page;
+ unsigned long num_pages;
+
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ page = eb->pages[i];
+ SetPageUptodate(page);
+ }
+ return 0;
+}
+
+int extent_buffer_uptodate(struct extent_buffer *eb)
+{
+ return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+}
+
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+ struct extent_buffer *eb, u64 start, int wait,
+ get_extent_t *get_extent, int mirror_num)
+{
+ unsigned long i;
+ unsigned long start_i;
+ struct page *page;
+ int err;
+ int ret = 0;
+ int locked_pages = 0;
+ int all_uptodate = 1;
+ unsigned long num_pages;
+ unsigned long num_reads = 0;
+ struct bio *bio = NULL;
+ unsigned long bio_flags = 0;
+
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ return 0;
+
+ if (start) {
+ WARN_ON(start < eb->start);
+ start_i = (start >> PAGE_CACHE_SHIFT) -
+ (eb->start >> PAGE_CACHE_SHIFT);
+ } else {
+ start_i = 0;
+ }
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = start_i; i < num_pages; i++) {
+ page = eb->pages[i];
+ if (wait == WAIT_NONE) {
+ if (!trylock_page(page))
+ goto unlock_exit;
+ } else {
+ lock_page(page);
+ }
+ locked_pages++;
+ if (!PageUptodate(page)) {
+ num_reads++;
+ all_uptodate = 0;
+ }
+ }
+ if (all_uptodate) {
+ if (start_i == 0)
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ goto unlock_exit;
+ }
+
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = 0;
+ atomic_set(&eb->io_pages, num_reads);
+ for (i = start_i; i < num_pages; i++) {
+ page = eb->pages[i];
+ if (!PageUptodate(page)) {
+ ClearPageError(page);
+ err = __extent_read_full_page(tree, page,
+ get_extent, &bio,
+ mirror_num, &bio_flags,
+ READ | REQ_META);
+ if (err)
+ ret = err;
+ } else {
+ unlock_page(page);
+ }
+ }
+
+ if (bio) {
+ err = submit_one_bio(READ | REQ_META, bio, mirror_num,
+ bio_flags);
+ if (err)
+ return err;
+ }
+
+ if (ret || wait != WAIT_COMPLETE)
+ return ret;
+
+ for (i = start_i; i < num_pages; i++) {
+ page = eb->pages[i];
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ ret = -EIO;
+ }
+
+ return ret;
+
+unlock_exit:
+ i = start_i;
+ while (locked_pages > 0) {
+ page = eb->pages[i];
+ i++;
+ unlock_page(page);
+ locked_pages--;
+ }
+ return ret;
+}
+
+void read_extent_buffer(struct extent_buffer *eb, void *dstv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = eb->pages[i];
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+ kaddr = page_address(page);
+ memcpy(dst, kaddr + offset, cur);
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char __user *dst = (char __user *)dstv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ int ret = 0;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = eb->pages[i];
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+ kaddr = page_address(page);
+ if (copy_to_user(dst, kaddr + offset, cur)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+
+ return ret;
+}
+
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ unsigned long min_len, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len)
+{
+ size_t offset = start & (PAGE_CACHE_SIZE - 1);
+ char *kaddr;
+ struct page *p;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ unsigned long end_i = (start_offset + start + min_len - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (i != end_i)
+ return -EINVAL;
+
+ if (i == 0) {
+ offset = start_offset;
+ *map_start = 0;
+ } else {
+ offset = 0;
+ *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+ }
+
+ if (start + min_len > eb->len) {
+ WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+ "wanted %lu %lu\n",
+ eb->start, eb->len, start, min_len);
+ return -EINVAL;
+ }
+
+ p = eb->pages[i];
+ kaddr = page_address(p);
+ *map = kaddr + offset;
+ *map_len = PAGE_CACHE_SIZE - offset;
+ return 0;
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *ptr = (char *)ptrv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ int ret = 0;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = eb->pages[i];
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+
+ kaddr = page_address(page);
+ ret = memcmp(ptr, kaddr + offset, cur);
+ if (ret)
+ break;
+
+ ptr += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+ return ret;
+}
+
+void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *src = (char *)srcv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+
+ cur = min(len, PAGE_CACHE_SIZE - offset);
+ kaddr = page_address(page);
+ memcpy(kaddr + offset, src, cur);
+
+ src += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+
+ cur = min(len, PAGE_CACHE_SIZE - offset);
+ kaddr = page_address(page);
+ memset(kaddr + offset, c, cur);
+
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
+{
+ u64 dst_len = dst->len;
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(src->len != dst_len);
+
+ offset = (start_offset + dst_offset) &
+ (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = dst->pages[i];
+ WARN_ON(!PageUptodate(page));
+
+ cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+
+ kaddr = page_address(page);
+ read_extent_buffer(src, kaddr + offset, src_offset, cur);
+
+ src_offset += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
+{
+ unsigned long distance = (src > dst) ? src - dst : dst - src;
+ return distance < len;
+}
+
+static void copy_pages(struct page *dst_page, struct page *src_page,
+ unsigned long dst_off, unsigned long src_off,
+ unsigned long len)
+{
+ char *dst_kaddr = page_address(dst_page);
+ char *src_kaddr;
+ int must_memmove = 0;
+
+ if (dst_page != src_page) {
+ src_kaddr = page_address(src_page);
+ } else {
+ src_kaddr = dst_kaddr;
+ if (areas_overlap(src_off, dst_off, len))
+ must_memmove = 1;
+ }
+
+ if (must_memmove)
+ memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ else
+ memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+}
+
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len)
+{
+ size_t cur;
+ size_t dst_off_in_page;
+ size_t src_off_in_page;
+ size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long dst_i;
+ unsigned long src_i;
+
+ if (src_offset + len > dst->len) {
+ printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
+ "len %lu dst len %lu\n", src_offset, len, dst->len);
+ BUG_ON(1);
+ }
+ if (dst_offset + len > dst->len) {
+ printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
+ "len %lu dst len %lu\n", dst_offset, len, dst->len);
+ BUG_ON(1);
+ }
+
+ while (len > 0) {
+ dst_off_in_page = (start_offset + dst_offset) &
+ (PAGE_CACHE_SIZE - 1);
+ src_off_in_page = (start_offset + src_offset) &
+ (PAGE_CACHE_SIZE - 1);
+
+ dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+ src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+
+ cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+ src_off_in_page));
+ cur = min_t(unsigned long, cur,
+ (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
+ dst_off_in_page, src_off_in_page, cur);
+
+ src_offset += cur;
+ dst_offset += cur;
+ len -= cur;
+ }
+}
+
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len)
+{
+ size_t cur;
+ size_t dst_off_in_page;
+ size_t src_off_in_page;
+ unsigned long dst_end = dst_offset + len - 1;
+ unsigned long src_end = src_offset + len - 1;
+ size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long dst_i;
+ unsigned long src_i;
+
+ if (src_offset + len > dst->len) {
+ printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
+ "len %lu len %lu\n", src_offset, len, dst->len);
+ BUG_ON(1);
+ }
+ if (dst_offset + len > dst->len) {
+ printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
+ "len %lu len %lu\n", dst_offset, len, dst->len);
+ BUG_ON(1);
+ }
+ if (dst_offset < src_offset) {
+ memcpy_extent_buffer(dst, dst_offset, src_offset, len);
+ return;
+ }
+ while (len > 0) {
+ dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
+ src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+
+ dst_off_in_page = (start_offset + dst_end) &
+ (PAGE_CACHE_SIZE - 1);
+ src_off_in_page = (start_offset + src_end) &
+ (PAGE_CACHE_SIZE - 1);
+
+ cur = min_t(unsigned long, len, src_off_in_page + 1);
+ cur = min(cur, dst_off_in_page + 1);
+ copy_pages(dst->pages[dst_i], dst->pages[src_i],
+ dst_off_in_page - cur + 1,
+ src_off_in_page - cur + 1, cur);
+
+ dst_end -= cur;
+ src_end -= cur;
+ len -= cur;
+ }
+}
+
+int try_release_extent_buffer(struct page *page)
+{
+ struct extent_buffer *eb;
+
+ /*
+ * We need to make sure noboody is attaching this page to an eb right
+ * now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ return 1;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+
+ /*
+ * This is a little awful but should be ok, we need to make sure that
+ * the eb doesn't disappear out from under us while we're looking at
+ * this page.
+ */
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return 0;
+ }
+ spin_unlock(&page->mapping->private_lock);
+
+ /*
+ * If tree ref isn't set then we know the ref on this eb is a real ref,
+ * so just return, this page will likely be freed soon anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ return 0;
+ }
+
+ return release_extent_buffer(eb);
+}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000..c668f3689
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,382 @@
+#ifndef __EXTENTIO__
+#define __EXTENTIO__
+
+#include <linux/rbtree.h>
+
+/* bits for the extent state */
+#define EXTENT_DIRTY (1U << 0)
+#define EXTENT_WRITEBACK (1U << 1)
+#define EXTENT_UPTODATE (1U << 2)
+#define EXTENT_LOCKED (1U << 3)
+#define EXTENT_NEW (1U << 4)
+#define EXTENT_DELALLOC (1U << 5)
+#define EXTENT_DEFRAG (1U << 6)
+#define EXTENT_BOUNDARY (1U << 9)
+#define EXTENT_NODATASUM (1U << 10)
+#define EXTENT_DO_ACCOUNTING (1U << 11)
+#define EXTENT_FIRST_DELALLOC (1U << 12)
+#define EXTENT_NEED_WAIT (1U << 13)
+#define EXTENT_DAMAGED (1U << 14)
+#define EXTENT_NORESERVE (1U << 15)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+
+/*
+ * flags for bio submission. The high bits indicate the compression
+ * type for this bio
+ */
+#define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
+#define EXTENT_BIO_PARENT_LOCKED 4
+#define EXTENT_BIO_FLAG_SHIFT 16
+
+/* these are bit numbers for test/set bit */
+#define EXTENT_BUFFER_UPTODATE 0
+#define EXTENT_BUFFER_DIRTY 2
+#define EXTENT_BUFFER_CORRUPT 3
+#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
+#define EXTENT_BUFFER_TREE_REF 5
+#define EXTENT_BUFFER_STALE 6
+#define EXTENT_BUFFER_WRITEBACK 7
+#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
+#define EXTENT_BUFFER_DUMMY 9
+#define EXTENT_BUFFER_IN_TREE 10
+#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
+
+/* these are flags for extent_clear_unlock_delalloc */
+#define PAGE_UNLOCK (1 << 0)
+#define PAGE_CLEAR_DIRTY (1 << 1)
+#define PAGE_SET_WRITEBACK (1 << 2)
+#define PAGE_END_WRITEBACK (1 << 3)
+#define PAGE_SET_PRIVATE2 (1 << 4)
+#define PAGE_SET_ERROR (1 << 5)
+
+/*
+ * page->private values. Every page that is controlled by the extent
+ * map has page->private set to one.
+ */
+#define EXTENT_PAGE_PRIVATE 1
+
+struct extent_state;
+struct btrfs_root;
+struct btrfs_io_bio;
+
+typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags, u64 bio_offset);
+struct extent_io_ops {
+ int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written);
+ int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
+ int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
+ extent_submit_bio_hook_t *submit_bio_hook;
+ int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags);
+ int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
+ int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror);
+ int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
+ struct extent_state *state, int uptodate);
+ void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+ unsigned *bits);
+ void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+ unsigned *bits);
+ void (*merge_extent_hook)(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other);
+ void (*split_extent_hook)(struct inode *inode,
+ struct extent_state *orig, u64 split);
+};
+
+struct extent_io_tree {
+ struct rb_root state;
+ struct address_space *mapping;
+ u64 dirty_bytes;
+ int track_uptodate;
+ spinlock_t lock;
+ const struct extent_io_ops *ops;
+};
+
+struct extent_state {
+ u64 start;
+ u64 end; /* inclusive */
+ struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
+ wait_queue_head_t wq;
+ atomic_t refs;
+ unsigned state;
+
+ /* for use by the FS */
+ u64 private;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
+};
+
+#define INLINE_EXTENT_BUFFER_PAGES 16
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
+struct extent_buffer {
+ u64 start;
+ unsigned long len;
+ unsigned long bflags;
+ struct btrfs_fs_info *fs_info;
+ spinlock_t refs_lock;
+ atomic_t refs;
+ atomic_t io_pages;
+ int read_mirror;
+ struct rcu_head rcu_head;
+ pid_t lock_owner;
+
+ /* count of read lock holders on the extent buffer */
+ atomic_t write_locks;
+ atomic_t read_locks;
+ atomic_t blocking_writers;
+ atomic_t blocking_readers;
+ atomic_t spinning_readers;
+ atomic_t spinning_writers;
+ short lock_nested;
+ /* >= 0 if eb belongs to a log tree, -1 otherwise */
+ short log_index;
+
+ /* protects write locks */
+ rwlock_t lock;
+
+ /* readers use lock_wq while they wait for the write
+ * lock holders to unlock
+ */
+ wait_queue_head_t write_lock_wq;
+
+ /* writers use read_lock_wq while they wait for readers
+ * to unlock
+ */
+ wait_queue_head_t read_lock_wq;
+ struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
+};
+
+static inline void extent_set_compress_type(unsigned long *bio_flags,
+ int compress_type)
+{
+ *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
+}
+
+static inline int extent_compress_type(unsigned long bio_flags)
+{
+ return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
+}
+
+struct extent_map_tree;
+
+typedef struct extent_map *(get_extent_t)(struct inode *inode,
+ struct page *page,
+ size_t pg_offset,
+ u64 start, u64 len,
+ int create);
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+ struct address_space *mapping);
+int try_release_extent_mapping(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask);
+int try_release_extent_buffer(struct page *page);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, struct extent_state **cached);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached, gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent, int mirror_num);
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent, int mirror_num);
+int __init extent_io_init(void);
+void extent_io_exit(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, unsigned bits, int contig);
+
+void free_extent_state(struct extent_state *state);
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int filled,
+ struct extent_state *cached_state);
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask);
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask);
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, unsigned clear_bits,
+ struct extent_state **cached_state, gfp_t mask);
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
+ struct extent_state **cached_state);
+int extent_invalidatepage(struct extent_io_tree *tree,
+ struct page *page, unsigned long offset);
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc);
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+ u64 start, u64 end, get_extent_t *get_extent,
+ int mode);
+int extent_writepages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc);
+int btree_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc);
+int extent_readpages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages,
+ get_extent_t get_extent);
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len, get_extent_t *get_extent);
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
+void set_page_extent_mapped(struct page *page);
+
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
+void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
+#define WAIT_NONE 0
+#define WAIT_COMPLETE 1
+#define WAIT_PAGE_LOCK 2
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+ struct extent_buffer *eb, u64 start, int wait,
+ get_extent_t *get_extent, int mirror_num);
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+ return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+ (start >> PAGE_CACHE_SHIFT);
+}
+
+static inline void extent_buffer_get(struct extent_buffer *eb)
+{
+ atomic_inc(&eb->refs);
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+ unsigned long start,
+ unsigned long len);
+void read_extent_buffer(struct extent_buffer *eb, void *dst,
+ unsigned long start,
+ unsigned long len);
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
+ unsigned long start,
+ unsigned long len);
+void write_extent_buffer(struct extent_buffer *eb, const void *src,
+ unsigned long start, unsigned long len);
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len);
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len);
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len);
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+ unsigned long start, unsigned long len);
+void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_under_io(struct extent_buffer *eb);
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+ unsigned long min_len, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len);
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+ struct page *locked_page,
+ unsigned bits_to_clear,
+ unsigned long page_ops);
+struct bio *
+btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+ gfp_t gfp_flags);
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
+
+struct btrfs_fs_info;
+
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+ struct page *page, unsigned int pg_offset,
+ int mirror_num);
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+ unsigned int pg_offset);
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+ int mirror_num);
+
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+ struct page *page;
+ u64 start;
+ u64 len;
+ u64 logical;
+ unsigned long bio_flags;
+ int this_mirror;
+ int failed_mirror;
+ int in_validation;
+};
+
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end);
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret);
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec, int fail_mirror);
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ struct page *page, int pg_offset, int icsum,
+ bio_end_io_t *endio_func, void *data);
+int free_io_failure(struct inode *inode, struct io_failure_record *rec);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+noinline u64 find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes);
+#endif
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
+#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000..6a98bddd8
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,455 @@
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include "ctree.h"
+#include "extent_map.h"
+
+
+static struct kmem_cache *extent_map_cache;
+
+int __init extent_map_init(void)
+{
+ extent_map_cache = kmem_cache_create("btrfs_extent_map",
+ sizeof(struct extent_map), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!extent_map_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void extent_map_exit(void)
+{
+ if (extent_map_cache)
+ kmem_cache_destroy(extent_map_cache);
+}
+
+/**
+ * extent_map_tree_init - initialize extent map tree
+ * @tree: tree to initialize
+ *
+ * Initialize the extent tree @tree. Should be called for each new inode
+ * or other user of the extent_map interface.
+ */
+void extent_map_tree_init(struct extent_map_tree *tree)
+{
+ tree->map = RB_ROOT;
+ INIT_LIST_HEAD(&tree->modified_extents);
+ rwlock_init(&tree->lock);
+}
+
+/**
+ * alloc_extent_map - allocate new extent map structure
+ *
+ * Allocate a new extent_map structure. The new structure is
+ * returned with a reference count of one and needs to be
+ * freed using free_extent_map()
+ */
+struct extent_map *alloc_extent_map(void)
+{
+ struct extent_map *em;
+ em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
+ if (!em)
+ return NULL;
+ RB_CLEAR_NODE(&em->rb_node);
+ em->flags = 0;
+ em->compress_type = BTRFS_COMPRESS_NONE;
+ em->generation = 0;
+ atomic_set(&em->refs, 1);
+ INIT_LIST_HEAD(&em->list);
+ return em;
+}
+
+/**
+ * free_extent_map - drop reference count of an extent_map
+ * @em: extent map beeing releasead
+ *
+ * Drops the reference out on @em by one and free the structure
+ * if the reference count hits zero.
+ */
+void free_extent_map(struct extent_map *em)
+{
+ if (!em)
+ return;
+ WARN_ON(atomic_read(&em->refs) == 0);
+ if (atomic_dec_and_test(&em->refs)) {
+ WARN_ON(extent_map_in_tree(em));
+ WARN_ON(!list_empty(&em->list));
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ kfree(em->bdev);
+ kmem_cache_free(extent_map_cache, em);
+ }
+}
+
+/* simple helper to do math around the end of an extent, handling wrap */
+static u64 range_end(u64 start, u64 len)
+{
+ if (start + len < start)
+ return (u64)-1;
+ return start + len;
+}
+
+static int tree_insert(struct rb_root *root, struct extent_map *em)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_map *entry = NULL;
+ struct rb_node *orig_parent = NULL;
+ u64 end = range_end(em->start, em->len);
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct extent_map, rb_node);
+
+ if (em->start < entry->start)
+ p = &(*p)->rb_left;
+ else if (em->start >= extent_map_end(entry))
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ orig_parent = parent;
+ while (parent && em->start >= extent_map_end(entry)) {
+ parent = rb_next(parent);
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ }
+ if (parent)
+ if (end > entry->start && em->start < extent_map_end(entry))
+ return -EEXIST;
+
+ parent = orig_parent;
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ while (parent && em->start < entry->start) {
+ parent = rb_prev(parent);
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ }
+ if (parent)
+ if (end > entry->start && em->start < extent_map_end(entry))
+ return -EEXIST;
+
+ rb_link_node(&em->rb_node, orig_parent, p);
+ rb_insert_color(&em->rb_node, root);
+ return 0;
+}
+
+/*
+ * search through the tree for an extent_map with a given offset. If
+ * it can't be found, try to find some neighboring extents
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev = NULL;
+ struct extent_map *entry;
+ struct extent_map *prev_entry = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct extent_map, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ if (offset < entry->start)
+ n = n->rb_left;
+ else if (offset >= extent_map_end(entry))
+ n = n->rb_right;
+ else
+ return n;
+ }
+
+ if (prev_ret) {
+ orig_prev = prev;
+ while (prev && offset >= extent_map_end(prev_entry)) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+ *prev_ret = prev;
+ prev = orig_prev;
+ }
+
+ if (next_ret) {
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+ *next_ret = prev;
+ }
+ return NULL;
+}
+
+/* check to see if two extent_map structs are adjacent and safe to merge */
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+{
+ if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
+ return 0;
+
+ /*
+ * don't merge compressed extents, we need to know their
+ * actual size
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+ return 0;
+
+ if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+ test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+ return 0;
+
+ /*
+ * We don't want to merge stuff that hasn't been written to the log yet
+ * since it may not reflect exactly what is on disk, and that would be
+ * bad.
+ */
+ if (!list_empty(&prev->list) || !list_empty(&next->list))
+ return 0;
+
+ if (extent_map_end(prev) == next->start &&
+ prev->flags == next->flags &&
+ prev->bdev == next->bdev &&
+ ((next->block_start == EXTENT_MAP_HOLE &&
+ prev->block_start == EXTENT_MAP_HOLE) ||
+ (next->block_start == EXTENT_MAP_INLINE &&
+ prev->block_start == EXTENT_MAP_INLINE) ||
+ (next->block_start == EXTENT_MAP_DELALLOC &&
+ prev->block_start == EXTENT_MAP_DELALLOC) ||
+ (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
+ next->block_start == extent_map_block_end(prev)))) {
+ return 1;
+ }
+ return 0;
+}
+
+static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
+{
+ struct extent_map *merge = NULL;
+ struct rb_node *rb;
+
+ if (em->start != 0) {
+ rb = rb_prev(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(merge, em)) {
+ em->start = merge->start;
+ em->orig_start = merge->orig_start;
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ em->block_start = merge->block_start;
+ em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+ em->mod_start = merge->mod_start;
+ em->generation = max(em->generation, merge->generation);
+
+ rb_erase(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
+ free_extent_map(merge);
+ }
+ }
+
+ rb = rb_next(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(em, merge)) {
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ rb_erase(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
+ em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+ em->generation = max(em->generation, merge->generation);
+ free_extent_map(merge);
+ }
+}
+
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree: tree to unpin the extent in
+ * @start: logical offset in the file
+ * @len: length of the extent
+ * @gen: generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly. Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+ u64 gen)
+{
+ int ret = 0;
+ struct extent_map *em;
+ bool prealloc = false;
+
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, start, len);
+
+ WARN_ON(!em || em->start != start);
+
+ if (!em)
+ goto out;
+
+ em->generation = gen;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+ prealloc = true;
+ clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+ }
+
+ try_merge_map(tree, em);
+
+ if (prealloc) {
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+ }
+
+ free_extent_map(em);
+out:
+ write_unlock(&tree->lock);
+ return ret;
+
+}
+
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ if (extent_map_in_tree(em))
+ try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em,
+ int modified)
+{
+ atomic_inc(&em->refs);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (modified)
+ list_move(&em->list, &tree->modified_extents);
+ else
+ try_merge_map(tree, em);
+}
+
+/**
+ * add_extent_mapping - add new extent map to the extent tree
+ * @tree: tree to insert new map in
+ * @em: map to insert
+ *
+ * Insert @em into @tree or perform a simple forward/backward merge with
+ * existing mappings. The extent_map struct passed in will be inserted
+ * into the tree directly, with an additional reference taken, or a
+ * reference dropped if the merge attempt was successful.
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em, int modified)
+{
+ int ret = 0;
+
+ ret = tree_insert(&tree->map, em);
+ if (ret)
+ goto out;
+
+ setup_extent_mapping(tree, em, modified);
+out:
+ return ret;
+}
+
+static struct extent_map *
+__lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len, int strict)
+{
+ struct extent_map *em;
+ struct rb_node *rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *next = NULL;
+ u64 end = range_end(start, len);
+
+ rb_node = __tree_search(&tree->map, start, &prev, &next);
+ if (!rb_node) {
+ if (prev)
+ rb_node = prev;
+ else if (next)
+ rb_node = next;
+ else
+ return NULL;
+ }
+
+ em = rb_entry(rb_node, struct extent_map, rb_node);
+
+ if (strict && !(end > em->start && start < extent_map_end(em)))
+ return NULL;
+
+ atomic_inc(&em->refs);
+ return em;
+}
+
+/**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range. There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ return __lookup_extent_mapping(tree, start, len, 1);
+}
+
+/**
+ * search_extent_mapping - find a nearby extent map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.
+ *
+ * If one can't be found, any nearby extent may be returned
+ */
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ return __lookup_extent_mapping(tree, start, len, 0);
+}
+
+/**
+ * remove_extent_mapping - removes an extent_map from the extent tree
+ * @tree: extent tree to remove from
+ * @em: extent map beeing removed
+ *
+ * Removes @em from @tree. No reference counts are dropped, and no checks
+ * are done to see if the range is in use
+ */
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+ int ret = 0;
+
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ rb_erase(&em->rb_node, &tree->map);
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_del_init(&em->list);
+ RB_CLEAR_NODE(&em->rb_node);
+ return ret;
+}
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified)
+{
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ ASSERT(extent_map_in_tree(cur));
+ if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ list_del_init(&cur->list);
+ rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ RB_CLEAR_NODE(&cur->rb_node);
+
+ setup_extent_mapping(tree, new, modified);
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000..b2991fd85
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,85 @@
+#ifndef __EXTENTMAP__
+#define __EXTENTMAP__
+
+#include <linux/rbtree.h>
+
+#define EXTENT_MAP_LAST_BYTE ((u64)-4)
+#define EXTENT_MAP_HOLE ((u64)-3)
+#define EXTENT_MAP_INLINE ((u64)-2)
+#define EXTENT_MAP_DELALLOC ((u64)-1)
+
+/* bits for the flags field */
+#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
+#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
+#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
+#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
+
+struct extent_map {
+ struct rb_node rb_node;
+
+ /* all of these are in bytes */
+ u64 start;
+ u64 len;
+ u64 mod_start;
+ u64 mod_len;
+ u64 orig_start;
+ u64 orig_block_len;
+ u64 ram_bytes;
+ u64 block_start;
+ u64 block_len;
+ u64 generation;
+ unsigned long flags;
+ struct block_device *bdev;
+ atomic_t refs;
+ unsigned int compress_type;
+ struct list_head list;
+};
+
+struct extent_map_tree {
+ struct rb_root map;
+ struct list_head modified_extents;
+ rwlock_t lock;
+};
+
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+ return !RB_EMPTY_NODE(&em->rb_node);
+}
+
+static inline u64 extent_map_end(struct extent_map *em)
+{
+ if (em->start + em->len < em->start)
+ return (u64)-1;
+ return em->start + em->len;
+}
+
+static inline u64 extent_map_block_end(struct extent_map *em)
+{
+ if (em->block_start + em->block_len < em->block_start)
+ return (u64)-1;
+ return em->block_start + em->block_len;
+}
+
+void extent_map_tree_init(struct extent_map_tree *tree);
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em, int modified);
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified);
+
+struct extent_map *alloc_extent_map(void);
+void free_extent_map(struct extent_map *em);
+int __init extent_map_init(void);
+void extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000..58ece6558
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,953 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "print-tree.h"
+
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) * 2) / \
+ size) - 1))
+
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+ PAGE_CACHE_SIZE))
+
+#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
+ sizeof(struct btrfs_ordered_sum)) / \
+ sizeof(u32) * (r)->sectorsize)
+
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset, u64 ram_bytes,
+ u8 compression, u8 encryption, u16 other_encoding)
+{
+ int ret = 0;
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key file_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ file_key.objectid = objectid;
+ file_key.offset = pos;
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+ sizeof(*item));
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret); /* Can't happen */
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
+ btrfs_set_file_extent_offset(leaf, item, offset);
+ btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+ btrfs_set_file_extent_generation(leaf, item, trans->transid);
+ btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_compression(leaf, item, compression);
+ btrfs_set_file_extent_encryption(leaf, item, encryption);
+ btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static struct btrfs_csum_item *
+btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, int cow)
+{
+ int ret;
+ struct btrfs_key file_key;
+ struct btrfs_key found_key;
+ struct btrfs_csum_item *item;
+ struct extent_buffer *leaf;
+ u64 csum_offset = 0;
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ int csums_in_item;
+
+ file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ file_key.offset = bytenr;
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
+ if (ret < 0)
+ goto fail;
+ leaf = path->nodes[0];
+ if (ret > 0) {
+ ret = 1;
+ if (path->slots[0] == 0)
+ goto fail;
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
+ goto fail;
+
+ csum_offset = (bytenr - found_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+ csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
+ csums_in_item /= csum_size;
+
+ if (csum_offset == csums_in_item) {
+ ret = -EFBIG;
+ goto fail;
+ } else if (csum_offset > csums_in_item) {
+ goto fail;
+ }
+ }
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ item = (struct btrfs_csum_item *)((unsigned char *)item +
+ csum_offset * csum_size);
+ return item;
+fail:
+ if (ret > 0)
+ ret = -ENOENT;
+ return ERR_PTR(ret);
+}
+
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 offset, int mod)
+{
+ int ret;
+ struct btrfs_key file_key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ file_key.objectid = objectid;
+ file_key.offset = offset;
+ file_key.type = BTRFS_EXTENT_DATA_KEY;
+ ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+ return ret;
+}
+
+static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
+{
+ kfree(bio->csum_allocated);
+}
+
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+ struct inode *inode, struct bio *bio,
+ u64 logical_offset, u32 *dst, int dio)
+{
+ struct bio_vec *bvec = bio->bi_io_vec;
+ struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+ struct btrfs_csum_item *item = NULL;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_path *path;
+ u8 *csum;
+ u64 offset = 0;
+ u64 item_start_offset = 0;
+ u64 item_last_offset = 0;
+ u64 disk_bytenr;
+ u32 diff;
+ int nblocks;
+ int bio_index = 0;
+ int count;
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
+ if (!dst) {
+ if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
+ btrfs_bio->csum_allocated = kmalloc_array(nblocks,
+ csum_size, GFP_NOFS);
+ if (!btrfs_bio->csum_allocated) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+ btrfs_bio->csum = btrfs_bio->csum_allocated;
+ btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
+ } else {
+ btrfs_bio->csum = btrfs_bio->csum_inline;
+ }
+ csum = btrfs_bio->csum;
+ } else {
+ csum = (u8 *)dst;
+ }
+
+ if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
+ path->reada = 2;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+
+ /*
+ * the free space stuff is only read when it hasn't been
+ * updated in the current transaction. So, we can safely
+ * read from the commit root and sidestep a nasty deadlock
+ * between reading the free space cache and updating the csum tree.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
+
+ disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
+ if (dio)
+ offset = logical_offset;
+ while (bio_index < bio->bi_vcnt) {
+ if (!dio)
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
+ (u32 *)csum, nblocks);
+ if (count)
+ goto found;
+
+ if (!item || disk_bytenr < item_start_offset ||
+ disk_bytenr >= item_last_offset) {
+ struct btrfs_key found_key;
+ u32 item_size;
+
+ if (item)
+ btrfs_release_path(path);
+ item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
+ path, disk_bytenr, 0);
+ if (IS_ERR(item)) {
+ count = 1;
+ memset(csum, 0, csum_size);
+ if (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ set_extent_bits(io_tree, offset,
+ offset + bvec->bv_len - 1,
+ EXTENT_NODATASUM, GFP_NOFS);
+ } else {
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "no csum found for inode %llu start %llu",
+ btrfs_ino(inode), offset);
+ }
+ item = NULL;
+ btrfs_release_path(path);
+ goto found;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+
+ item_start_offset = found_key.offset;
+ item_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ item_last_offset = item_start_offset +
+ (item_size / csum_size) *
+ root->sectorsize;
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ }
+ /*
+ * this byte range must be able to fit inside
+ * a single leaf so it will also fit inside a u32
+ */
+ diff = disk_bytenr - item_start_offset;
+ diff = diff / root->sectorsize;
+ diff = diff * csum_size;
+ count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >>
+ inode->i_sb->s_blocksize_bits);
+ read_extent_buffer(path->nodes[0], csum,
+ ((unsigned long)item) + diff,
+ csum_size * count);
+found:
+ csum += count * csum_size;
+ nblocks -= count;
+ bio_index += count;
+ while (count--) {
+ disk_bytenr += bvec->bv_len;
+ offset += bvec->bv_len;
+ bvec++;
+ }
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u32 *dst)
+{
+ return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 offset)
+{
+ return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
+}
+
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_csum_item *item;
+ LIST_HEAD(tmplist);
+ unsigned long offset;
+ int ret;
+ size_t size;
+ u64 csum_end;
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+ ASSERT(IS_ALIGNED(start, root->sectorsize) &&
+ IS_ALIGNED(end + 1, root->sectorsize));
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (search_commit) {
+ path->skip_locking = 1;
+ path->reada = 2;
+ path->search_commit_root = 1;
+ }
+
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.offset = start;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0 && path->slots[0] > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+ key.type == BTRFS_EXTENT_CSUM_KEY) {
+ offset = (start - key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+ if (offset * csum_size <
+ btrfs_item_size_nr(leaf, path->slots[0] - 1))
+ path->slots[0]--;
+ }
+ }
+
+ while (start <= end) {
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset > end)
+ break;
+
+ if (key.offset > start)
+ start = key.offset;
+
+ size = btrfs_item_size_nr(leaf, path->slots[0]);
+ csum_end = key.offset + (size / csum_size) * root->sectorsize;
+ if (csum_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ csum_end = min(csum_end, end + 1);
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ while (start < csum_end) {
+ size = min_t(size_t, csum_end - start,
+ MAX_ORDERED_SUM_BYTES(root));
+ sums = kzalloc(btrfs_ordered_sum_size(root, size),
+ GFP_NOFS);
+ if (!sums) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ sums->bytenr = start;
+ sums->len = (int)size;
+
+ offset = (start - key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+ offset *= csum_size;
+ size >>= root->fs_info->sb->s_blocksize_bits;
+
+ read_extent_buffer(path->nodes[0],
+ sums->sums,
+ ((unsigned long)item) + offset,
+ csum_size * size);
+
+ start += root->sectorsize * size;
+ list_add_tail(&sums->list, &tmplist);
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+fail:
+ while (ret < 0 && !list_empty(&tmplist)) {
+ sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ list_splice_tail(&tmplist, list);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 file_start, int contig)
+{
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_extent *ordered;
+ char *data;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int bio_index = 0;
+ int index;
+ unsigned long total_bytes = 0;
+ unsigned long this_sum_bytes = 0;
+ u64 offset;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+ sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size),
+ GFP_NOFS);
+ if (!sums)
+ return -ENOMEM;
+
+ sums->len = bio->bi_iter.bi_size;
+ INIT_LIST_HEAD(&sums->list);
+
+ if (contig)
+ offset = file_start;
+ else
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ BUG_ON(!ordered); /* Logic error */
+ sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
+ index = 0;
+
+ while (bio_index < bio->bi_vcnt) {
+ if (!contig)
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+ if (offset >= ordered->file_offset + ordered->len ||
+ offset < ordered->file_offset) {
+ unsigned long bytes_left;
+ sums->len = this_sum_bytes;
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+
+ bytes_left = bio->bi_iter.bi_size - total_bytes;
+
+ sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
+ GFP_NOFS);
+ BUG_ON(!sums); /* -ENOMEM */
+ sums->len = bytes_left;
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ BUG_ON(!ordered); /* Logic error */
+ sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) +
+ total_bytes;
+ index = 0;
+ }
+
+ data = kmap_atomic(bvec->bv_page);
+ sums->sums[index] = ~(u32)0;
+ sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
+ sums->sums[index],
+ bvec->bv_len);
+ kunmap_atomic(data);
+ btrfs_csum_final(sums->sums[index],
+ (char *)(sums->sums + index));
+
+ bio_index++;
+ index++;
+ total_bytes += bvec->bv_len;
+ this_sum_bytes += bvec->bv_len;
+ offset += bvec->bv_len;
+ bvec++;
+ }
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
+/*
+ * helper function for csum removal, this expects the
+ * key to describe the csum pointed to by the path, and it expects
+ * the csum to overlap the range [bytenr, len]
+ *
+ * The csum should not be entirely contained in the range and the
+ * range should not be entirely contained in the csum.
+ *
+ * This calls btrfs_truncate_item with the correct args based on the
+ * overlap, and fixes up the key as required.
+ */
+static noinline void truncate_one_csum(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 bytenr, u64 len)
+{
+ struct extent_buffer *leaf;
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ u64 csum_end;
+ u64 end_byte = bytenr + len;
+ u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+
+ leaf = path->nodes[0];
+ csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end <<= root->fs_info->sb->s_blocksize_bits;
+ csum_end += key->offset;
+
+ if (key->offset < bytenr && csum_end <= end_byte) {
+ /*
+ * [ bytenr - len ]
+ * [ ]
+ * [csum ]
+ * A simple truncate off the end of the item
+ */
+ u32 new_size = (bytenr - key->offset) >> blocksize_bits;
+ new_size *= csum_size;
+ btrfs_truncate_item(root, path, new_size, 1);
+ } else if (key->offset >= bytenr && csum_end > end_byte &&
+ end_byte > key->offset) {
+ /*
+ * [ bytenr - len ]
+ * [ ]
+ * [csum ]
+ * we need to truncate from the beginning of the csum
+ */
+ u32 new_size = (csum_end - end_byte) >> blocksize_bits;
+ new_size *= csum_size;
+
+ btrfs_truncate_item(root, path, new_size, 0);
+
+ key->offset = end_byte;
+ btrfs_set_item_key_safe(root->fs_info, path, key);
+ } else {
+ BUG();
+ }
+}
+
+/*
+ * deletes the csum items from the csum tree for a given
+ * range of bytes.
+ */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 end_byte = bytenr + len;
+ u64 csum_end;
+ struct extent_buffer *leaf;
+ int ret;
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+
+ root = root->fs_info->csum_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.offset = end_byte - 1;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ } else if (ret < 0) {
+ break;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY) {
+ break;
+ }
+
+ if (key.offset >= end_byte)
+ break;
+
+ csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end <<= blocksize_bits;
+ csum_end += key.offset;
+
+ /* this csum ends before we start, we're done */
+ if (csum_end <= bytenr)
+ break;
+
+ /* delete the entire item, it is inside our range */
+ if (key.offset >= bytenr && csum_end <= end_byte) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ if (key.offset == bytenr)
+ break;
+ } else if (key.offset < bytenr && csum_end > end_byte) {
+ unsigned long offset;
+ unsigned long shift_len;
+ unsigned long item_offset;
+ /*
+ * [ bytenr - len ]
+ * [csum ]
+ *
+ * Our bytes are in the middle of the csum,
+ * we need to split this item and insert a new one.
+ *
+ * But we can't drop the path because the
+ * csum could change, get removed, extended etc.
+ *
+ * The trick here is the max size of a csum item leaves
+ * enough room in the tree block for a single
+ * item header. So, we split the item in place,
+ * adding a new header pointing to the existing
+ * bytes. Then we loop around again and we have
+ * a nicely formed csum item that we can neatly
+ * truncate.
+ */
+ offset = (bytenr - key.offset) >> blocksize_bits;
+ offset *= csum_size;
+
+ shift_len = (len >> blocksize_bits) * csum_size;
+
+ item_offset = btrfs_item_ptr_offset(leaf,
+ path->slots[0]);
+
+ memset_extent_buffer(leaf, 0, item_offset + offset,
+ shift_len);
+ key.offset = bytenr;
+
+ /*
+ * btrfs_split_item returns -EAGAIN when the
+ * item changed size or key
+ */
+ ret = btrfs_split_item(trans, root, path, &key, offset);
+ if (ret && ret != -EAGAIN) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ key.offset = end_byte - 1;
+ } else {
+ truncate_one_csum(root, path, &key, bytenr, len);
+ if (key.offset < bytenr)
+ break;
+ }
+ btrfs_release_path(path);
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums)
+{
+ struct btrfs_key file_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct btrfs_csum_item *item;
+ struct btrfs_csum_item *item_end;
+ struct extent_buffer *leaf = NULL;
+ u64 next_offset;
+ u64 total_bytes = 0;
+ u64 csum_offset;
+ u64 bytenr;
+ u32 nritems;
+ u32 ins_size;
+ int index = 0;
+ int found_next;
+ int ret;
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
+ next_offset = (u64)-1;
+ found_next = 0;
+ bytenr = sums->bytenr + total_bytes;
+ file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ file_key.offset = bytenr;
+ file_key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
+ if (!IS_ERR(item)) {
+ ret = 0;
+ leaf = path->nodes[0];
+ item_end = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((char *)item_end +
+ btrfs_item_size_nr(leaf, path->slots[0]));
+ goto found;
+ }
+ ret = PTR_ERR(item);
+ if (ret != -EFBIG && ret != -ENOENT)
+ goto fail_unlock;
+
+ if (ret == -EFBIG) {
+ u32 item_size;
+ /* we found one, but it isn't big enough yet */
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if ((item_size / csum_size) >=
+ MAX_CSUM_ITEMS(root, csum_size)) {
+ /* already at max size, make a new one */
+ goto insert;
+ }
+ } else {
+ int slot = path->slots[0] + 1;
+ /* we didn't find a csum item, insert one */
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (!nritems || (path->slots[0] >= nritems - 1)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 1)
+ found_next = 1;
+ if (ret != 0)
+ goto insert;
+ slot = path->slots[0];
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY) {
+ found_next = 1;
+ goto insert;
+ }
+ next_offset = found_key.offset;
+ found_next = 1;
+ goto insert;
+ }
+
+ /*
+ * at this point, we know the tree has an item, but it isn't big
+ * enough yet to put our csum in. Grow it
+ */
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &file_key, path,
+ csum_size, 1);
+ if (ret < 0)
+ goto fail_unlock;
+
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto insert;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ csum_offset = (bytenr - found_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+
+ if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
+ found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
+ goto insert;
+ }
+
+ if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
+ csum_size) {
+ int extend_nr;
+ u64 tmp;
+ u32 diff;
+ u32 free_space;
+
+ if (btrfs_leaf_free_space(root, leaf) <
+ sizeof(struct btrfs_item) + csum_size * 2)
+ goto insert;
+
+ free_space = btrfs_leaf_free_space(root, leaf) -
+ sizeof(struct btrfs_item) - csum_size;
+ tmp = sums->len - total_bytes;
+ tmp >>= root->fs_info->sb->s_blocksize_bits;
+ WARN_ON(tmp < 1);
+
+ extend_nr = max_t(int, 1, (int)tmp);
+ diff = (csum_offset + extend_nr) * csum_size;
+ diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
+
+ diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
+ diff = min(free_space, diff);
+ diff /= csum_size;
+ diff *= csum_size;
+
+ btrfs_extend_item(root, path, diff);
+ ret = 0;
+ goto csum;
+ }
+
+insert:
+ btrfs_release_path(path);
+ csum_offset = 0;
+ if (found_next) {
+ u64 tmp;
+
+ tmp = sums->len - total_bytes;
+ tmp >>= root->fs_info->sb->s_blocksize_bits;
+ tmp = min(tmp, (next_offset - file_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits);
+
+ tmp = max((u64)1, tmp);
+ tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
+ ins_size = csum_size * tmp;
+ } else {
+ ins_size = csum_size;
+ }
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+ ins_size);
+ path->leave_spinning = 0;
+ if (ret < 0)
+ goto fail_unlock;
+ if (WARN_ON(ret != 0))
+ goto fail_unlock;
+ leaf = path->nodes[0];
+csum:
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+ btrfs_item_size_nr(leaf, path->slots[0]));
+ item = (struct btrfs_csum_item *)((unsigned char *)item +
+ csum_offset * csum_size);
+found:
+ ins_size = (u32)(sums->len - total_bytes) >>
+ root->fs_info->sb->s_blocksize_bits;
+ ins_size *= csum_size;
+ ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+ ins_size);
+ write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+ ins_size);
+
+ ins_size /= csum_size;
+ total_bytes += ins_size * root->sectorsize;
+ index += ins_size;
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ if (total_bytes < sums->len) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto again;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+
+fail_unlock:
+ goto out;
+}
+
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_key key;
+ u64 extent_start, extent_end;
+ u64 bytenr;
+ u8 type = btrfs_file_extent_type(leaf, fi);
+ int compress_type = btrfs_file_extent_compression(leaf, fi);
+
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_start = key.offset;
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_end = extent_start +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ size_t size;
+ size = btrfs_file_extent_inline_len(leaf, slot, fi);
+ extent_end = ALIGN(extent_start + size, root->sectorsize);
+ }
+
+ em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ em->orig_start = extent_start -
+ btrfs_file_extent_offset(leaf, fi);
+ em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (bytenr == 0) {
+ em->block_start = EXTENT_MAP_HOLE;
+ return;
+ }
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ em->block_start = bytenr;
+ em->block_len = em->orig_block_len;
+ } else {
+ bytenr += btrfs_file_extent_offset(leaf, fi);
+ em->block_start = bytenr;
+ em->block_len = em->len;
+ if (type == BTRFS_FILE_EXTENT_PREALLOC)
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ }
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ em->block_start = EXTENT_MAP_INLINE;
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ /*
+ * Initialize orig_start and block_len with the same values
+ * as in inode.c:btrfs_get_extent().
+ */
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->block_len = (u64)-1;
+ if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
+ } else {
+ btrfs_err(root->fs_info,
+ "unknown file extent item type %d, inode %llu, offset %llu, root %llu",
+ type, btrfs_ino(inode), extent_start,
+ root->root_key.objectid);
+ }
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000..b072e1747
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,2860 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/falloc.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/slab.h>
+#include <linux/btrfs.h>
+#include <linux/uio.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "tree-log.h"
+#include "locking.h"
+#include "volumes.h"
+#include "qgroup.h"
+
+static struct kmem_cache *btrfs_inode_defrag_cachep;
+/*
+ * when auto defrag is enabled we
+ * queue up these defrag structs to remember which
+ * inodes need defragging passes
+ */
+struct inode_defrag {
+ struct rb_node rb_node;
+ /* objectid */
+ u64 ino;
+ /*
+ * transid where the defrag was added, we search for
+ * extents newer than this
+ */
+ u64 transid;
+
+ /* root objectid */
+ u64 root;
+
+ /* last offset we were able to defrag */
+ u64 last_offset;
+
+ /* if we've wrapped around back to zero once already */
+ int cycled;
+};
+
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+ struct inode_defrag *defrag2)
+{
+ if (defrag1->root > defrag2->root)
+ return 1;
+ else if (defrag1->root < defrag2->root)
+ return -1;
+ else if (defrag1->ino > defrag2->ino)
+ return 1;
+ else if (defrag1->ino < defrag2->ino)
+ return -1;
+ else
+ return 0;
+}
+
+/* pop a record for an inode into the defrag tree. The lock
+ * must be held already
+ *
+ * If you're inserting a record for an older transid than an
+ * existing record, the transid already in the tree is lowered
+ *
+ * If an existing record is found the defrag item you
+ * pass in is freed
+ */
+static int __btrfs_add_inode_defrag(struct inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct inode_defrag *entry;
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ p = &root->fs_info->defrag_inodes.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(defrag, entry);
+ if (ret < 0)
+ p = &parent->rb_left;
+ else if (ret > 0)
+ p = &parent->rb_right;
+ else {
+ /* if we're reinserting an entry for
+ * an old defrag run, make sure to
+ * lower the transid of our existing record
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ if (defrag->last_offset > entry->last_offset)
+ entry->last_offset = defrag->last_offset;
+ return -EEXIST;
+ }
+ }
+ set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ rb_link_node(&defrag->rb_node, parent, p);
+ rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
+ return 0;
+}
+
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+ if (!btrfs_test_opt(root, AUTO_DEFRAG))
+ return 0;
+
+ if (btrfs_fs_closing(root->fs_info))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * insert a defrag record for this inode if auto defrag is
+ * enabled
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct inode_defrag *defrag;
+ u64 transid;
+ int ret;
+
+ if (!__need_auto_defrag(root))
+ return 0;
+
+ if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
+ return 0;
+
+ if (trans)
+ transid = trans->transid;
+ else
+ transid = BTRFS_I(inode)->root->last_trans;
+
+ defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
+ if (!defrag)
+ return -ENOMEM;
+
+ defrag->ino = btrfs_ino(inode);
+ defrag->transid = transid;
+ defrag->root = root->root_key.objectid;
+
+ spin_lock(&root->fs_info->defrag_inodes_lock);
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+ /*
+ * If we set IN_DEFRAG flag and evict the inode from memory,
+ * and then re-read this inode, this new inode doesn't have
+ * IN_DEFRAG flag. At the case, we may find the existed defrag.
+ */
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ if (ret)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+ spin_unlock(&root->fs_info->defrag_inodes_lock);
+ return 0;
+}
+
+/*
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
+ */
+static void btrfs_requeue_inode_defrag(struct inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (!__need_auto_defrag(root))
+ goto out;
+
+ /*
+ * Here we don't check the IN_DEFRAG flag, because we need merge
+ * them together.
+ */
+ spin_lock(&root->fs_info->defrag_inodes_lock);
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ spin_unlock(&root->fs_info->defrag_inodes_lock);
+ if (ret)
+ goto out;
+ return;
+out:
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want, if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
+{
+ struct inode_defrag *entry = NULL;
+ struct inode_defrag tmp;
+ struct rb_node *p;
+ struct rb_node *parent = NULL;
+ int ret;
+
+ tmp.ino = ino;
+ tmp.root = root;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ p = fs_info->defrag_inodes.rb_node;
+ while (p) {
+ parent = p;
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+ ret = __compare_inode_defrag(&tmp, entry);
+ if (ret < 0)
+ p = parent->rb_left;
+ else if (ret > 0)
+ p = parent->rb_right;
+ else
+ goto out;
+ }
+
+ if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ parent = rb_next(parent);
+ if (parent)
+ entry = rb_entry(parent, struct inode_defrag, rb_node);
+ else
+ entry = NULL;
+ }
+out:
+ if (entry)
+ rb_erase(parent, &fs_info->defrag_inodes);
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return entry;
+}
+
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ node = rb_first(&fs_info->defrag_inodes);
+ while (node) {
+ rb_erase(node, &fs_info->defrag_inodes);
+ defrag = rb_entry(node, struct inode_defrag, rb_node);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+ cond_resched_lock(&fs_info->defrag_inodes_lock);
+
+ node = rb_first(&fs_info->defrag_inodes);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH 1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *inode_root;
+ struct inode *inode;
+ struct btrfs_key key;
+ struct btrfs_ioctl_defrag_range_args range;
+ int num_defrag;
+ int index;
+ int ret;
+
+ /* get the inode */
+ key.objectid = defrag->root;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(inode_root)) {
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+
+ key.objectid = defrag->ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto cleanup;
+ }
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ /* do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ memset(&range, 0, sizeof(range));
+ range.len = (u64)-1;
+ range.start = defrag->last_offset;
+
+ sb_start_write(fs_info->sb);
+ num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ BTRFS_DEFRAG_BATCH);
+ sb_end_write(fs_info->sb);
+ /*
+ * if we filled the whole defrag batch, there
+ * must be more work to do. Queue this defrag
+ * again
+ */
+ if (num_defrag == BTRFS_DEFRAG_BATCH) {
+ defrag->last_offset = range.start;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else if (defrag->last_offset && !defrag->cycled) {
+ /*
+ * we didn't fill our defrag batch, but
+ * we didn't start at zero. Make sure we loop
+ * around to the start of the file.
+ */
+ defrag->last_offset = 0;
+ defrag->cycled = 1;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+
+ iput(inode);
+ return 0;
+cleanup:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ u64 first_ino = 0;
+ u64 root_objectid = 0;
+
+ atomic_inc(&fs_info->defrag_running);
+ while (1) {
+ /* Pause the auto defragger. */
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING,
+ &fs_info->fs_state))
+ break;
+
+ if (!__need_auto_defrag(fs_info->tree_root))
+ break;
+
+ /* find an inode to defrag */
+ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+ first_ino);
+ if (!defrag) {
+ if (root_objectid || first_ino) {
+ root_objectid = 0;
+ first_ino = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ first_ino = defrag->ino + 1;
+ root_objectid = defrag->root;
+
+ __btrfs_run_defrag_inode(fs_info, defrag);
+ }
+ atomic_dec(&fs_info->defrag_running);
+
+ /*
+ * during unmount, we use the transaction_wait queue to
+ * wait for the defragger to stop
+ */
+ wake_up(&fs_info->transaction_wait);
+ return 0;
+}
+
+/* simple helper to fault in pages and copy. This should go away
+ * and be replaced with calls into generic code.
+ */
+static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
+ size_t write_bytes,
+ struct page **prepared_pages,
+ struct iov_iter *i)
+{
+ size_t copied = 0;
+ size_t total_copied = 0;
+ int pg = 0;
+ int offset = pos & (PAGE_CACHE_SIZE - 1);
+
+ while (write_bytes > 0) {
+ size_t count = min_t(size_t,
+ PAGE_CACHE_SIZE - offset, write_bytes);
+ struct page *page = prepared_pages[pg];
+ /*
+ * Copy data from userspace to the current page
+ */
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+
+ /* Flush processor's dcache for this page */
+ flush_dcache_page(page);
+
+ /*
+ * if we get a partial write, we can end up with
+ * partially up to date pages. These add
+ * a lot of complexity, so make sure they don't
+ * happen by forcing this copy to be retried.
+ *
+ * The rest of the btrfs_file_write code will fall
+ * back to page at a time copies after we return 0.
+ */
+ if (!PageUptodate(page) && copied < count)
+ copied = 0;
+
+ iov_iter_advance(i, copied);
+ write_bytes -= copied;
+ total_copied += copied;
+
+ /* Return to btrfs_file_write_iter to fault page */
+ if (unlikely(copied == 0))
+ break;
+
+ if (copied < PAGE_CACHE_SIZE - offset) {
+ offset += copied;
+ } else {
+ pg++;
+ offset = 0;
+ }
+ }
+ return total_copied;
+}
+
+/*
+ * unlocks pages after btrfs_file_write is done with them
+ */
+static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+{
+ size_t i;
+ for (i = 0; i < num_pages; i++) {
+ /* page checked is some magic around finding pages that
+ * have been modified without going through btrfs_set_page_dirty
+ * clear it here. There should be no need to mark the pages
+ * accessed as prepare_pages should have marked them accessed
+ * in prepare_pages via find_or_create_page()
+ */
+ ClearPageChecked(pages[i]);
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+}
+
+/*
+ * after copy_from_user, pages need to be dirtied and we need to make
+ * sure holes are created between the current EOF and the start of
+ * any next extents (if required).
+ *
+ * this also makes the decision about creating an inline extent vs
+ * doing real data extents, marking pages dirty and delalloc as required.
+ */
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+ struct page **pages, size_t num_pages,
+ loff_t pos, size_t write_bytes,
+ struct extent_state **cached)
+{
+ int err = 0;
+ int i;
+ u64 num_bytes;
+ u64 start_pos;
+ u64 end_of_last_block;
+ u64 end_pos = pos + write_bytes;
+ loff_t isize = i_size_read(inode);
+
+ start_pos = pos & ~((u64)root->sectorsize - 1);
+ num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
+
+ end_of_last_block = start_pos + num_bytes - 1;
+ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+ cached);
+ if (err)
+ return err;
+
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = pages[i];
+ SetPageUptodate(p);
+ ClearPageChecked(p);
+ set_page_dirty(p);
+ }
+
+ /*
+ * we've only changed i_size in ram, and we haven't updated
+ * the disk i_size. There is no need to log the inode
+ * at this time.
+ */
+ if (end_pos > isize)
+ i_size_write(inode, end_pos);
+ return 0;
+}
+
+/*
+ * this drops all the extents in the cache that intersect the range
+ * [start, end]. Existing extents are split as required.
+ */
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned)
+{
+ struct extent_map *em;
+ struct extent_map *split = NULL;
+ struct extent_map *split2 = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 len = end - start + 1;
+ u64 gen;
+ int ret;
+ int testend = 1;
+ unsigned long flags;
+ int compressed = 0;
+ bool modified;
+
+ WARN_ON(end < start);
+ if (end == (u64)-1) {
+ len = (u64)-1;
+ testend = 0;
+ }
+ while (1) {
+ int no_splits = 0;
+
+ modified = false;
+ if (!split)
+ split = alloc_extent_map();
+ if (!split2)
+ split2 = alloc_extent_map();
+ if (!split || !split2)
+ no_splits = 1;
+
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ write_unlock(&em_tree->lock);
+ break;
+ }
+ flags = em->flags;
+ gen = em->generation;
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ if (testend && em->start + em->len >= start + len) {
+ free_extent_map(em);
+ write_unlock(&em_tree->lock);
+ break;
+ }
+ start = em->start + em->len;
+ if (testend)
+ len = start + len - (em->start + em->len);
+ free_extent_map(em);
+ write_unlock(&em_tree->lock);
+ continue;
+ }
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
+ if (no_splits)
+ goto next;
+
+ if (em->start < start) {
+ split->start = em->start;
+ split->len = start - em->start;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
+ } else {
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
+ split->ram_bytes = split->len;
+ }
+
+ split->generation = gen;
+ split->bdev = em->bdev;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ replace_extent_mapping(em_tree, em, split, modified);
+ free_extent_map(split);
+ split = split2;
+ split2 = NULL;
+ }
+ if (testend && em->start + em->len > start + len) {
+ u64 diff = start + len - em->start;
+
+ split->start = start + len;
+ split->len = em->start + em->len - (start + len);
+ split->bdev = em->bdev;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ split->generation = gen;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
+
+ split->ram_bytes = em->ram_bytes;
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->block_start = em->block_start;
+ split->orig_start = em->orig_start;
+ } else {
+ split->block_len = split->len;
+ split->block_start = em->block_start
+ + diff;
+ split->orig_start = em->orig_start;
+ }
+ } else {
+ split->ram_bytes = split->len;
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
+ }
+
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ ASSERT(ret == 0); /* Logic error */
+ }
+ free_extent_map(split);
+ split = NULL;
+ }
+next:
+ if (extent_map_in_tree(em))
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ /* once for us */
+ free_extent_map(em);
+ /* once for the tree*/
+ free_extent_map(em);
+ }
+ if (split)
+ free_extent_map(split);
+ if (split2)
+ free_extent_map(split2);
+}
+
+/*
+ * this is very complex, but the basic idea is to drop all extents
+ * in the range start - end. hint_block is filled in with a block number
+ * that would be a good hint to the block allocator for this file.
+ *
+ * If an extent intersects the range but is not entirely inside the range
+ * it is either truncated or split. Anything entirely inside the range
+ * is deleted from the tree.
+ */
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache,
+ int replace_extent,
+ u32 extent_item_size,
+ int *key_inserted)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key new_key;
+ u64 ino = btrfs_ino(inode);
+ u64 search_start = start;
+ u64 disk_bytenr = 0;
+ u64 num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 extent_end = 0;
+ int del_nr = 0;
+ int del_slot = 0;
+ int extent_type;
+ int recow;
+ int ret;
+ int modify_tree = -1;
+ int update_refs;
+ int found = 0;
+ int leafs_visited = 0;
+
+ if (drop_cache)
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+ if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
+ modify_tree = 0;
+
+ update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root);
+ while (1) {
+ recow = 0;
+ ret = btrfs_lookup_file_extent(trans, root, path, ino,
+ search_start, modify_tree);
+ if (ret < 0)
+ break;
+ if (ret > 0 && path->slots[0] > 0 && search_start == start) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+ if (key.objectid == ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+ ret = 0;
+ leafs_visited++;
+next_slot:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ BUG_ON(del_nr > 0);
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ leafs_visited++;
+ leaf = path->nodes[0];
+ recow = 1;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid > ino ||
+ key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = btrfs_file_extent_offset(leaf, fi);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ extent_end = key.offset +
+ btrfs_file_extent_inline_len(leaf,
+ path->slots[0], fi);
+ } else {
+ WARN_ON(1);
+ extent_end = search_start;
+ }
+
+ /*
+ * Don't skip extent items representing 0 byte lengths. They
+ * used to be created (bug) if while punching holes we hit
+ * -ENOSPC condition. So if we find one here, just ensure we
+ * delete it, otherwise we would insert a new file extent item
+ * with the same key (offset) as that 0 bytes length file
+ * extent item in the call to setup_items_for_insert() later
+ * in this function.
+ */
+ if (extent_end == key.offset && extent_end >= search_start)
+ goto delete_extent_item;
+
+ if (extent_end <= search_start) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ found = 1;
+ search_start = max(key.offset, start);
+ if (recow || !modify_tree) {
+ modify_tree = -1;
+ btrfs_release_path(path);
+ continue;
+ }
+
+ /*
+ * | - range to drop - |
+ * | -------- extent -------- |
+ */
+ if (start > key.offset && end < extent_end) {
+ BUG_ON(del_nr > 0);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = start;
+ ret = btrfs_duplicate_item(trans, root, path,
+ &new_key);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(path);
+ continue;
+ }
+ if (ret < 0)
+ break;
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ extent_offset += start - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - start);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (update_refs && disk_bytenr > 0) {
+ ret = btrfs_inc_extent_ref(trans, root,
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ new_key.objectid,
+ start - extent_offset, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+ key.offset = start;
+ }
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (start <= key.offset && end < extent_end) {
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.offset = end;
+ btrfs_set_item_key_safe(root->fs_info, path, &new_key);
+
+ extent_offset += end - key.offset;
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_mark_buffer_dirty(leaf);
+ if (update_refs && disk_bytenr > 0)
+ inode_sub_bytes(inode, end - key.offset);
+ break;
+ }
+
+ search_start = extent_end;
+ /*
+ * | ---- range to drop ----- |
+ * | -------- extent -------- |
+ */
+ if (start > key.offset && end >= extent_end) {
+ BUG_ON(del_nr > 0);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+ if (update_refs && disk_bytenr > 0)
+ inode_sub_bytes(inode, extent_end - start);
+ if (end == extent_end)
+ break;
+
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ /*
+ * | ---- range to drop ----- |
+ * | ------ extent ------ |
+ */
+ if (start <= key.offset && end >= extent_end) {
+delete_extent_item:
+ if (del_nr == 0) {
+ del_slot = path->slots[0];
+ del_nr = 1;
+ } else {
+ BUG_ON(del_slot + del_nr != path->slots[0]);
+ del_nr++;
+ }
+
+ if (update_refs &&
+ extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ inode_sub_bytes(inode,
+ extent_end - key.offset);
+ extent_end = ALIGN(extent_end,
+ root->sectorsize);
+ } else if (update_refs && disk_bytenr > 0) {
+ ret = btrfs_free_extent(trans, root,
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ key.objectid, key.offset -
+ extent_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ inode_sub_bytes(inode,
+ extent_end - key.offset);
+ }
+
+ if (end == extent_end)
+ break;
+
+ if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ ret = btrfs_del_items(trans, root, path, del_slot,
+ del_nr);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ break;
+ }
+
+ del_nr = 0;
+ del_slot = 0;
+
+ btrfs_release_path(path);
+ continue;
+ }
+
+ BUG_ON(1);
+ }
+
+ if (!ret && del_nr > 0) {
+ /*
+ * Set path->slots[0] to first slot, so that after the delete
+ * if items are move off from our leaf to its immediate left or
+ * right neighbor leafs, we end up with a correct and adjusted
+ * path->slots[0] for our insertion (if replace_extent != 0).
+ */
+ path->slots[0] = del_slot;
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ }
+
+ leaf = path->nodes[0];
+ /*
+ * If btrfs_del_items() was called, it might have deleted a leaf, in
+ * which case it unlocked our path, so check path->locks[0] matches a
+ * write lock.
+ */
+ if (!ret && replace_extent && leafs_visited == 1 &&
+ (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+ path->locks[0] == BTRFS_WRITE_LOCK) &&
+ btrfs_leaf_free_space(root, leaf) >=
+ sizeof(struct btrfs_item) + extent_item_size) {
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+ if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+ struct btrfs_key slot_key;
+
+ btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+ if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+ path->slots[0]++;
+ }
+ setup_items_for_insert(root, path, &key,
+ &extent_item_size,
+ extent_item_size,
+ sizeof(struct btrfs_item) +
+ extent_item_size, 1);
+ *key_inserted = 1;
+ }
+
+ if (!replace_extent || !(*key_inserted))
+ btrfs_release_path(path);
+ if (drop_end)
+ *drop_end = found ? min(end, extent_end) : end;
+ return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
+ drop_cache, 0, 0, NULL);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+ u64 objectid, u64 bytenr, u64 orig_offset,
+ u64 *start, u64 *end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 extent_end;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+ btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+ btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
+ btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ return 0;
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if ((*start && *start != key.offset) || (*end && *end != extent_end))
+ return 0;
+
+ *start = key.offset;
+ *end = extent_end;
+ return 1;
+}
+
+/*
+ * Mark extent in the range start - end as written.
+ *
+ * This changes extent type from 'pre-allocated' to 'regular'. If only
+ * part of extent is marked as written, the extent will be split into
+ * two or three.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key new_key;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 extent_end;
+ u64 orig_offset;
+ u64 other_start;
+ u64 other_end;
+ u64 split;
+ int del_nr = 0;
+ int del_slot = 0;
+ int recow;
+ int ret;
+ u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
+ recow = 0;
+ split = start;
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = split;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0 && path->slots[0] > 0)
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ BUG_ON(btrfs_file_extent_type(leaf, fi) !=
+ BTRFS_FILE_EXTENT_PREALLOC);
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ BUG_ON(key.offset > start || extent_end < end);
+
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+ memcpy(&new_key, &key, sizeof(new_key));
+
+ if (start == key.offset && end < extent_end) {
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ new_key.offset = end;
+ btrfs_set_item_key_safe(root->fs_info, path, &new_key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - end);
+ btrfs_set_file_extent_offset(leaf, fi,
+ end - orig_offset);
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ end - other_start);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ }
+
+ if (start > key.offset && end == extent_end) {
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ start - key.offset);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ path->slots[0]++;
+ new_key.offset = start;
+ btrfs_set_item_key_safe(root->fs_info, path, &new_key);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ other_end - start);
+ btrfs_set_file_extent_offset(leaf, fi,
+ start - orig_offset);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ }
+
+ while (start > key.offset || end < extent_end) {
+ if (key.offset == start)
+ split = end;
+
+ new_key.offset = split;
+ ret = btrfs_duplicate_item(trans, root, path, &new_key);
+ if (ret == -EAGAIN) {
+ btrfs_release_path(path);
+ goto again;
+ }
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ split - key.offset);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - split);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+ root->root_key.objectid,
+ ino, orig_offset, 1);
+ BUG_ON(ret); /* -ENOMEM */
+
+ if (split == start) {
+ key.offset = start;
+ } else {
+ BUG_ON(start != key.offset);
+ path->slots[0]--;
+ extent_end = end;
+ }
+ recow = 1;
+ }
+
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1,
+ ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(path);
+ goto again;
+ }
+ extent_end = other_end;
+ del_slot = path->slots[0] + 1;
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ 0, root->root_key.objectid,
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1,
+ ino, bytenr, orig_offset,
+ &other_start, &other_end)) {
+ if (recow) {
+ btrfs_release_path(path);
+ goto again;
+ }
+ key.offset = other_start;
+ del_slot = path->slots[0];
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ 0, root->root_key.objectid,
+ ino, orig_offset, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+ if (del_nr == 0) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ fi = btrfs_item_ptr(leaf, del_slot - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ }
+out:
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * on error we return an unlocked page and the error value
+ * on success we return a locked page and 0
+ */
+static int prepare_uptodate_page(struct page *page, u64 pos,
+ bool force_uptodate)
+{
+ int ret = 0;
+
+ if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
+ !PageUptodate(page)) {
+ ret = btrfs_readpage(NULL, page);
+ if (ret)
+ return ret;
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+/*
+ * this just gets pages into the page cache and locks them down.
+ */
+static noinline int prepare_pages(struct inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos,
+ size_t write_bytes, bool force_uptodate)
+{
+ int i;
+ unsigned long index = pos >> PAGE_CACHE_SHIFT;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ int err = 0;
+ int faili;
+
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = find_or_create_page(inode->i_mapping, index + i,
+ mask | __GFP_WRITE);
+ if (!pages[i]) {
+ faili = i - 1;
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ if (i == 0)
+ err = prepare_uptodate_page(pages[i], pos,
+ force_uptodate);
+ if (i == num_pages - 1)
+ err = prepare_uptodate_page(pages[i],
+ pos + write_bytes, false);
+ if (err) {
+ page_cache_release(pages[i]);
+ faili = i - 1;
+ goto fail;
+ }
+ wait_on_page_writeback(pages[i]);
+ }
+
+ return 0;
+fail:
+ while (faili >= 0) {
+ unlock_page(pages[faili]);
+ page_cache_release(pages[faili]);
+ faili--;
+ }
+ return err;
+
+}
+
+/*
+ * This function locks the extent and properly waits for data=ordered extents
+ * to finish before allowing the pages to be modified if need.
+ *
+ * The return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - need re-prepare the pages
+ * the other < 0 number - Something wrong happens
+ */
+static noinline int
+lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos,
+ u64 *lockstart, u64 *lockend,
+ struct extent_state **cached_state)
+{
+ u64 start_pos;
+ u64 last_pos;
+ int i;
+ int ret = 0;
+
+ start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+ last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
+
+ if (start_pos < inode->i_size) {
+ struct btrfs_ordered_extent *ordered;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos, 0, cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start_pos,
+ last_pos - start_pos + 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > start_pos &&
+ ordered->file_offset <= last_pos) {
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos,
+ cached_state, GFP_NOFS);
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ return -EAGAIN;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
+ last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, cached_state, GFP_NOFS);
+ *lockstart = start_pos;
+ *lockend = last_pos;
+ ret = 1;
+ }
+
+ for (i = 0; i < num_pages; i++) {
+ if (clear_page_dirty_for_io(pages[i]))
+ account_page_redirty(pages[i]);
+ set_page_extent_mapped(pages[i]);
+ WARN_ON(!PageLocked(pages[i]));
+ }
+
+ return ret;
+}
+
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered;
+ u64 lockstart, lockend;
+ u64 num_bytes;
+ int ret;
+
+ ret = btrfs_start_write_no_snapshoting(root);
+ if (!ret)
+ return -ENOSPC;
+
+ lockstart = round_down(pos, root->sectorsize);
+ lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
+
+ while (1) {
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (!ordered) {
+ break;
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ num_bytes = lockend - lockstart + 1;
+ ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
+ if (ret <= 0) {
+ ret = 0;
+ btrfs_end_write_no_snapshoting(root);
+ } else {
+ *write_bytes = min_t(size_t, *write_bytes ,
+ num_bytes - pos + lockstart);
+ }
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+
+ return ret;
+}
+
+static noinline ssize_t __btrfs_buffered_write(struct file *file,
+ struct iov_iter *i,
+ loff_t pos)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page **pages = NULL;
+ struct extent_state *cached_state = NULL;
+ u64 release_bytes = 0;
+ u64 lockstart;
+ u64 lockend;
+ unsigned long first_index;
+ size_t num_written = 0;
+ int nrptrs;
+ int ret = 0;
+ bool only_release_metadata = false;
+ bool force_page_uptodate = false;
+ bool need_unlock;
+
+ nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
+ PAGE_CACHE_SIZE / (sizeof(struct page *)));
+ nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+ nrptrs = max(nrptrs, 8);
+ pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ first_index = pos >> PAGE_CACHE_SHIFT;
+
+ while (iov_iter_count(i) > 0) {
+ size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ size_t write_bytes = min(iov_iter_count(i),
+ nrptrs * (size_t)PAGE_CACHE_SIZE -
+ offset);
+ size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_CACHE_SIZE);
+ size_t reserve_bytes;
+ size_t dirty_pages;
+ size_t copied;
+
+ WARN_ON(num_pages > nrptrs);
+
+ /*
+ * Fault pages before locking them in prepare_pages
+ * to avoid recursive lock
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
+ if (ret == -ENOSPC &&
+ (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC))) {
+ ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret > 0) {
+ only_release_metadata = true;
+ /*
+ * our prealloc extent may be smaller than
+ * write_bytes, so scale down.
+ */
+ num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_CACHE_SIZE);
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = 0;
+ } else {
+ ret = -ENOSPC;
+ }
+ }
+
+ if (ret)
+ break;
+
+ ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+ if (ret) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode,
+ reserve_bytes);
+ else
+ btrfs_end_write_no_snapshoting(root);
+ break;
+ }
+
+ release_bytes = reserve_bytes;
+ need_unlock = false;
+again:
+ /*
+ * This is going to setup the pages array with the number of
+ * pages we want, so we don't really need to worry about the
+ * contents of pages from loop to loop
+ */
+ ret = prepare_pages(inode, pages, num_pages,
+ pos, write_bytes,
+ force_page_uptodate);
+ if (ret)
+ break;
+
+ ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
+ pos, &lockstart, &lockend,
+ &cached_state);
+ if (ret < 0) {
+ if (ret == -EAGAIN)
+ goto again;
+ break;
+ } else if (ret > 0) {
+ need_unlock = true;
+ ret = 0;
+ }
+
+ copied = btrfs_copy_from_user(pos, num_pages,
+ write_bytes, pages, i);
+
+ /*
+ * if we have trouble faulting in the pages, fall
+ * back to one page at a time
+ */
+ if (copied < write_bytes)
+ nrptrs = 1;
+
+ if (copied == 0) {
+ force_page_uptodate = true;
+ dirty_pages = 0;
+ } else {
+ force_page_uptodate = false;
+ dirty_pages = DIV_ROUND_UP(copied + offset,
+ PAGE_CACHE_SIZE);
+ }
+
+ /*
+ * If we had a short copy we need to release the excess delaloc
+ * bytes we reserved. We need to increment outstanding_extents
+ * because btrfs_delalloc_release_space will decrement it, but
+ * we still have an outstanding extent for the chunk we actually
+ * managed to copy.
+ */
+ if (num_pages > dirty_pages) {
+ release_bytes = (num_pages - dirty_pages) <<
+ PAGE_CACHE_SHIFT;
+ if (copied > 0) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode,
+ release_bytes);
+ else
+ btrfs_delalloc_release_space(inode,
+ release_bytes);
+ }
+
+ release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
+
+ if (copied > 0)
+ ret = btrfs_dirty_pages(root, inode, pages,
+ dirty_pages, pos, copied,
+ NULL);
+ if (need_unlock)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state,
+ GFP_NOFS);
+ if (ret) {
+ btrfs_drop_pages(pages, num_pages);
+ break;
+ }
+
+ release_bytes = 0;
+ if (only_release_metadata)
+ btrfs_end_write_no_snapshoting(root);
+
+ if (only_release_metadata && copied > 0) {
+ lockstart = round_down(pos, root->sectorsize);
+ lockend = lockstart +
+ (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+ set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_NORESERVE, NULL,
+ NULL, GFP_NOFS);
+ only_release_metadata = false;
+ }
+
+ btrfs_drop_pages(pages, num_pages);
+
+ cond_resched();
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
+ btrfs_btree_balance_dirty(root);
+
+ pos += copied;
+ num_written += copied;
+ }
+
+ kfree(pages);
+
+ if (release_bytes) {
+ if (only_release_metadata) {
+ btrfs_end_write_no_snapshoting(root);
+ btrfs_delalloc_release_metadata(inode, release_bytes);
+ } else {
+ btrfs_delalloc_release_space(inode, release_bytes);
+ }
+ }
+
+ return num_written ? num_written : ret;
+}
+
+static ssize_t __btrfs_direct_write(struct kiocb *iocb,
+ struct iov_iter *from,
+ loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ ssize_t written;
+ ssize_t written_buffered;
+ loff_t endbyte;
+ int err;
+
+ written = generic_file_direct_write(iocb, from, pos);
+
+ if (written < 0 || !iov_iter_count(from))
+ return written;
+
+ pos += written;
+ written_buffered = __btrfs_buffered_write(file, from, pos);
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
+ }
+ /*
+ * Ensure all data is persisted. We want the next direct IO read to be
+ * able to read what was just written.
+ */
+ endbyte = pos + written_buffered - 1;
+ err = btrfs_fdatawrite_range(inode, pos, endbyte);
+ if (err)
+ goto out;
+ err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+ if (err)
+ goto out;
+ written += written_buffered;
+ iocb->ki_pos = pos + written_buffered;
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+out:
+ return written ? written : err;
+}
+
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_fs_time(inode->i_sb);
+ if (!timespec_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ if (!timespec_equal(&inode->i_ctime, &now))
+ inode->i_ctime = now;
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 start_pos;
+ u64 end_pos;
+ ssize_t num_written = 0;
+ bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+ ssize_t err;
+ loff_t pos;
+ size_t count;
+
+ mutex_lock(&inode->i_mutex);
+ err = generic_write_checks(iocb, from);
+ if (err <= 0) {
+ mutex_unlock(&inode->i_mutex);
+ return err;
+ }
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ err = file_remove_suid(file);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+
+ /*
+ * If BTRFS flips readonly due to some impossible error
+ * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+ * although we have opened a file as writable, we have
+ * to stop this write operation to ensure FS consistency.
+ */
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ mutex_unlock(&inode->i_mutex);
+ err = -EROFS;
+ goto out;
+ }
+
+ /*
+ * We reserve space for updating the inode when we reserve space for the
+ * extent we are going to write, so we will enospc out there. We don't
+ * need to start yet another transaction to update the inode as we will
+ * update the inode when we finish writing whatever data we write.
+ */
+ update_time_for_write(inode);
+
+ pos = iocb->ki_pos;
+ count = iov_iter_count(from);
+ start_pos = round_down(pos, root->sectorsize);
+ if (start_pos > i_size_read(inode)) {
+ /* Expand hole size to cover write data, preventing empty gap */
+ end_pos = round_up(pos + count, root->sectorsize);
+ err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ }
+
+ if (sync)
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ num_written = __btrfs_direct_write(iocb, from, pos);
+ } else {
+ num_written = __btrfs_buffered_write(file, from, pos);
+ if (num_written > 0)
+ iocb->ki_pos = pos + num_written;
+ }
+
+ mutex_unlock(&inode->i_mutex);
+
+ /*
+ * We also have to set last_sub_trans to the current log transid,
+ * otherwise subsequent syncs to a file that's been synced in this
+ * transaction will appear to have already occured.
+ */
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->last_sub_trans = root->log_transid;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (num_written > 0) {
+ err = generic_write_sync(file, pos, num_written);
+ if (err < 0)
+ num_written = err;
+ }
+
+ if (sync)
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+out:
+ current->backing_dev_info = NULL;
+ return num_written ? num_written : err;
+}
+
+int btrfs_release_file(struct inode *inode, struct file *filp)
+{
+ if (filp->private_data)
+ btrfs_ioctl_trans_end(filp);
+ /*
+ * ordered_data_close is set by settattr when we are about to truncate
+ * a file from a non-zero size to a zero size. This tries to
+ * flush down new bytes that may have been written if the
+ * application were using truncate to replace a file in place.
+ */
+ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ return 0;
+}
+
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+ ret = btrfs_fdatawrite_range(inode, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+ return ret;
+}
+
+/*
+ * fsync call for both files and directories. This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It needs to call filemap_fdatawait so that all ordered extent updates are
+ * in the metadata btree are up to date for copying to the log.
+ *
+ * It drops the inode mutex before doing the tree log commit. This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk.
+ */
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_log_ctx ctx;
+ int ret = 0;
+ bool full_sync = 0;
+
+ trace_btrfs_sync_file(file, datasync);
+
+ /*
+ * We write the dirty pages in the range and wait until they complete
+ * out of the ->i_mutex. If so, we can flush the dirty pages by
+ * multi-task, and make the performance up. See
+ * btrfs_wait_ordered_range for an explanation of the ASYNC check.
+ */
+ ret = start_ordered_ops(inode, start, end);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+ atomic_inc(&root->log_batch);
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ /*
+ * We might have have had more pages made dirty after calling
+ * start_ordered_ops and before acquiring the inode's i_mutex.
+ */
+ if (full_sync) {
+ /*
+ * For a full sync, we need to make sure any ordered operations
+ * start and finish before we start logging the inode, so that
+ * all extents are persisted and the respective file extent
+ * items are in the fs/subvol btree.
+ */
+ ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+ } else {
+ /*
+ * Start any new ordered operations before starting to log the
+ * inode. We will wait for them to finish in btrfs_sync_log().
+ *
+ * Right before acquiring the inode's mutex, we might have new
+ * writes dirtying pages, which won't immediately start the
+ * respective ordered operations - that is done through the
+ * fill_delalloc callbacks invoked from the writepage and
+ * writepages address space operations. So make sure we start
+ * all ordered operations before starting to log our inode. Not
+ * doing this means that while logging the inode, writeback
+ * could start and invoke writepage/writepages, which would call
+ * the fill_delalloc callbacks (cow_file_range,
+ * submit_compressed_extents). These callbacks add first an
+ * extent map to the modified list of extents and then create
+ * the respective ordered operation, which means in
+ * tree-log.c:btrfs_log_inode() we might capture all existing
+ * ordered operations (with btrfs_get_logged_extents()) before
+ * the fill_delalloc callback adds its ordered operation, and by
+ * the time we visit the modified list of extent maps (with
+ * btrfs_log_changed_extents()), we see and process the extent
+ * map they created. We then use the extent map to construct a
+ * file extent item for logging without waiting for the
+ * respective ordered operation to finish - this file extent
+ * item points to a disk location that might not have yet been
+ * written to, containing random data - so after a crash a log
+ * replay will make our inode have file extent items that point
+ * to disk locations containing invalid data, as we returned
+ * success to userspace without waiting for the respective
+ * ordered operation to finish, because it wasn't captured by
+ * btrfs_get_logged_extents().
+ */
+ ret = start_ordered_ops(inode, start, end);
+ }
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ atomic_inc(&root->log_batch);
+
+ /*
+ * If the last transaction that changed this file was before the current
+ * transaction and we have the full sync flag set in our inode, we can
+ * bail out now without any syncing.
+ *
+ * Note that we can't bail out if the full sync flag isn't set. This is
+ * because when the full sync flag is set we start all ordered extents
+ * and wait for them to fully complete - when they complete they update
+ * the inode's last_trans field through:
+ *
+ * btrfs_finish_ordered_io() ->
+ * btrfs_update_inode_fallback() ->
+ * btrfs_update_inode() ->
+ * btrfs_set_inode_last_trans()
+ *
+ * So we are sure that last_trans is up to date and can do this check to
+ * bail out safely. For the fast path, when the full sync flag is not
+ * set in our inode, we can not do it because we start only our ordered
+ * extents and don't wait for them to complete (that is when
+ * btrfs_finish_ordered_io runs), so here at this point their last_trans
+ * value might be less than or equals to fs_info->last_trans_committed,
+ * and setting a speculative last_trans for an inode when a buffered
+ * write is made (such as fs_info->generation + 1 for example) would not
+ * be reliable since after setting the value and before fsync is called
+ * any number of transactions can start and commit (transaction kthread
+ * commits the current transaction periodically), and a transaction
+ * commit does not start nor waits for ordered extents to complete.
+ */
+ smp_mb();
+ if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+ (full_sync && BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed)) {
+ /*
+ * We'v had everything committed since the last time we were
+ * modified so clear this flag in case it was set for whatever
+ * reason, it's no longer relevant.
+ */
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+
+ /*
+ * ok we haven't committed the transaction yet, lets do a commit
+ */
+ if (file->private_data)
+ btrfs_ioctl_trans_end(file);
+
+ /*
+ * We use start here because we will need to wait on the IO to complete
+ * in btrfs_sync_log, which could require joining a transaction (for
+ * example checking cross references in the nocow path). If we use join
+ * here we could get into a situation where we're waiting on IO to
+ * happen that is blocked on a transaction trying to commit. With start
+ * we inc the extwriter counter, so we wait for all extwriters to exit
+ * before we start blocking join'ers. This comment is to keep somebody
+ * from thinking they are super smart and changing this to
+ * btrfs_join_transaction *cough*Josef*cough*.
+ */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ trans->sync = true;
+
+ btrfs_init_log_ctx(&ctx);
+
+ ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
+ if (ret < 0) {
+ /* Fallthrough and commit/free transaction. */
+ ret = 1;
+ }
+
+ /* we've logged all the items and now have a consistent
+ * version of the file in the log. It is possible that
+ * someone will come in and modify the file, but that's
+ * fine because the log is consistent on disk, and we
+ * have references to all of the file's extents
+ *
+ * It is possible that someone will come in and log the
+ * file again, but that will end up using the synchronization
+ * inside btrfs_sync_log to keep things safe.
+ */
+ mutex_unlock(&inode->i_mutex);
+
+ /*
+ * If any of the ordered extents had an error, just return it to user
+ * space, so that the application knows some writes didn't succeed and
+ * can take proper action (retry for e.g.). Blindly committing the
+ * transaction in this case, would fool userspace that everything was
+ * successful. And we also want to make sure our log doesn't contain
+ * file extent items pointing to extents that weren't fully written to -
+ * just like in the non fast fsync path, where we check for the ordered
+ * operation's error flag before writing to the log tree and return -EIO
+ * if any of them had this flag set (btrfs_wait_ordered_range) -
+ * therefore we need to check for errors in the ordered operations,
+ * which are indicated by ctx.io_err.
+ */
+ if (ctx.io_err) {
+ btrfs_end_transaction(trans, root);
+ ret = ctx.io_err;
+ goto out;
+ }
+
+ if (ret != BTRFS_NO_LOG_SYNC) {
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
+ if (!ret) {
+ ret = btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ }
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start,
+ end - start + 1);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_end_transaction(trans, root);
+ }
+out:
+ return ret > 0 ? -EIO : ret;
+}
+
+static const struct vm_operations_struct btrfs_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = btrfs_page_mkwrite,
+};
+
+static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = filp->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+
+ file_accessed(filp);
+ vma->vm_ops = &btrfs_file_vm_ops;
+
+ return 0;
+}
+
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+ int slot, u64 start, u64 end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
+ if (btrfs_file_extent_disk_bytenr(leaf, fi))
+ return 0;
+
+ if (key.offset == end)
+ return 1;
+ if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+ return 1;
+ return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct btrfs_path *path, u64 offset, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct extent_map *hole_em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct btrfs_key key;
+ int ret;
+
+ if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ goto out;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ return ret;
+ BUG_ON(!ret);
+
+ leaf = path->nodes[0];
+ if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]--;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+ end - offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+
+ if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
+ u64 num_bytes;
+
+ key.offset = offset;
+ btrfs_set_item_key_safe(root->fs_info, path, &key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+ offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+ 0, 0, end - offset, 0, end - offset,
+ 0, 0, 0);
+ if (ret)
+ return ret;
+
+out:
+ btrfs_release_path(path);
+
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ } else {
+ hole_em->start = offset;
+ hole_em->len = end - offset;
+ hole_em->ram_bytes = hole_em->len;
+ hole_em->orig_start = offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
+ hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = trans->transid;
+
+ do {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, hole_em, 1);
+ write_unlock(&em_tree->lock);
+ } while (ret == -EEXIST);
+ free_extent_map(hole_em);
+ if (ret)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+
+ return 0;
+}
+
+/*
+ * Find a hole extent on given inode and change start/len to the end of hole
+ * extent.(hole/vacuum extent whose em->start <= start &&
+ * em->start + em->len > start)
+ * When a hole extent is found, return 1 and modify start/len.
+ */
+static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ return ret;
+ }
+
+ /* Hole or vacuum extent(only exists in no-hole mode) */
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ ret = 1;
+ *len = em->start + em->len > *start + *len ?
+ 0 : *start + *len - em->start - em->len;
+ *start = em->start + em->len;
+ }
+ free_extent_map(em);
+ return ret;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *rsv;
+ struct btrfs_trans_handle *trans;
+ u64 lockstart;
+ u64 lockend;
+ u64 tail_start;
+ u64 tail_len;
+ u64 orig_start = offset;
+ u64 cur_offset;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ u64 drop_end;
+ int ret = 0;
+ int err = 0;
+ int rsv_count;
+ bool same_page;
+ bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
+ u64 ino_size;
+ bool truncated_page = false;
+ bool updated_inode = false;
+
+ ret = btrfs_wait_ordered_range(inode, offset, len);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+ ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ /* Already in a large hole */
+ ret = 0;
+ goto out_only_mutex;
+ }
+
+ lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+ lockend = round_down(offset + len,
+ BTRFS_I(inode)->root->sectorsize) - 1;
+ same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
+ /*
+ * We needn't truncate any page which is beyond the end of the file
+ * because we are sure there is no data there.
+ */
+ /*
+ * Only do this if we are in the same page and we aren't doing the
+ * entire page.
+ */
+ if (same_page && len < PAGE_CACHE_SIZE) {
+ if (offset < ino_size) {
+ truncated_page = true;
+ ret = btrfs_truncate_page(inode, offset, len, 0);
+ } else {
+ ret = 0;
+ }
+ goto out_only_mutex;
+ }
+
+ /* zero back part of the first page */
+ if (offset < ino_size) {
+ truncated_page = true;
+ ret = btrfs_truncate_page(inode, offset, 0, 0);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ /* Check the aligned pages after the first unaligned page,
+ * if offset != orig_start, which means the first unaligned page
+ * including serveral following pages are already in holes,
+ * the extra check can be skipped */
+ if (offset == orig_start) {
+ /* after truncate page, check hole again */
+ len = offset + len - lockstart;
+ offset = lockstart;
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ ret = 0;
+ goto out_only_mutex;
+ }
+ lockstart = offset;
+ }
+
+ /* Check the tail unaligned part is in a hole */
+ tail_start = lockend + 1;
+ tail_len = offset + len - tail_start;
+ if (tail_len) {
+ ret = find_first_non_hole(inode, &tail_start, &tail_len);
+ if (unlikely(ret < 0))
+ goto out_only_mutex;
+ if (!ret) {
+ /* zero the front end of the last page */
+ if (tail_start + tail_len < ino_size) {
+ truncated_page = true;
+ ret = btrfs_truncate_page(inode,
+ tail_start + tail_len, 0, 1);
+ if (ret)
+ goto out_only_mutex;
+ }
+ }
+ }
+
+ if (lockend < lockstart) {
+ ret = 0;
+ goto out_only_mutex;
+ }
+
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ truncate_pagecache_range(inode, lockstart, lockend);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, &cached_state);
+ ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+ /*
+ * We need to make sure we have no ordered extents in this range
+ * and nobody raced in and read a page in this range, if we did
+ * we need to try again.
+ */
+ if ((!ordered ||
+ (ordered->file_offset + ordered->len <= lockstart ||
+ ordered->file_offset > lockend)) &&
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state, GFP_NOFS);
+ ret = btrfs_wait_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+ rsv->failfast = 1;
+
+ /*
+ * 1 - update the inode
+ * 1 - removing the extents in the range
+ * 1 - adding the hole extent if no_holes isn't set
+ */
+ rsv_count = no_holes ? 2 : 3;
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ min_size);
+ BUG_ON(ret);
+ trans->block_rsv = rsv;
+
+ cur_offset = lockstart;
+ len = lockend - cur_offset;
+ while (cur_offset < lockend) {
+ ret = __btrfs_drop_extents(trans, root, inode, path,
+ cur_offset, lockend + 1,
+ &drop_end, 1, 0, 0, NULL);
+ if (ret != -ENOSPC)
+ break;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+ if (cur_offset < ino_size) {
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_end);
+ if (ret) {
+ err = ret;
+ break;
+ }
+ }
+
+ cur_offset = drop_end;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ rsv, min_size);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
+
+ ret = find_first_non_hole(inode, &cur_offset, &len);
+ if (unlikely(ret < 0))
+ break;
+ if (ret && !len) {
+ ret = 0;
+ break;
+ }
+ }
+
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ /*
+ * Don't insert file hole extent item if it's for a range beyond eof
+ * (because it's useless) or if it represents a 0 bytes range (when
+ * cur_offset == drop_end).
+ */
+ if (cur_offset < ino_size && cur_offset < drop_end) {
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+ }
+
+out_trans:
+ if (!trans)
+ goto out_free;
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ updated_inode = true;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+out_free:
+ btrfs_free_path(path);
+ btrfs_free_block_rsv(root, rsv);
+out:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+out_only_mutex:
+ if (!updated_inode && truncated_page && !ret && !err) {
+ /*
+ * If we only end up zeroing part of a page, we still need to
+ * update the inode item, so that all the time fields are
+ * updated as well as the necessary btrfs inode in memory fields
+ * for detecting, at fsync time, if the inode isn't yet in the
+ * log tree or it's there but not up to date.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ } else {
+ err = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_end_transaction(trans, root);
+ }
+ }
+ mutex_unlock(&inode->i_mutex);
+ if (ret && !err)
+ err = ret;
+ return err;
+}
+
+static long btrfs_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct extent_state *cached_state = NULL;
+ u64 cur_offset;
+ u64 last_byte;
+ u64 alloc_start;
+ u64 alloc_end;
+ u64 alloc_hint = 0;
+ u64 locked_end;
+ struct extent_map *em;
+ int blocksize = BTRFS_I(inode)->root->sectorsize;
+ int ret;
+
+ alloc_start = round_down(offset, blocksize);
+ alloc_end = round_up(offset + len, blocksize);
+
+ /* Make sure we aren't being give some crap mode */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return btrfs_punch_hole(inode, offset, len);
+
+ /*
+ * Make sure we have enough space before we do the
+ * allocation.
+ */
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, alloc_end);
+ if (ret)
+ goto out;
+
+ if (alloc_start > inode->i_size) {
+ ret = btrfs_cont_expand(inode, i_size_read(inode),
+ alloc_start);
+ if (ret)
+ goto out;
+ } else {
+ /*
+ * If we are fallocating from the end of the file onward we
+ * need to zero out the end of the page if i_size lands in the
+ * middle of a page.
+ */
+ ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ ret = btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
+
+ locked_end = alloc_end - 1;
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ /* the extent lock is ordered inside the running
+ * transaction
+ */
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+ locked_end, 0, &cached_state);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ alloc_end - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset < alloc_end) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+ /*
+ * we can't wait on the range with the transaction
+ * running or with the extent lock held
+ */
+ ret = btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
+ } else {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ }
+
+ cur_offset = alloc_start;
+ while (1) {
+ u64 actual_end;
+
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ alloc_end - cur_offset, 0);
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ break;
+ }
+ last_byte = min(extent_map_end(em), alloc_end);
+ actual_end = min_t(u64, extent_map_end(em), offset + len);
+ last_byte = ALIGN(last_byte, blocksize);
+
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ (cur_offset >= inode->i_size &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+ last_byte - cur_offset,
+ 1 << inode->i_blkbits,
+ offset + len,
+ &alloc_hint);
+ } else if (actual_end > inode->i_size &&
+ !(mode & FALLOC_FL_KEEP_SIZE)) {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We didn't need to allocate any more space, but we
+ * still extended the size of the file so we need to
+ * update i_size and the inode item.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end,
+ NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_end_transaction(trans,
+ root);
+ }
+ }
+ free_extent_map(em);
+ if (ret < 0)
+ break;
+
+ cur_offset = last_byte;
+ if (cur_offset >= alloc_end) {
+ ret = 0;
+ break;
+ }
+ }
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+out:
+ mutex_unlock(&inode->i_mutex);
+ /* Let go of our reservation. */
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+ return ret;
+}
+
+static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map *em = NULL;
+ struct extent_state *cached_state = NULL;
+ u64 lockstart;
+ u64 lockend;
+ u64 start;
+ u64 len;
+ int ret = 0;
+
+ if (inode->i_size == 0)
+ return -ENXIO;
+
+ /*
+ * *offset can be negative, in this case we start finding DATA/HOLE from
+ * the very start of the file.
+ */
+ start = max_t(loff_t, 0, *offset);
+
+ lockstart = round_down(start, root->sectorsize);
+ lockend = round_up(i_size_read(inode), root->sectorsize);
+ if (lockend <= lockstart)
+ lockend = lockstart + root->sectorsize;
+ lockend--;
+ len = lockend - lockstart + 1;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+ &cached_state);
+
+ while (start < inode->i_size) {
+ em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ em = NULL;
+ break;
+ }
+
+ if (whence == SEEK_HOLE &&
+ (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ break;
+ else if (whence == SEEK_DATA &&
+ (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ break;
+
+ start = em->start + em->len;
+ free_extent_map(em);
+ em = NULL;
+ cond_resched();
+ }
+ free_extent_map(em);
+ if (!ret) {
+ if (whence == SEEK_DATA && start >= inode->i_size)
+ ret = -ENXIO;
+ else
+ *offset = min_t(loff_t, start, inode->i_size);
+ }
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ return ret;
+}
+
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+
+ mutex_lock(&inode->i_mutex);
+ switch (whence) {
+ case SEEK_END:
+ case SEEK_CUR:
+ offset = generic_file_llseek(file, offset, whence);
+ goto out;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ if (offset >= i_size_read(inode)) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
+ }
+
+ ret = find_desired_extent(inode, &offset, whence);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
+const struct file_operations btrfs_file_operations = {
+ .llseek = btrfs_file_llseek,
+ .read_iter = generic_file_read_iter,
+ .splice_read = generic_file_splice_read,
+ .write_iter = btrfs_file_write_iter,
+ .mmap = btrfs_file_mmap,
+ .open = generic_file_open,
+ .release = btrfs_release_file,
+ .fsync = btrfs_sync_file,
+ .fallocate = btrfs_fallocate,
+ .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = btrfs_ioctl,
+#endif
+};
+
+void btrfs_auto_defrag_exit(void)
+{
+ if (btrfs_inode_defrag_cachep)
+ kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+ btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+ sizeof(struct inode_defrag), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_inode_defrag_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * So with compression we will find and lock a dirty page and clear the
+ * first one as dirty, setup an async extent, and immediately return
+ * with the entire range locked but with nobody actually marked with
+ * writeback. So we can't just filemap_write_and_wait_range() and
+ * expect it to work since it will just kick off a thread to do the
+ * actual work. So we need to call filemap_fdatawrite_range _again_
+ * since it will wait on the page lock, which won't be unlocked until
+ * after the pages have been marked as writeback and so we're good to go
+ * from there. We have to do this otherwise we'll miss the ordered
+ * extents and that results in badness. Please Josef, do not think you
+ * know better and pull this out at some point in the future, it is
+ * right and you are wrong.
+ */
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+
+ return ret;
+}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000..9dbe5b548
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,3653 @@
+/*
+ * Copyright (C) 2008 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/math64.h>
+#include <linux/ratelimit.h>
+#include "ctree.h"
+#include "free-space-cache.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "extent_io.h"
+#include "inode-map.h"
+#include "volumes.h"
+
+#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+
+struct btrfs_trim_range {
+ u64 start;
+ u64 bytes;
+ struct list_head list;
+};
+
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
+
+static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 offset)
+{
+ struct btrfs_key key;
+ struct btrfs_key location;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ struct inode *inode = NULL;
+ int ret;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0) {
+ btrfs_release_path(path);
+ return ERR_PTR(-ENOENT);
+ }
+
+ leaf = path->nodes[0];
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ btrfs_free_space_key(leaf, header, &disk_key);
+ btrfs_disk_key_to_cpu(&location, &disk_key);
+ btrfs_release_path(path);
+
+ inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+ if (IS_ERR(inode))
+ return inode;
+ if (is_bad_inode(inode)) {
+ iput(inode);
+ return ERR_PTR(-ENOENT);
+ }
+
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_mask(inode->i_mapping) &
+ ~(__GFP_FS | __GFP_HIGHMEM));
+
+ return inode;
+}
+
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_block_group_cache
+ *block_group, struct btrfs_path *path)
+{
+ struct inode *inode = NULL;
+ u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
+ spin_lock(&block_group->lock);
+ if (block_group->inode)
+ inode = igrab(block_group->inode);
+ spin_unlock(&block_group->lock);
+ if (inode)
+ return inode;
+
+ inode = __lookup_free_space_inode(root, path,
+ block_group->key.objectid);
+ if (IS_ERR(inode))
+ return inode;
+
+ spin_lock(&block_group->lock);
+ if (!((BTRFS_I(inode)->flags & flags) == flags)) {
+ btrfs_info(root->fs_info,
+ "Old style space inode found, converting.");
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
+ BTRFS_INODE_NODATACOW;
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ }
+
+ if (!block_group->iref) {
+ block_group->inode = igrab(inode);
+ block_group->iref = 1;
+ }
+ spin_unlock(&block_group->lock);
+
+ return inode;
+}
+
+static int __create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 ino, u64 offset)
+{
+ struct btrfs_key key;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_free_space_header *header;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
+ int ret;
+
+ ret = btrfs_insert_empty_inode(trans, root, path, ino);
+ if (ret)
+ return ret;
+
+ /* We inline crc's for the free disk space cache */
+ if (ino != BTRFS_FREE_INO_OBJECTID)
+ flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+ btrfs_item_key(leaf, &disk_key, path->slots[0]);
+ memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
+ sizeof(*inode_item));
+ btrfs_set_inode_generation(leaf, inode_item, trans->transid);
+ btrfs_set_inode_size(leaf, inode_item, 0);
+ btrfs_set_inode_nbytes(leaf, inode_item, 0);
+ btrfs_set_inode_uid(leaf, inode_item, 0);
+ btrfs_set_inode_gid(leaf, inode_item, 0);
+ btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
+ btrfs_set_inode_flags(leaf, inode_item, flags);
+ btrfs_set_inode_nlink(leaf, inode_item, 1);
+ btrfs_set_inode_transid(leaf, inode_item, trans->transid);
+ btrfs_set_inode_block_group(leaf, inode_item, offset);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_free_space_header));
+ if (ret < 0) {
+ btrfs_release_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
+ btrfs_set_free_space_key(leaf, header, &disk_key);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+int create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ int ret;
+ u64 ino;
+
+ ret = btrfs_find_free_objectid(root, &ino);
+ if (ret < 0)
+ return ret;
+
+ return __create_free_space_inode(root, trans, path, ino,
+ block_group->key.objectid);
+}
+
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+{
+ u64 needed_bytes;
+ int ret;
+
+ /* 1 for slack space, 1 for updating the inode */
+ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
+ btrfs_calc_trans_metadata_size(root, 1);
+
+ spin_lock(&rsv->lock);
+ if (rsv->reserved < needed_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+ spin_unlock(&rsv->lock);
+ return ret;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct inode *inode)
+{
+ int ret = 0;
+ struct btrfs_path *path = btrfs_alloc_path();
+
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ if (block_group) {
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ btrfs_wait_cache_io(root, trans, block_group,
+ &block_group->io_ctl, path,
+ block_group->key.objectid);
+ btrfs_put_block_group(block_group);
+ }
+
+ /*
+ * now that we've truncated the cache away, its no longer
+ * setup or written
+ */
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_unlock(&block_group->lock);
+ }
+ btrfs_free_path(path);
+
+ btrfs_i_size_write(inode, 0);
+ truncate_pagecache(inode, 0);
+
+ /*
+ * We don't need an orphan item because truncating the free space cache
+ * will never be split across transactions.
+ * We don't need to check for -EAGAIN because we're a free space
+ * cache inode
+ */
+ ret = btrfs_truncate_inode_items(trans, root, inode,
+ 0, BTRFS_EXTENT_DATA_KEY);
+ if (ret) {
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+
+ ret = btrfs_update_inode(trans, root, inode);
+
+ if (block_group)
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+fail:
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+
+ return ret;
+}
+
+static int readahead_cache(struct inode *inode)
+{
+ struct file_ra_state *ra;
+ unsigned long last_index;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+
+ file_ra_state_init(ra, inode->i_mapping);
+ last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+ page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
+
+ kfree(ra);
+
+ return 0;
+}
+
+static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
+ struct btrfs_root *root, int write)
+{
+ int num_pages;
+ int check_crcs = 0;
+
+ num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+
+ if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+ check_crcs = 1;
+
+ /* Make sure we can fit our crcs into the first page */
+ if (write && check_crcs &&
+ (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+ return -ENOSPC;
+
+ memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
+
+ io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
+ if (!io_ctl->pages)
+ return -ENOMEM;
+
+ io_ctl->num_pages = num_pages;
+ io_ctl->root = root;
+ io_ctl->check_crcs = check_crcs;
+ io_ctl->inode = inode;
+
+ return 0;
+}
+
+static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
+{
+ kfree(io_ctl->pages);
+ io_ctl->pages = NULL;
+}
+
+static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
+{
+ if (io_ctl->cur) {
+ io_ctl->cur = NULL;
+ io_ctl->orig = NULL;
+ }
+}
+
+static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
+{
+ ASSERT(io_ctl->index < io_ctl->num_pages);
+ io_ctl->page = io_ctl->pages[io_ctl->index++];
+ io_ctl->cur = page_address(io_ctl->page);
+ io_ctl->orig = io_ctl->cur;
+ io_ctl->size = PAGE_CACHE_SIZE;
+ if (clear)
+ memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
+}
+
+static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
+{
+ int i;
+
+ io_ctl_unmap_page(io_ctl);
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ if (io_ctl->pages[i]) {
+ ClearPageChecked(io_ctl->pages[i]);
+ unlock_page(io_ctl->pages[i]);
+ page_cache_release(io_ctl->pages[i]);
+ }
+ }
+}
+
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
+ int uptodate)
+{
+ struct page *page;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ int i;
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ page = find_or_create_page(inode->i_mapping, i, mask);
+ if (!page) {
+ io_ctl_drop_pages(io_ctl);
+ return -ENOMEM;
+ }
+ io_ctl->pages[i] = page;
+ if (uptodate && !PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "error reading free space cache");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
+ }
+ }
+
+ for (i = 0; i < io_ctl->num_pages; i++) {
+ clear_page_dirty_for_io(io_ctl->pages[i]);
+ set_page_extent_mapped(io_ctl->pages[i]);
+ }
+
+ return 0;
+}
+
+static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
+{
+ __le64 *val;
+
+ io_ctl_map_page(io_ctl, 1);
+
+ /*
+ * Skip the csum areas. If we don't check crcs then we just have a
+ * 64bit chunk at the front of the first page.
+ */
+ if (io_ctl->check_crcs) {
+ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
+ } else {
+ io_ctl->cur += sizeof(u64);
+ io_ctl->size -= sizeof(u64) * 2;
+ }
+
+ val = io_ctl->cur;
+ *val = cpu_to_le64(generation);
+ io_ctl->cur += sizeof(u64);
+}
+
+static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
+{
+ __le64 *gen;
+
+ /*
+ * Skip the crc area. If we don't check crcs then we just have a 64bit
+ * chunk at the front of the first page.
+ */
+ if (io_ctl->check_crcs) {
+ io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+ io_ctl->size -= sizeof(u64) +
+ (sizeof(u32) * io_ctl->num_pages);
+ } else {
+ io_ctl->cur += sizeof(u64);
+ io_ctl->size -= sizeof(u64) * 2;
+ }
+
+ gen = io_ctl->cur;
+ if (le64_to_cpu(*gen) != generation) {
+ printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
+ "(%Lu) does not match inode (%Lu)\n", *gen,
+ generation);
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+ io_ctl->cur += sizeof(u64);
+ return 0;
+}
+
+static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
+{
+ u32 *tmp;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (!io_ctl->check_crcs) {
+ io_ctl_unmap_page(io_ctl);
+ return;
+ }
+
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;
+
+ crc = btrfs_csum_data(io_ctl->orig + offset, crc,
+ PAGE_CACHE_SIZE - offset);
+ btrfs_csum_final(crc, (char *)&crc);
+ io_ctl_unmap_page(io_ctl);
+ tmp = page_address(io_ctl->pages[0]);
+ tmp += index;
+ *tmp = crc;
+}
+
+static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
+{
+ u32 *tmp, val;
+ u32 crc = ~(u32)0;
+ unsigned offset = 0;
+
+ if (!io_ctl->check_crcs) {
+ io_ctl_map_page(io_ctl, 0);
+ return 0;
+ }
+
+ if (index == 0)
+ offset = sizeof(u32) * io_ctl->num_pages;
+
+ tmp = page_address(io_ctl->pages[0]);
+ tmp += index;
+ val = *tmp;
+
+ io_ctl_map_page(io_ctl, 0);
+ crc = btrfs_csum_data(io_ctl->orig + offset, crc,
+ PAGE_CACHE_SIZE - offset);
+ btrfs_csum_final(crc, (char *)&crc);
+ if (val != crc) {
+ printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
+ "space cache\n");
+ io_ctl_unmap_page(io_ctl);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
+ void *bitmap)
+{
+ struct btrfs_free_space_entry *entry;
+
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ entry = io_ctl->cur;
+ entry->offset = cpu_to_le64(offset);
+ entry->bytes = cpu_to_le64(bytes);
+ entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
+ BTRFS_FREE_SPACE_EXTENT;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+ return 0;
+
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+
+ /* No more pages to map */
+ if (io_ctl->index >= io_ctl->num_pages)
+ return 0;
+
+ /* map the next page */
+ io_ctl_map_page(io_ctl, 1);
+ return 0;
+}
+
+static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
+{
+ if (!io_ctl->cur)
+ return -ENOSPC;
+
+ /*
+ * If we aren't at the start of the current page, unmap this one and
+ * map the next one if there is any left.
+ */
+ if (io_ctl->cur != io_ctl->orig) {
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index >= io_ctl->num_pages)
+ return -ENOSPC;
+ io_ctl_map_page(io_ctl, 0);
+ }
+
+ memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ if (io_ctl->index < io_ctl->num_pages)
+ io_ctl_map_page(io_ctl, 0);
+ return 0;
+}
+
+static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
+{
+ /*
+ * If we're not on the boundary we know we've modified the page and we
+ * need to crc the page.
+ */
+ if (io_ctl->cur != io_ctl->orig)
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ else
+ io_ctl_unmap_page(io_ctl);
+
+ while (io_ctl->index < io_ctl->num_pages) {
+ io_ctl_map_page(io_ctl, 1);
+ io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+ }
+}
+
+static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
+ struct btrfs_free_space *entry, u8 *type)
+{
+ struct btrfs_free_space_entry *e;
+ int ret;
+
+ if (!io_ctl->cur) {
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+ }
+
+ e = io_ctl->cur;
+ entry->offset = le64_to_cpu(e->offset);
+ entry->bytes = le64_to_cpu(e->bytes);
+ *type = e->type;
+ io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+ io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+ if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+ return 0;
+
+ io_ctl_unmap_page(io_ctl);
+
+ return 0;
+}
+
+static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
+ struct btrfs_free_space *entry)
+{
+ int ret;
+
+ ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+ if (ret)
+ return ret;
+
+ memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
+ io_ctl_unmap_page(io_ctl);
+
+ return 0;
+}
+
+/*
+ * Since we attach pinned extents after the fact we can have contiguous sections
+ * of free space that are split up in entries. This poses a problem with the
+ * tree logging stuff since it could have allocated across what appears to be 2
+ * entries since we would have merged the entries when adding the pinned extents
+ * back to the free space cache. So run through the space cache that we just
+ * loaded and merge contiguous entries. This will make the log replay stuff not
+ * blow up and it will make for nicer allocator behavior.
+ */
+static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *e, *prev = NULL;
+ struct rb_node *n;
+
+again:
+ spin_lock(&ctl->tree_lock);
+ for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+ e = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (!prev)
+ goto next;
+ if (e->bitmap || prev->bitmap)
+ goto next;
+ if (prev->offset + prev->bytes == e->offset) {
+ unlink_free_space(ctl, prev);
+ unlink_free_space(ctl, e);
+ prev->bytes += e->bytes;
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ link_free_space(ctl, prev);
+ prev = NULL;
+ spin_unlock(&ctl->tree_lock);
+ goto again;
+ }
+next:
+ prev = e;
+ }
+ spin_unlock(&ctl->tree_lock);
+}
+
+static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_path *path, u64 offset)
+{
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ struct btrfs_io_ctl io_ctl;
+ struct btrfs_key key;
+ struct btrfs_free_space *e, *n;
+ LIST_HEAD(bitmaps);
+ u64 num_entries;
+ u64 num_bitmaps;
+ u64 generation;
+ u8 type;
+ int ret = 0;
+
+ /* Nothing in the space cache, goodbye */
+ if (!i_size_read(inode))
+ return 0;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return 0;
+ else if (ret > 0) {
+ btrfs_release_path(path);
+ return 0;
+ }
+
+ ret = -1;
+
+ leaf = path->nodes[0];
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ num_entries = btrfs_free_space_entries(leaf, header);
+ num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
+ generation = btrfs_free_space_generation(leaf, header);
+ btrfs_release_path(path);
+
+ if (!BTRFS_I(inode)->generation) {
+ btrfs_info(root->fs_info,
+ "The free space cache file (%llu) is invalid. skip it\n",
+ offset);
+ return 0;
+ }
+
+ if (BTRFS_I(inode)->generation != generation) {
+ btrfs_err(root->fs_info,
+ "free space inode generation (%llu) "
+ "did not match free space cache generation (%llu)",
+ BTRFS_I(inode)->generation, generation);
+ return 0;
+ }
+
+ if (!num_entries)
+ return 0;
+
+ ret = io_ctl_init(&io_ctl, inode, root, 0);
+ if (ret)
+ return ret;
+
+ ret = readahead_cache(inode);
+ if (ret)
+ goto out;
+
+ ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
+ if (ret)
+ goto out;
+
+ ret = io_ctl_check_crc(&io_ctl, 0);
+ if (ret)
+ goto free_cache;
+
+ ret = io_ctl_check_generation(&io_ctl, generation);
+ if (ret)
+ goto free_cache;
+
+ while (num_entries) {
+ e = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
+ if (!e)
+ goto free_cache;
+
+ ret = io_ctl_read_entry(&io_ctl, e, &type);
+ if (ret) {
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+
+ if (!e->bytes) {
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+
+ if (type == BTRFS_FREE_SPACE_EXTENT) {
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ spin_unlock(&ctl->tree_lock);
+ if (ret) {
+ btrfs_err(root->fs_info,
+ "Duplicate entries in free space cache, dumping");
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+ } else {
+ ASSERT(num_bitmaps);
+ num_bitmaps--;
+ e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!e->bitmap) {
+ kmem_cache_free(
+ btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ ctl->total_bitmaps++;
+ ctl->op->recalc_thresholds(ctl);
+ spin_unlock(&ctl->tree_lock);
+ if (ret) {
+ btrfs_err(root->fs_info,
+ "Duplicate entries in free space cache, dumping");
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ goto free_cache;
+ }
+ list_add_tail(&e->list, &bitmaps);
+ }
+
+ num_entries--;
+ }
+
+ io_ctl_unmap_page(&io_ctl);
+
+ /*
+ * We add the bitmaps at the end of the entries in order that
+ * the bitmap entries are added to the cache.
+ */
+ list_for_each_entry_safe(e, n, &bitmaps, list) {
+ list_del_init(&e->list);
+ ret = io_ctl_read_bitmap(&io_ctl, e);
+ if (ret)
+ goto free_cache;
+ }
+
+ io_ctl_drop_pages(&io_ctl);
+ merge_space_tree(ctl);
+ ret = 1;
+out:
+ io_ctl_free(&io_ctl);
+ return ret;
+free_cache:
+ io_ctl_drop_pages(&io_ctl);
+ __btrfs_remove_free_space_cache(ctl);
+ goto out;
+}
+
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct inode *inode;
+ struct btrfs_path *path;
+ int ret = 0;
+ bool matched;
+ u64 used = btrfs_block_group_used(&block_group->item);
+
+ /*
+ * If this block group has been marked to be cleared for one reason or
+ * another then we can't trust the on disk cache, so just return.
+ */
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ spin_unlock(&block_group->lock);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return 0;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode)) {
+ btrfs_free_path(path);
+ return 0;
+ }
+
+ /* We may have converted the inode and made the cache invalid. */
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+ spin_unlock(&block_group->lock);
+ btrfs_free_path(path);
+ goto out;
+ }
+ spin_unlock(&block_group->lock);
+
+ ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
+ path, block_group->key.objectid);
+ btrfs_free_path(path);
+ if (ret <= 0)
+ goto out;
+
+ spin_lock(&ctl->tree_lock);
+ matched = (ctl->free_space == (block_group->key.offset - used -
+ block_group->bytes_super));
+ spin_unlock(&ctl->tree_lock);
+
+ if (!matched) {
+ __btrfs_remove_free_space_cache(ctl);
+ btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
+ block_group->key.objectid);
+ ret = -1;
+ }
+out:
+ if (ret < 0) {
+ /* This cache is bogus, make sure it gets cleared */
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_unlock(&block_group->lock);
+ ret = 0;
+
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
+ block_group->key.objectid);
+ }
+
+ iput(inode);
+ return ret;
+}
+
+static noinline_for_stack
+int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ int *entries, int *bitmaps,
+ struct list_head *bitmap_list)
+{
+ int ret;
+ struct btrfs_free_cluster *cluster = NULL;
+ struct btrfs_free_cluster *cluster_locked = NULL;
+ struct rb_node *node = rb_first(&ctl->free_space_offset);
+ struct btrfs_trim_range *trim_entry;
+
+ /* Get the cluster for this block_group if it exists */
+ if (block_group && !list_empty(&block_group->cluster_list)) {
+ cluster = list_entry(block_group->cluster_list.next,
+ struct btrfs_free_cluster,
+ block_group_list);
+ }
+
+ if (!node && cluster) {
+ cluster_locked = cluster;
+ spin_lock(&cluster_locked->lock);
+ node = rb_first(&cluster->root);
+ cluster = NULL;
+ }
+
+ /* Write out the extent entries */
+ while (node) {
+ struct btrfs_free_space *e;
+
+ e = rb_entry(node, struct btrfs_free_space, offset_index);
+ *entries += 1;
+
+ ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
+ e->bitmap);
+ if (ret)
+ goto fail;
+
+ if (e->bitmap) {
+ list_add_tail(&e->list, bitmap_list);
+ *bitmaps += 1;
+ }
+ node = rb_next(node);
+ if (!node && cluster) {
+ node = rb_first(&cluster->root);
+ cluster_locked = cluster;
+ spin_lock(&cluster_locked->lock);
+ cluster = NULL;
+ }
+ }
+ if (cluster_locked) {
+ spin_unlock(&cluster_locked->lock);
+ cluster_locked = NULL;
+ }
+
+ /*
+ * Make sure we don't miss any range that was removed from our rbtree
+ * because trimming is running. Otherwise after a umount+mount (or crash
+ * after committing the transaction) we would leak free space and get
+ * an inconsistent free space cache report from fsck.
+ */
+ list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
+ ret = io_ctl_add_entry(io_ctl, trim_entry->start,
+ trim_entry->bytes, NULL);
+ if (ret)
+ goto fail;
+ *entries += 1;
+ }
+
+ return 0;
+fail:
+ if (cluster_locked)
+ spin_unlock(&cluster_locked->lock);
+ return -ENOSPC;
+}
+
+static noinline_for_stack int
+update_cache_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path, u64 offset,
+ int entries, int bitmaps)
+{
+ struct btrfs_key key;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
+ goto fail;
+ }
+ leaf = path->nodes[0];
+ if (ret > 0) {
+ struct btrfs_key found_key;
+ ASSERT(path->slots[0]);
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+ found_key.offset != offset) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ NULL, GFP_NOFS);
+ btrfs_release_path(path);
+ goto fail;
+ }
+ }
+
+ BTRFS_I(inode)->generation = trans->transid;
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ btrfs_set_free_space_entries(leaf, header, entries);
+ btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+ btrfs_set_free_space_generation(leaf, header, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ return 0;
+
+fail:
+ return -1;
+}
+
+static noinline_for_stack int
+write_pinned_extent_entries(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ int *entries)
+{
+ u64 start, extent_start, extent_end, len;
+ struct extent_io_tree *unpin = NULL;
+ int ret;
+
+ if (!block_group)
+ return 0;
+
+ /*
+ * We want to add any pinned extents to our free space cache
+ * so we don't leak the space
+ *
+ * We shouldn't have switched the pinned extents yet so this is the
+ * right one
+ */
+ unpin = root->fs_info->pinned_extents;
+
+ start = block_group->key.objectid;
+
+ while (start < block_group->key.objectid + block_group->key.offset) {
+ ret = find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL);
+ if (ret)
+ return 0;
+
+ /* This pinned extent is out of our range */
+ if (extent_start >= block_group->key.objectid +
+ block_group->key.offset)
+ return 0;
+
+ extent_start = max(extent_start, start);
+ extent_end = min(block_group->key.objectid +
+ block_group->key.offset, extent_end + 1);
+ len = extent_end - extent_start;
+
+ *entries += 1;
+ ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
+ if (ret)
+ return -ENOSPC;
+
+ start = extent_end;
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+ int ret;
+
+ /* Write out the bitmaps */
+ list_for_each_safe(pos, n, bitmap_list) {
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+
+ ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
+ if (ret)
+ return -ENOSPC;
+ list_del_init(&entry->list);
+ }
+
+ return 0;
+}
+
+static int flush_dirty_cache(struct inode *inode)
+{
+ int ret;
+
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (ret)
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
+
+ return ret;
+}
+
+static void noinline_for_stack
+cleanup_bitmap_list(struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+
+ list_for_each_safe(pos, n, bitmap_list) {
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+ list_del_init(&entry->list);
+ }
+}
+
+static void noinline_for_stack
+cleanup_write_cache_enospc(struct inode *inode,
+ struct btrfs_io_ctl *io_ctl,
+ struct extent_state **cached_state,
+ struct list_head *bitmap_list)
+{
+ io_ctl_drop_pages(io_ctl);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, cached_state,
+ GFP_NOFS);
+}
+
+int btrfs_wait_cache_io(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path, u64 offset)
+{
+ int ret;
+ struct inode *inode = io_ctl->inode;
+
+ if (!inode)
+ return 0;
+
+ if (block_group)
+ root = root->fs_info->tree_root;
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
+ goto out;
+
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
+ io_ctl->entries, io_ctl->bitmaps);
+out:
+ io_ctl_free(io_ctl);
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
+ BTRFS_I(inode)->generation = 0;
+ if (block_group) {
+#ifdef DEBUG
+ btrfs_err(root->fs_info,
+ "failed to write free space cache for block group %llu",
+ block_group->key.objectid);
+#endif
+ }
+ }
+ btrfs_update_inode(trans, root, inode);
+
+ if (block_group) {
+ /* the dirty list is protected by the dirty_bgs_lock */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+
+ /* the disk_cache_state is protected by the block group lock */
+ spin_lock(&block_group->lock);
+
+ /*
+ * only mark this as written if we didn't get put back on
+ * the dirty list while waiting for IO. Otherwise our
+ * cache state won't be right, and we won't get written again
+ */
+ if (!ret && list_empty(&block_group->dirty_list))
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ else if (ret)
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+
+ spin_unlock(&block_group->lock);
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ io_ctl->inode = NULL;
+ iput(inode);
+ }
+
+ return ret;
+
+}
+
+/**
+ * __btrfs_write_out_cache - write out cached info to an inode
+ * @root - the root the inode belongs to
+ * @ctl - the free space cache we are going to write out
+ * @block_group - the block_group for this cache if it belongs to a block_group
+ * @trans - the trans handle
+ * @path - the path to use
+ * @offset - the offset for the key we'll insert
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount. This will return 0 if it was successfull in writing the cache out,
+ * or an errno if it was not.
+ */
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 offset)
+{
+ struct extent_state *cached_state = NULL;
+ LIST_HEAD(bitmap_list);
+ int entries = 0;
+ int bitmaps = 0;
+ int ret;
+ int must_iput = 0;
+
+ if (!i_size_read(inode))
+ return -EIO;
+
+ WARN_ON(io_ctl->pages);
+ ret = io_ctl_init(io_ctl, inode, root, 1);
+ if (ret)
+ return ret;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+ down_write(&block_group->data_rwsem);
+ spin_lock(&block_group->lock);
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ up_write(&block_group->data_rwsem);
+ BTRFS_I(inode)->generation = 0;
+ ret = 0;
+ must_iput = 1;
+ goto out;
+ }
+ spin_unlock(&block_group->lock);
+ }
+
+ /* Lock all pages first so we can lock the extent safely. */
+ ret = io_ctl_prepare_pages(io_ctl, inode, 0);
+ if (ret)
+ goto out;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ 0, &cached_state);
+
+ io_ctl_set_generation(io_ctl, trans->transid);
+
+ mutex_lock(&ctl->cache_writeout_mutex);
+ /* Write out the extent entries in the free space cache */
+ spin_lock(&ctl->tree_lock);
+ ret = write_cache_extent_entries(io_ctl, ctl,
+ block_group, &entries, &bitmaps,
+ &bitmap_list);
+ if (ret)
+ goto out_nospc_locked;
+
+ /*
+ * Some spaces that are freed in the current transaction are pinned,
+ * they will be added into free space cache after the transaction is
+ * committed, we shouldn't lose them.
+ *
+ * If this changes while we are working we'll get added back to
+ * the dirty list and redo it. No locking needed
+ */
+ ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
+ if (ret)
+ goto out_nospc_locked;
+
+ /*
+ * At last, we write out all the bitmaps and keep cache_writeout_mutex
+ * locked while doing it because a concurrent trim can be manipulating
+ * or freeing the bitmap.
+ */
+ ret = write_bitmap_entries(io_ctl, &bitmap_list);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ if (ret)
+ goto out_nospc;
+
+ /* Zero out the rest of the pages just to make sure */
+ io_ctl_zero_remaining_pages(io_ctl);
+
+ /* Everything is written out, now we dirty the pages in the file. */
+ ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
+ 0, i_size_read(inode), &cached_state);
+ if (ret)
+ goto out_nospc;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+ /*
+ * Release the pages and unlock the extent, we will flush
+ * them out later
+ */
+ io_ctl_drop_pages(io_ctl);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+ /*
+ * at this point the pages are under IO and we're happy,
+ * The caller is responsible for waiting on them and updating the
+ * the cache and the inode
+ */
+ io_ctl->entries = entries;
+ io_ctl->bitmaps = bitmaps;
+
+ ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
+ if (ret)
+ goto out;
+
+ return 0;
+
+out:
+ io_ctl->inode = NULL;
+ io_ctl_free(io_ctl);
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
+ BTRFS_I(inode)->generation = 0;
+ }
+ btrfs_update_inode(trans, root, inode);
+ if (must_iput)
+ iput(inode);
+ return ret;
+
+out_nospc_locked:
+ cleanup_bitmap_list(&bitmap_list);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
+out_nospc:
+ cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+
+ goto out;
+}
+
+int btrfs_write_out_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct inode *inode;
+ int ret = 0;
+
+ root = root->fs_info->tree_root;
+
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ spin_unlock(&block_group->lock);
+
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode))
+ return 0;
+
+ ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
+ &block_group->io_ctl, trans,
+ path, block_group->key.objectid);
+ if (ret) {
+#ifdef DEBUG
+ btrfs_err(root->fs_info,
+ "failed to write free space cache for block group %llu",
+ block_group->key.objectid);
+#endif
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+ spin_unlock(&block_group->lock);
+
+ block_group->io_ctl.inode = NULL;
+ iput(inode);
+ }
+
+ /*
+ * if ret == 0 the caller is expected to call btrfs_wait_cache_io
+ * to wait for IO and put the inode
+ */
+
+ return ret;
+}
+
+static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
+ u64 offset)
+{
+ ASSERT(offset >= bitmap_start);
+ offset -= bitmap_start;
+ return (unsigned long)(div_u64(offset, unit));
+}
+
+static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
+{
+ return (unsigned long)(div_u64(bytes, unit));
+}
+
+static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
+ u64 offset)
+{
+ u64 bitmap_start;
+ u32 bytes_per_bitmap;
+
+ bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
+ bitmap_start = offset - ctl->start;
+ bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
+ bitmap_start *= bytes_per_bitmap;
+ bitmap_start += ctl->start;
+
+ return bitmap_start;
+}
+
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+ struct rb_node *node, int bitmap)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_free_space *info;
+
+ while (*p) {
+ parent = *p;
+ info = rb_entry(parent, struct btrfs_free_space, offset_index);
+
+ if (offset < info->offset) {
+ p = &(*p)->rb_left;
+ } else if (offset > info->offset) {
+ p = &(*p)->rb_right;
+ } else {
+ /*
+ * we could have a bitmap entry and an extent entry
+ * share the same offset. If this is the case, we want
+ * the extent entry to always be found first if we do a
+ * linear search through the tree, since we want to have
+ * the quickest allocation time, and allocating from an
+ * extent is faster than allocating from a bitmap. So
+ * if we're inserting a bitmap and we find an entry at
+ * this offset, we want to go right, or after this entry
+ * logically. If we are inserting an extent and we've
+ * found a bitmap, we want to go left, or before
+ * logically.
+ */
+ if (bitmap) {
+ if (info->bitmap) {
+ WARN_ON_ONCE(1);
+ return -EEXIST;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ if (!info->bitmap) {
+ WARN_ON_ONCE(1);
+ return -EEXIST;
+ }
+ p = &(*p)->rb_left;
+ }
+ }
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+
+ return 0;
+}
+
+/*
+ * searches the tree for the given offset.
+ *
+ * fuzzy - If this is set, then we are trying to make an allocation, and we just
+ * want a section that has at least bytes size and comes at or after the given
+ * offset.
+ */
+static struct btrfs_free_space *
+tree_search_offset(struct btrfs_free_space_ctl *ctl,
+ u64 offset, int bitmap_only, int fuzzy)
+{
+ struct rb_node *n = ctl->free_space_offset.rb_node;
+ struct btrfs_free_space *entry, *prev = NULL;
+
+ /* find entry that is closest to the 'offset' */
+ while (1) {
+ if (!n) {
+ entry = NULL;
+ break;
+ }
+
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ prev = entry;
+
+ if (offset < entry->offset)
+ n = n->rb_left;
+ else if (offset > entry->offset)
+ n = n->rb_right;
+ else
+ break;
+ }
+
+ if (bitmap_only) {
+ if (!entry)
+ return NULL;
+ if (entry->bitmap)
+ return entry;
+
+ /*
+ * bitmap entry and extent entry may share same offset,
+ * in that case, bitmap entry comes after extent entry.
+ */
+ n = rb_next(n);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (entry->offset != offset)
+ return NULL;
+
+ WARN_ON(!entry->bitmap);
+ return entry;
+ } else if (entry) {
+ if (entry->bitmap) {
+ /*
+ * if previous extent entry covers the offset,
+ * we should return it instead of the bitmap entry
+ */
+ n = rb_prev(&entry->offset_index);
+ if (n) {
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ entry = prev;
+ }
+ }
+ return entry;
+ }
+
+ if (!prev)
+ return NULL;
+
+ /* find last entry before the 'offset' */
+ entry = prev;
+ if (entry->offset > offset) {
+ n = rb_prev(&entry->offset_index);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ ASSERT(entry->offset <= offset);
+ } else {
+ if (fuzzy)
+ return entry;
+ else
+ return NULL;
+ }
+ }
+
+ if (entry->bitmap) {
+ n = rb_prev(&entry->offset_index);
+ if (n) {
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap &&
+ prev->offset + prev->bytes > offset)
+ return prev;
+ }
+ if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
+ return entry;
+ } else if (entry->offset + entry->bytes > offset)
+ return entry;
+
+ if (!fuzzy)
+ return NULL;
+
+ while (1) {
+ if (entry->bitmap) {
+ if (entry->offset + BITS_PER_BITMAP *
+ ctl->unit > offset)
+ break;
+ } else {
+ if (entry->offset + entry->bytes > offset)
+ break;
+ }
+
+ n = rb_next(&entry->offset_index);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ }
+ return entry;
+}
+
+static inline void
+__unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ rb_erase(&info->offset_index, &ctl->free_space_offset);
+ ctl->free_extents--;
+}
+
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ __unlink_free_space(ctl, info);
+ ctl->free_space -= info->bytes;
+}
+
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ int ret = 0;
+
+ ASSERT(info->bytes || info->bitmap);
+ ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
+ &info->offset_index, (info->bitmap != NULL));
+ if (ret)
+ return ret;
+
+ ctl->free_space += info->bytes;
+ ctl->free_extents++;
+ return ret;
+}
+
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_block_group_cache *block_group = ctl->private;
+ u64 max_bytes;
+ u64 bitmap_bytes;
+ u64 extent_bytes;
+ u64 size = block_group->key.offset;
+ u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
+
+ max_bitmaps = max_t(u32, max_bitmaps, 1);
+
+ ASSERT(ctl->total_bitmaps <= max_bitmaps);
+
+ /*
+ * The goal is to keep the total amount of memory used per 1gb of space
+ * at or below 32k, so we need to adjust how much memory we allow to be
+ * used by extent based free space tracking
+ */
+ if (size < 1024 * 1024 * 1024)
+ max_bytes = MAX_CACHE_BYTES_PER_GIG;
+ else
+ max_bytes = MAX_CACHE_BYTES_PER_GIG *
+ div_u64(size, 1024 * 1024 * 1024);
+
+ /*
+ * we want to account for 1 more bitmap than what we have so we can make
+ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
+ * we add more bitmaps.
+ */
+ bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+
+ if (bitmap_bytes >= max_bytes) {
+ ctl->extents_thresh = 0;
+ return;
+ }
+
+ /*
+ * we want the extent entry threshold to always be at most 1/2 the max
+ * bytes we can have, or whatever is less than that.
+ */
+ extent_bytes = max_bytes - bitmap_bytes;
+ extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
+
+ ctl->extents_thresh =
+ div_u64(extent_bytes, sizeof(struct btrfs_free_space));
+}
+
+static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ u64 offset, u64 bytes)
+{
+ unsigned long start, count;
+
+ start = offset_to_bit(info->offset, ctl->unit, offset);
+ count = bytes_to_bits(bytes, ctl->unit);
+ ASSERT(start + count <= BITS_PER_BITMAP);
+
+ bitmap_clear(info->bitmap, start, count);
+
+ info->bytes -= bytes;
+}
+
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ __bitmap_clear_bits(ctl, info, offset, bytes);
+ ctl->free_space -= bytes;
+}
+
+static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ unsigned long start, count;
+
+ start = offset_to_bit(info->offset, ctl->unit, offset);
+ count = bytes_to_bits(bytes, ctl->unit);
+ ASSERT(start + count <= BITS_PER_BITMAP);
+
+ bitmap_set(info->bitmap, start, count);
+
+ info->bytes += bytes;
+ ctl->free_space += bytes;
+}
+
+/*
+ * If we can not find suitable extent, we will use bytes to record
+ * the size of the max extent.
+ */
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info, u64 *offset,
+ u64 *bytes)
+{
+ unsigned long found_bits = 0;
+ unsigned long max_bits = 0;
+ unsigned long bits, i;
+ unsigned long next_zero;
+ unsigned long extent_bits;
+
+ i = offset_to_bit(bitmap_info->offset, ctl->unit,
+ max_t(u64, *offset, bitmap_info->offset));
+ bits = bytes_to_bits(*bytes, ctl->unit);
+
+ for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
+ next_zero = find_next_zero_bit(bitmap_info->bitmap,
+ BITS_PER_BITMAP, i);
+ extent_bits = next_zero - i;
+ if (extent_bits >= bits) {
+ found_bits = extent_bits;
+ break;
+ } else if (extent_bits > max_bits) {
+ max_bits = extent_bits;
+ }
+ i = next_zero;
+ }
+
+ if (found_bits) {
+ *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
+ *bytes = (u64)(found_bits) * ctl->unit;
+ return 0;
+ }
+
+ *bytes = (u64)(max_bits) * ctl->unit;
+ return -1;
+}
+
+/* Cache the size of the max extent in bytes */
+static struct btrfs_free_space *
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+ unsigned long align, u64 *max_extent_size)
+{
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ u64 tmp;
+ u64 align_off;
+ int ret;
+
+ if (!ctl->free_space_offset.rb_node)
+ goto out;
+
+ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
+ if (!entry)
+ goto out;
+
+ for (node = &entry->offset_index; node; node = rb_next(node)) {
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (entry->bytes < *bytes) {
+ if (entry->bytes > *max_extent_size)
+ *max_extent_size = entry->bytes;
+ continue;
+ }
+
+ /* make sure the space returned is big enough
+ * to match our requested alignment
+ */
+ if (*bytes >= align) {
+ tmp = entry->offset - ctl->start + align - 1;
+ tmp = div64_u64(tmp, align);
+ tmp = tmp * align + ctl->start;
+ align_off = tmp - entry->offset;
+ } else {
+ align_off = 0;
+ tmp = entry->offset;
+ }
+
+ if (entry->bytes < *bytes + align_off) {
+ if (entry->bytes > *max_extent_size)
+ *max_extent_size = entry->bytes;
+ continue;
+ }
+
+ if (entry->bitmap) {
+ u64 size = *bytes;
+
+ ret = search_bitmap(ctl, entry, &tmp, &size);
+ if (!ret) {
+ *offset = tmp;
+ *bytes = size;
+ return entry;
+ } else if (size > *max_extent_size) {
+ *max_extent_size = size;
+ }
+ continue;
+ }
+
+ *offset = tmp;
+ *bytes = entry->bytes - align_off;
+ return entry;
+ }
+out:
+ return NULL;
+}
+
+static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset)
+{
+ info->offset = offset_to_bitmap(ctl, offset);
+ info->bytes = 0;
+ INIT_LIST_HEAD(&info->list);
+ link_free_space(ctl, info);
+ ctl->total_bitmaps++;
+
+ ctl->op->recalc_thresholds(ctl);
+}
+
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info)
+{
+ unlink_free_space(ctl, bitmap_info);
+ kfree(bitmap_info->bitmap);
+ kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
+ ctl->total_bitmaps--;
+ ctl->op->recalc_thresholds(ctl);
+}
+
+static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info,
+ u64 *offset, u64 *bytes)
+{
+ u64 end;
+ u64 search_start, search_bytes;
+ int ret;
+
+again:
+ end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
+
+ /*
+ * We need to search for bits in this bitmap. We could only cover some
+ * of the extent in this bitmap thanks to how we add space, so we need
+ * to search for as much as it as we can and clear that amount, and then
+ * go searching for the next bit.
+ */
+ search_start = *offset;
+ search_bytes = ctl->unit;
+ search_bytes = min(search_bytes, end - search_start + 1);
+ ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
+ if (ret < 0 || search_start != *offset)
+ return -EINVAL;
+
+ /* We may have found more bits than what we need */
+ search_bytes = min(search_bytes, *bytes);
+
+ /* Cannot clear past the end of the bitmap */
+ search_bytes = min(search_bytes, end - search_start + 1);
+
+ bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+ *offset += search_bytes;
+ *bytes -= search_bytes;
+
+ if (*bytes) {
+ struct rb_node *next = rb_next(&bitmap_info->offset_index);
+ if (!bitmap_info->bytes)
+ free_bitmap(ctl, bitmap_info);
+
+ /*
+ * no entry after this bitmap, but we still have bytes to
+ * remove, so something has gone wrong.
+ */
+ if (!next)
+ return -EINVAL;
+
+ bitmap_info = rb_entry(next, struct btrfs_free_space,
+ offset_index);
+
+ /*
+ * if the next entry isn't a bitmap we need to return to let the
+ * extent stuff do its work.
+ */
+ if (!bitmap_info->bitmap)
+ return -EAGAIN;
+
+ /*
+ * Ok the next item is a bitmap, but it may not actually hold
+ * the information for the rest of this free space stuff, so
+ * look for it, and if we don't find it return so we can try
+ * everything over again.
+ */
+ search_start = *offset;
+ search_bytes = ctl->unit;
+ ret = search_bitmap(ctl, bitmap_info, &search_start,
+ &search_bytes);
+ if (ret < 0 || search_start != *offset)
+ return -EAGAIN;
+
+ goto again;
+ } else if (!bitmap_info->bytes)
+ free_bitmap(ctl, bitmap_info);
+
+ return 0;
+}
+
+static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ u64 bytes_to_set = 0;
+ u64 end;
+
+ end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
+
+ bytes_to_set = min(end - offset, bytes);
+
+ bitmap_set_bits(ctl, info, offset, bytes_to_set);
+
+ return bytes_to_set;
+
+}
+
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_block_group_cache *block_group = ctl->private;
+
+ /*
+ * If we are below the extents threshold then we can add this as an
+ * extent, and don't have to deal with the bitmap
+ */
+ if (ctl->free_extents < ctl->extents_thresh) {
+ /*
+ * If this block group has some small extents we don't want to
+ * use up all of our free slots in the cache with them, we want
+ * to reserve them to larger extents, however if we have plent
+ * of cache left then go ahead an dadd them, no sense in adding
+ * the overhead of a bitmap if we don't have to.
+ */
+ if (info->bytes <= block_group->sectorsize * 4) {
+ if (ctl->free_extents * 2 <= ctl->extents_thresh)
+ return false;
+ } else {
+ return false;
+ }
+ }
+
+ /*
+ * The original block groups from mkfs can be really small, like 8
+ * megabytes, so don't bother with a bitmap for those entries. However
+ * some block groups can be smaller than what a bitmap would cover but
+ * are still large enough that they could overflow the 32k memory limit,
+ * so allow those block groups to still be allowed to have a bitmap
+ * entry.
+ */
+ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
+ return false;
+
+ return true;
+}
+
+static struct btrfs_free_space_op free_space_op = {
+ .recalc_thresholds = recalculate_thresholds,
+ .use_bitmap = use_bitmap,
+};
+
+static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_free_space *bitmap_info;
+ struct btrfs_block_group_cache *block_group = NULL;
+ int added = 0;
+ u64 bytes, offset, bytes_added;
+ int ret;
+
+ bytes = info->bytes;
+ offset = info->offset;
+
+ if (!ctl->op->use_bitmap(ctl, info))
+ return 0;
+
+ if (ctl->op == &free_space_op)
+ block_group = ctl->private;
+again:
+ /*
+ * Since we link bitmaps right into the cluster we need to see if we
+ * have a cluster here, and if so and it has our bitmap we need to add
+ * the free space to that bitmap.
+ */
+ if (block_group && !list_empty(&block_group->cluster_list)) {
+ struct btrfs_free_cluster *cluster;
+ struct rb_node *node;
+ struct btrfs_free_space *entry;
+
+ cluster = list_entry(block_group->cluster_list.next,
+ struct btrfs_free_cluster,
+ block_group_list);
+ spin_lock(&cluster->lock);
+ node = rb_first(&cluster->root);
+ if (!node) {
+ spin_unlock(&cluster->lock);
+ goto no_cluster_bitmap;
+ }
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!entry->bitmap) {
+ spin_unlock(&cluster->lock);
+ goto no_cluster_bitmap;
+ }
+
+ if (entry->offset == offset_to_bitmap(ctl, offset)) {
+ bytes_added = add_bytes_to_bitmap(ctl, entry,
+ offset, bytes);
+ bytes -= bytes_added;
+ offset += bytes_added;
+ }
+ spin_unlock(&cluster->lock);
+ if (!bytes) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+no_cluster_bitmap:
+ bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!bitmap_info) {
+ ASSERT(added == 0);
+ goto new_bitmap;
+ }
+
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+ bytes -= bytes_added;
+ offset += bytes_added;
+ added = 0;
+
+ if (!bytes) {
+ ret = 1;
+ goto out;
+ } else
+ goto again;
+
+new_bitmap:
+ if (info && info->bitmap) {
+ add_new_bitmap(ctl, info, offset);
+ added = 1;
+ info = NULL;
+ goto again;
+ } else {
+ spin_unlock(&ctl->tree_lock);
+
+ /* no pre-allocated info, allocate a new one */
+ if (!info) {
+ info = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
+ if (!info) {
+ spin_lock(&ctl->tree_lock);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* allocate the bitmap */
+ info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ spin_lock(&ctl->tree_lock);
+ if (!info->bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto again;
+ }
+
+out:
+ if (info) {
+ if (info->bitmap)
+ kfree(info->bitmap);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ }
+
+ return ret;
+}
+
+static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, bool update_stat)
+{
+ struct btrfs_free_space *left_info;
+ struct btrfs_free_space *right_info;
+ bool merged = false;
+ u64 offset = info->offset;
+ u64 bytes = info->bytes;
+
+ /*
+ * first we want to see if there is free space adjacent to the range we
+ * are adding, if there is remove that struct and add a new one to
+ * cover the entire range
+ */
+ right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
+ if (right_info && rb_prev(&right_info->offset_index))
+ left_info = rb_entry(rb_prev(&right_info->offset_index),
+ struct btrfs_free_space, offset_index);
+ else
+ left_info = tree_search_offset(ctl, offset - 1, 0, 0);
+
+ if (right_info && !right_info->bitmap) {
+ if (update_stat)
+ unlink_free_space(ctl, right_info);
+ else
+ __unlink_free_space(ctl, right_info);
+ info->bytes += right_info->bytes;
+ kmem_cache_free(btrfs_free_space_cachep, right_info);
+ merged = true;
+ }
+
+ if (left_info && !left_info->bitmap &&
+ left_info->offset + left_info->bytes == offset) {
+ if (update_stat)
+ unlink_free_space(ctl, left_info);
+ else
+ __unlink_free_space(ctl, left_info);
+ info->offset = left_info->offset;
+ info->bytes += left_info->bytes;
+ kmem_cache_free(btrfs_free_space_cachep, left_info);
+ merged = true;
+ }
+
+ return merged;
+}
+
+static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ unsigned long i;
+ unsigned long j;
+ const u64 end = info->offset + info->bytes;
+ const u64 bitmap_offset = offset_to_bitmap(ctl, end);
+ u64 bytes;
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, end);
+ j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
+ if (j == i)
+ return false;
+ bytes = (j - i) * ctl->unit;
+ info->bytes += bytes;
+
+ if (update_stat)
+ bitmap_clear_bits(ctl, bitmap, end, bytes);
+ else
+ __bitmap_clear_bits(ctl, bitmap, end, bytes);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ struct btrfs_free_space *bitmap;
+ u64 bitmap_offset;
+ unsigned long i;
+ unsigned long j;
+ unsigned long prev_j;
+ u64 bytes;
+
+ bitmap_offset = offset_to_bitmap(ctl, info->offset);
+ /* If we're on a boundary, try the previous logical bitmap. */
+ if (bitmap_offset == info->offset) {
+ if (info->offset == 0)
+ return false;
+ bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
+ }
+
+ bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (!bitmap)
+ return false;
+
+ i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
+ j = 0;
+ prev_j = (unsigned long)-1;
+ for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
+ if (j > i)
+ break;
+ prev_j = j;
+ }
+ if (prev_j == i)
+ return false;
+
+ if (prev_j == (unsigned long)-1)
+ bytes = (i + 1) * ctl->unit;
+ else
+ bytes = (i - prev_j) * ctl->unit;
+
+ info->offset -= bytes;
+ info->bytes += bytes;
+
+ if (update_stat)
+ bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+ else
+ __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+
+ if (!bitmap->bytes)
+ free_bitmap(ctl, bitmap);
+
+ return true;
+}
+
+/*
+ * We prefer always to allocate from extent entries, both for clustered and
+ * non-clustered allocation requests. So when attempting to add a new extent
+ * entry, try to see if there's adjacent free space in bitmap entries, and if
+ * there is, migrate that space from the bitmaps to the extent.
+ * Like this we get better chances of satisfying space allocation requests
+ * because we attempt to satisfy them based on a single cache entry, and never
+ * on 2 or more entries - even if the entries represent a contiguous free space
+ * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
+ * ends).
+ */
+static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
+{
+ /*
+ * Only work with disconnected entries, as we can change their offset,
+ * and must be extent entries.
+ */
+ ASSERT(!info->bitmap);
+ ASSERT(RB_EMPTY_NODE(&info->offset_index));
+
+ if (ctl->total_bitmaps > 0) {
+ bool stole_end;
+ bool stole_front = false;
+
+ stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
+ if (ctl->total_bitmaps > 0)
+ stole_front = steal_from_bitmap_to_front(ctl, info,
+ update_stat);
+
+ if (stole_end || stole_front)
+ try_merge_free_space(ctl, info, update_stat);
+ }
+}
+
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space *info;
+ int ret = 0;
+
+ info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+ if (!info)
+ return -ENOMEM;
+
+ info->offset = offset;
+ info->bytes = bytes;
+ RB_CLEAR_NODE(&info->offset_index);
+
+ spin_lock(&ctl->tree_lock);
+
+ if (try_merge_free_space(ctl, info, true))
+ goto link;
+
+ /*
+ * There was no extent directly to the left or right of this new
+ * extent then we know we're going to have to allocate a new extent, so
+ * before we do that see if we need to drop this into a bitmap
+ */
+ ret = insert_into_bitmap(ctl, info);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ ret = 0;
+ goto out;
+ }
+link:
+ /*
+ * Only steal free space from adjacent bitmaps if we're sure we're not
+ * going to add the new free space to existing bitmap entries - because
+ * that would mean unnecessary work that would be reverted. Therefore
+ * attempt to steal space from bitmaps if we're adding an extent entry.
+ */
+ steal_from_bitmap(ctl, info, true);
+
+ ret = link_free_space(ctl, info);
+ if (ret)
+ kmem_cache_free(btrfs_free_space_cachep, info);
+out:
+ spin_unlock(&ctl->tree_lock);
+
+ if (ret) {
+ printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret);
+ ASSERT(ret != -EEXIST);
+ }
+
+ return ret;
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *info;
+ int ret;
+ bool re_search = false;
+
+ spin_lock(&ctl->tree_lock);
+
+again:
+ ret = 0;
+ if (!bytes)
+ goto out_lock;
+
+ info = tree_search_offset(ctl, offset, 0, 0);
+ if (!info) {
+ /*
+ * oops didn't find an extent that matched the space we wanted
+ * to remove, look for a bitmap instead
+ */
+ info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!info) {
+ /*
+ * If we found a partial bit of our free space in a
+ * bitmap but then couldn't find the other part this may
+ * be a problem, so WARN about it.
+ */
+ WARN_ON(re_search);
+ goto out_lock;
+ }
+ }
+
+ re_search = false;
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info);
+ if (offset == info->offset) {
+ u64 to_free = min(bytes, info->bytes);
+
+ info->bytes -= to_free;
+ info->offset += to_free;
+ if (info->bytes) {
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
+ } else {
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ }
+
+ offset += to_free;
+ bytes -= to_free;
+ goto again;
+ } else {
+ u64 old_end = info->bytes + info->offset;
+
+ info->bytes = offset - info->offset;
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
+ if (ret)
+ goto out_lock;
+
+ /* Not enough bytes in this entry to satisfy us */
+ if (old_end < offset + bytes) {
+ bytes -= old_end - offset;
+ offset = old_end;
+ goto again;
+ } else if (old_end == offset + bytes) {
+ /* all done */
+ goto out_lock;
+ }
+ spin_unlock(&ctl->tree_lock);
+
+ ret = btrfs_add_free_space(block_group, offset + bytes,
+ old_end - (offset + bytes));
+ WARN_ON(ret);
+ goto out;
+ }
+ }
+
+ ret = remove_from_bitmap(ctl, info, &offset, &bytes);
+ if (ret == -EAGAIN) {
+ re_search = true;
+ goto again;
+ }
+out_lock:
+ spin_unlock(&ctl->tree_lock);
+out:
+ return ret;
+}
+
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int count = 0;
+
+ for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (info->bytes >= bytes && !block_group->ro)
+ count++;
+ btrfs_crit(block_group->fs_info,
+ "entry offset %llu, bytes %llu, bitmap %s",
+ info->offset, info->bytes,
+ (info->bitmap) ? "yes" : "no");
+ }
+ btrfs_info(block_group->fs_info, "block group has cluster?: %s",
+ list_empty(&block_group->cluster_list) ? "no" : "yes");
+ btrfs_info(block_group->fs_info,
+ "%d blocks of free space at or bigger than bytes is", count);
+}
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+
+ spin_lock_init(&ctl->tree_lock);
+ ctl->unit = block_group->sectorsize;
+ ctl->start = block_group->key.objectid;
+ ctl->private = block_group;
+ ctl->op = &free_space_op;
+ INIT_LIST_HEAD(&ctl->trimming_ranges);
+ mutex_init(&ctl->cache_writeout_mutex);
+
+ /*
+ * we only want to have 32k of ram per block group for keeping
+ * track of free space, and if we pass 1/2 of that we want to
+ * start converting things over to using bitmaps
+ */
+ ctl->extents_thresh = ((1024 * 32) / 2) /
+ sizeof(struct btrfs_free_space);
+}
+
+/*
+ * for a given cluster, put all of its extents back into the free
+ * space cache. If the block group passed doesn't match the block group
+ * pointed to by the cluster, someone else raced in and freed the
+ * cluster already. In that case, we just return without changing anything
+ */
+static int
+__btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+
+ spin_lock(&cluster->lock);
+ if (cluster->block_group != block_group)
+ goto out;
+
+ cluster->block_group = NULL;
+ cluster->window_start = 0;
+ list_del_init(&cluster->block_group_list);
+
+ node = rb_first(&cluster->root);
+ while (node) {
+ bool bitmap;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ node = rb_next(&entry->offset_index);
+ rb_erase(&entry->offset_index, &cluster->root);
+ RB_CLEAR_NODE(&entry->offset_index);
+
+ bitmap = (entry->bitmap != NULL);
+ if (!bitmap) {
+ try_merge_free_space(ctl, entry, false);
+ steal_from_bitmap(ctl, entry, false);
+ }
+ tree_insert_offset(&ctl->free_space_offset,
+ entry->offset, &entry->offset_index, bitmap);
+ }
+ cluster->root = RB_ROOT;
+
+out:
+ spin_unlock(&cluster->lock);
+ btrfs_put_block_group(block_group);
+ return 0;
+}
+
+static void __btrfs_remove_free_space_cache_locked(
+ struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+
+ while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ free_bitmap(ctl, info);
+ }
+
+ cond_resched_lock(&ctl->tree_lock);
+ }
+}
+
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+ spin_lock(&ctl->tree_lock);
+ __btrfs_remove_free_space_cache_locked(ctl);
+ spin_unlock(&ctl->tree_lock);
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_cluster *cluster;
+ struct list_head *head;
+
+ spin_lock(&ctl->tree_lock);
+ while ((head = block_group->cluster_list.next) !=
+ &block_group->cluster_list) {
+ cluster = list_entry(head, struct btrfs_free_cluster,
+ block_group_list);
+
+ WARN_ON(cluster->block_group != block_group);
+ __btrfs_return_cluster_to_free_space(block_group, cluster);
+
+ cond_resched_lock(&ctl->tree_lock);
+ }
+ __btrfs_remove_free_space_cache_locked(ctl);
+ spin_unlock(&ctl->tree_lock);
+
+}
+
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes, u64 empty_size,
+ u64 *max_extent_size)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry = NULL;
+ u64 bytes_search = bytes + empty_size;
+ u64 ret = 0;
+ u64 align_gap = 0;
+ u64 align_gap_len = 0;
+
+ spin_lock(&ctl->tree_lock);
+ entry = find_free_space(ctl, &offset, &bytes_search,
+ block_group->full_stripe_len, max_extent_size);
+ if (!entry)
+ goto out;
+
+ ret = offset;
+ if (entry->bitmap) {
+ bitmap_clear_bits(ctl, entry, offset, bytes);
+ if (!entry->bytes)
+ free_bitmap(ctl, entry);
+ } else {
+ unlink_free_space(ctl, entry);
+ align_gap_len = offset - entry->offset;
+ align_gap = entry->offset;
+
+ entry->offset = offset + bytes;
+ WARN_ON(entry->bytes < bytes + align_gap_len);
+
+ entry->bytes -= bytes + align_gap_len;
+ if (!entry->bytes)
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ else
+ link_free_space(ctl, entry);
+ }
+out:
+ spin_unlock(&ctl->tree_lock);
+
+ if (align_gap_len)
+ __btrfs_add_free_space(ctl, align_gap, align_gap_len);
+ return ret;
+}
+
+/*
+ * given a cluster, put all of its extents back into the free space
+ * cache. If a block group is passed, this function will only free
+ * a cluster that belongs to the passed block group.
+ *
+ * Otherwise, it'll get a reference on the block group pointed to by the
+ * cluster and remove the cluster from it.
+ */
+int btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster)
+{
+ struct btrfs_free_space_ctl *ctl;
+ int ret;
+
+ /* first, get a safe pointer to the block group */
+ spin_lock(&cluster->lock);
+ if (!block_group) {
+ block_group = cluster->block_group;
+ if (!block_group) {
+ spin_unlock(&cluster->lock);
+ return 0;
+ }
+ } else if (cluster->block_group != block_group) {
+ /* someone else has already freed it don't redo their work */
+ spin_unlock(&cluster->lock);
+ return 0;
+ }
+ atomic_inc(&block_group->count);
+ spin_unlock(&cluster->lock);
+
+ ctl = block_group->free_space_ctl;
+
+ /* now return any extents the cluster had on it */
+ spin_lock(&ctl->tree_lock);
+ ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&ctl->tree_lock);
+
+ /* finally drop our ref */
+ btrfs_put_block_group(block_group);
+ return ret;
+}
+
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ struct btrfs_free_space *entry,
+ u64 bytes, u64 min_start,
+ u64 *max_extent_size)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ int err;
+ u64 search_start = cluster->window_start;
+ u64 search_bytes = bytes;
+ u64 ret = 0;
+
+ search_start = min_start;
+ search_bytes = bytes;
+
+ err = search_bitmap(ctl, entry, &search_start, &search_bytes);
+ if (err) {
+ if (search_bytes > *max_extent_size)
+ *max_extent_size = search_bytes;
+ return 0;
+ }
+
+ ret = search_start;
+ __bitmap_clear_bits(ctl, entry, ret, bytes);
+
+ return ret;
+}
+
+/*
+ * given a cluster, try to allocate 'bytes' from it, returns 0
+ * if it couldn't find anything suitably large, or a logical disk offset
+ * if things worked out
+ */
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster, u64 bytes,
+ u64 min_start, u64 *max_extent_size)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry = NULL;
+ struct rb_node *node;
+ u64 ret = 0;
+
+ spin_lock(&cluster->lock);
+ if (bytes > cluster->max_size)
+ goto out;
+
+ if (cluster->block_group != block_group)
+ goto out;
+
+ node = rb_first(&cluster->root);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ while (1) {
+ if (entry->bytes < bytes && entry->bytes > *max_extent_size)
+ *max_extent_size = entry->bytes;
+
+ if (entry->bytes < bytes ||
+ (!entry->bitmap && entry->offset < min_start)) {
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ }
+
+ if (entry->bitmap) {
+ ret = btrfs_alloc_from_bitmap(block_group,
+ cluster, entry, bytes,
+ cluster->window_start,
+ max_extent_size);
+ if (ret == 0) {
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ }
+ cluster->window_start += bytes;
+ } else {
+ ret = entry->offset;
+
+ entry->offset += bytes;
+ entry->bytes -= bytes;
+ }
+
+ if (entry->bytes == 0)
+ rb_erase(&entry->offset_index, &cluster->root);
+ break;
+ }
+out:
+ spin_unlock(&cluster->lock);
+
+ if (!ret)
+ return 0;
+
+ spin_lock(&ctl->tree_lock);
+
+ ctl->free_space -= bytes;
+ if (entry->bytes == 0) {
+ ctl->free_extents--;
+ if (entry->bitmap) {
+ kfree(entry->bitmap);
+ ctl->total_bitmaps--;
+ ctl->op->recalc_thresholds(ctl);
+ }
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ }
+
+ spin_unlock(&ctl->tree_lock);
+
+ return ret;
+}
+
+static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *entry,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ unsigned long next_zero;
+ unsigned long i;
+ unsigned long want_bits;
+ unsigned long min_bits;
+ unsigned long found_bits;
+ unsigned long start = 0;
+ unsigned long total_found = 0;
+ int ret;
+
+ i = offset_to_bit(entry->offset, ctl->unit,
+ max_t(u64, offset, entry->offset));
+ want_bits = bytes_to_bits(bytes, ctl->unit);
+ min_bits = bytes_to_bits(min_bytes, ctl->unit);
+
+again:
+ found_bits = 0;
+ for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
+ next_zero = find_next_zero_bit(entry->bitmap,
+ BITS_PER_BITMAP, i);
+ if (next_zero - i >= min_bits) {
+ found_bits = next_zero - i;
+ break;
+ }
+ i = next_zero;
+ }
+
+ if (!found_bits)
+ return -ENOSPC;
+
+ if (!total_found) {
+ start = i;
+ cluster->max_size = 0;
+ }
+
+ total_found += found_bits;
+
+ if (cluster->max_size < found_bits * ctl->unit)
+ cluster->max_size = found_bits * ctl->unit;
+
+ if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+ i = next_zero + 1;
+ goto again;
+ }
+
+ cluster->window_start = start * ctl->unit + entry->offset;
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 1);
+ ASSERT(!ret); /* -EEXIST; Logic error */
+
+ trace_btrfs_setup_cluster(block_group, cluster,
+ total_found * ctl->unit, 1);
+ return 0;
+}
+
+/*
+ * This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
+ */
+static noinline int
+setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ struct list_head *bitmaps, u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *first = NULL;
+ struct btrfs_free_space *entry = NULL;
+ struct btrfs_free_space *last;
+ struct rb_node *node;
+ u64 window_free;
+ u64 max_extent;
+ u64 total_size = 0;
+
+ entry = tree_search_offset(ctl, offset, 0, 1);
+ if (!entry)
+ return -ENOSPC;
+
+ /*
+ * We don't want bitmaps, so just move along until we find a normal
+ * extent entry.
+ */
+ while (entry->bitmap || entry->bytes < min_bytes) {
+ if (entry->bitmap && list_empty(&entry->list))
+ list_add_tail(&entry->list, bitmaps);
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ return -ENOSPC;
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ }
+
+ window_free = entry->bytes;
+ max_extent = entry->bytes;
+ first = entry;
+ last = entry;
+
+ for (node = rb_next(&entry->offset_index); node;
+ node = rb_next(&entry->offset_index)) {
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ if (entry->bitmap) {
+ if (list_empty(&entry->list))
+ list_add_tail(&entry->list, bitmaps);
+ continue;
+ }
+
+ if (entry->bytes < min_bytes)
+ continue;
+
+ last = entry;
+ window_free += entry->bytes;
+ if (entry->bytes > max_extent)
+ max_extent = entry->bytes;
+ }
+
+ if (window_free < bytes || max_extent < cont1_bytes)
+ return -ENOSPC;
+
+ cluster->window_start = first->offset;
+
+ node = &first->offset_index;
+
+ /*
+ * now we've found our entries, pull them out of the free space
+ * cache and put them into the cluster rbtree
+ */
+ do {
+ int ret;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ node = rb_next(&entry->offset_index);
+ if (entry->bitmap || entry->bytes < min_bytes)
+ continue;
+
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 0);
+ total_size += entry->bytes;
+ ASSERT(!ret); /* -EEXIST; Logic error */
+ } while (node && entry != last);
+
+ cluster->max_size = max_extent;
+ trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
+ return 0;
+}
+
+/*
+ * This specifically looks for bitmaps that may work in the cluster, we assume
+ * that we have already failed to find extents that will work.
+ */
+static noinline int
+setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ struct list_head *bitmaps, u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ int ret = -ENOSPC;
+ u64 bitmap_offset = offset_to_bitmap(ctl, offset);
+
+ if (ctl->total_bitmaps == 0)
+ return -ENOSPC;
+
+ /*
+ * The bitmap that covers offset won't be in the list unless offset
+ * is just its start offset.
+ */
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+ if (entry->offset != bitmap_offset) {
+ entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
+ if (entry && list_empty(&entry->list))
+ list_add(&entry->list, bitmaps);
+ }
+
+ list_for_each_entry(entry, bitmaps, list) {
+ if (entry->bytes < bytes)
+ continue;
+ ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
+ bytes, cont1_bytes, min_bytes);
+ if (!ret)
+ return 0;
+ }
+
+ /*
+ * The bitmaps list has all the bitmaps that record free space
+ * starting after offset, so no more search is required.
+ */
+ return -ENOSPC;
+}
+
+/*
+ * here we try to find a cluster of blocks in a block group. The goal
+ * is to find at least bytes+empty_size.
+ * We might not find them all in one contiguous area.
+ *
+ * returns zero and sets up cluster if things worked out, otherwise
+ * it returns -enospc
+ */
+int btrfs_find_space_cluster(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 empty_size)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry, *tmp;
+ LIST_HEAD(bitmaps);
+ u64 min_bytes;
+ u64 cont1_bytes;
+ int ret;
+
+ /*
+ * Choose the minimum extent size we'll require for this
+ * cluster. For SSD_SPREAD, don't allow any fragmentation.
+ * For metadata, allow allocates with smaller extents. For
+ * data, keep it dense.
+ */
+ if (btrfs_test_opt(root, SSD_SPREAD)) {
+ cont1_bytes = min_bytes = bytes + empty_size;
+ } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ cont1_bytes = bytes;
+ min_bytes = block_group->sectorsize;
+ } else {
+ cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+ min_bytes = block_group->sectorsize;
+ }
+
+ spin_lock(&ctl->tree_lock);
+
+ /*
+ * If we know we don't have enough space to make a cluster don't even
+ * bother doing all the work to try and find one.
+ */
+ if (ctl->free_space < bytes) {
+ spin_unlock(&ctl->tree_lock);
+ return -ENOSPC;
+ }
+
+ spin_lock(&cluster->lock);
+
+ /* someone already found a cluster, hooray */
+ if (cluster->block_group) {
+ ret = 0;
+ goto out;
+ }
+
+ trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+ min_bytes);
+
+ ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
+ bytes + empty_size,
+ cont1_bytes, min_bytes);
+ if (ret)
+ ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
+ offset, bytes + empty_size,
+ cont1_bytes, min_bytes);
+
+ /* Clear our temporary list */
+ list_for_each_entry_safe(entry, tmp, &bitmaps, list)
+ list_del_init(&entry->list);
+
+ if (!ret) {
+ atomic_inc(&block_group->count);
+ list_add_tail(&cluster->block_group_list,
+ &block_group->cluster_list);
+ cluster->block_group = block_group;
+ } else {
+ trace_btrfs_failed_cluster_setup(block_group);
+ }
+out:
+ spin_unlock(&cluster->lock);
+ spin_unlock(&ctl->tree_lock);
+
+ return ret;
+}
+
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+ spin_lock_init(&cluster->lock);
+ spin_lock_init(&cluster->refill_lock);
+ cluster->root = RB_ROOT;
+ cluster->max_size = 0;
+ INIT_LIST_HEAD(&cluster->block_group_list);
+ cluster->block_group = NULL;
+}
+
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 bytes,
+ u64 reserved_start, u64 reserved_bytes,
+ struct btrfs_trim_range *trim_entry)
+{
+ struct btrfs_space_info *space_info = block_group->space_info;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ int ret;
+ int update = 0;
+ u64 trimmed = 0;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (!block_group->ro) {
+ block_group->reserved += reserved_bytes;
+ space_info->bytes_reserved += reserved_bytes;
+ update = 1;
+ }
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ ret = btrfs_discard_extent(fs_info->extent_root,
+ start, bytes, &trimmed);
+ if (!ret)
+ *total_trimmed += trimmed;
+
+ mutex_lock(&ctl->cache_writeout_mutex);
+ btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+ list_del(&trim_entry->list);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
+ if (update) {
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (block_group->ro)
+ space_info->bytes_readonly += reserved_bytes;
+ block_group->reserved -= reserved_bytes;
+ space_info->bytes_reserved -= reserved_bytes;
+ spin_unlock(&space_info->lock);
+ spin_unlock(&block_group->lock);
+ }
+
+ return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret = 0;
+ u64 extent_start;
+ u64 extent_bytes;
+ u64 bytes;
+
+ while (start < end) {
+ struct btrfs_trim_range trim_entry;
+
+ mutex_lock(&ctl->cache_writeout_mutex);
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ break;
+ }
+
+ entry = tree_search_offset(ctl, start, 0, 1);
+ if (!entry) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ break;
+ }
+
+ /* skip bitmaps */
+ while (entry->bitmap) {
+ node = rb_next(&entry->offset_index);
+ if (!node) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto out;
+ }
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ }
+
+ if (entry->offset >= end) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ break;
+ }
+
+ extent_start = entry->offset;
+ extent_bytes = entry->bytes;
+ start = max(start, extent_start);
+ bytes = min(extent_start + extent_bytes, end) - start;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto next;
+ }
+
+ unlink_free_space(ctl, entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+
+ spin_unlock(&ctl->tree_lock);
+ trim_entry.start = extent_start;
+ trim_entry.bytes = extent_bytes;
+ list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ extent_start, extent_bytes, &trim_entry);
+ if (ret)
+ break;
+next:
+ start += bytes;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+out:
+ return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ int ret = 0;
+ int ret2;
+ u64 bytes;
+ u64 offset = offset_to_bitmap(ctl, start);
+
+ while (offset < end) {
+ bool next_bitmap = false;
+ struct btrfs_trim_range trim_entry;
+
+ mutex_lock(&ctl->cache_writeout_mutex);
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ break;
+ }
+
+ entry = tree_search_offset(ctl, offset, 1, 0);
+ if (!entry) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = minlen;
+ ret2 = search_bitmap(ctl, entry, &start, &bytes);
+ if (ret2 || start >= end) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = min(bytes, end - start);
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto next;
+ }
+
+ bitmap_clear_bits(ctl, entry, start, bytes);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+
+ spin_unlock(&ctl->tree_lock);
+ trim_entry.start = start;
+ trim_entry.bytes = bytes;
+ list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ start, bytes, &trim_entry);
+ if (ret)
+ break;
+next:
+ if (next_bitmap) {
+ offset += BITS_PER_BITMAP * ctl->unit;
+ } else {
+ start += bytes;
+ if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+ offset += BITS_PER_BITMAP * ctl->unit;
+ }
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ atomic_inc(&block_group->trimming);
+ spin_unlock(&block_group->lock);
+
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+ if (ret)
+ goto out;
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+ spin_lock(&block_group->lock);
+ if (atomic_dec_and_test(&block_group->trimming) &&
+ block_group->removed) {
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ spin_unlock(&block_group->lock);
+
+ lock_chunks(block_group->fs_info->chunk_root);
+ em_tree = &block_group->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, block_group->key.objectid,
+ 1);
+ BUG_ON(!em); /* logic error, can't happen */
+ /*
+ * remove_extent_mapping() will delete us from the pinned_chunks
+ * list, which is protected by the chunk mutex.
+ */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ unlock_chunks(block_group->fs_info->chunk_root);
+
+ /* once for us and once for the tree */
+ free_extent_map(em);
+ free_extent_map(em);
+
+ /*
+ * We've left one free space entry and other tasks trimming
+ * this block group have left 1 entry each one. Free them.
+ */
+ __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+
+ return ret;
+}
+
+/*
+ * Find the left-most item in the cache tree, and then return the
+ * smallest inode number in the item.
+ *
+ * Note: the returned inode number may not be the smallest one in
+ * the tree, if the left-most item is a bitmap.
+ */
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
+{
+ struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
+ struct btrfs_free_space *entry = NULL;
+ u64 ino = 0;
+
+ spin_lock(&ctl->tree_lock);
+
+ if (RB_EMPTY_ROOT(&ctl->free_space_offset))
+ goto out;
+
+ entry = rb_entry(rb_first(&ctl->free_space_offset),
+ struct btrfs_free_space, offset_index);
+
+ if (!entry->bitmap) {
+ ino = entry->offset;
+
+ unlink_free_space(ctl, entry);
+ entry->offset++;
+ entry->bytes--;
+ if (!entry->bytes)
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ else
+ link_free_space(ctl, entry);
+ } else {
+ u64 offset = 0;
+ u64 count = 1;
+ int ret;
+
+ ret = search_bitmap(ctl, entry, &offset, &count);
+ /* Logic error; Should be empty if it can't find anything */
+ ASSERT(!ret);
+
+ ino = offset;
+ bitmap_clear_bits(ctl, entry, offset, 1);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+ }
+out:
+ spin_unlock(&ctl->tree_lock);
+
+ return ino;
+}
+
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ struct inode *inode = NULL;
+
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_inode)
+ inode = igrab(root->ino_cache_inode);
+ spin_unlock(&root->ino_cache_lock);
+ if (inode)
+ return inode;
+
+ inode = __lookup_free_space_inode(root, path, 0);
+ if (IS_ERR(inode))
+ return inode;
+
+ spin_lock(&root->ino_cache_lock);
+ if (!btrfs_fs_closing(root->fs_info))
+ root->ino_cache_inode = igrab(inode);
+ spin_unlock(&root->ino_cache_lock);
+
+ return inode;
+}
+
+int create_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ return __create_free_space_inode(root, trans, path,
+ BTRFS_FREE_INO_OBJECTID, 0);
+}
+
+int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_path *path;
+ struct inode *inode;
+ int ret = 0;
+ u64 root_gen = btrfs_root_generation(&root->root_item);
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
+ /*
+ * If we're unmounting then just return, since this does a search on the
+ * normal root and not the commit root and we could deadlock.
+ */
+ if (btrfs_fs_closing(fs_info))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return 0;
+
+ inode = lookup_free_ino_inode(root, path);
+ if (IS_ERR(inode))
+ goto out;
+
+ if (root_gen != BTRFS_I(inode)->generation)
+ goto out_put;
+
+ ret = __load_free_space_cache(root, inode, ctl, path, 0);
+
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "failed to load free ino cache for root %llu",
+ root->root_key.objectid);
+out_put:
+ iput(inode);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct inode *inode)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ int ret;
+ struct btrfs_io_ctl io_ctl;
+ bool release_metadata = true;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
+ memset(&io_ctl, 0, sizeof(io_ctl));
+ ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
+ trans, path, 0);
+ if (!ret) {
+ /*
+ * At this point writepages() didn't error out, so our metadata
+ * reservation is released when the writeback finishes, at
+ * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
+ * with or without an error.
+ */
+ release_metadata = false;
+ ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
+ }
+
+ if (ret) {
+ if (release_metadata)
+ btrfs_delalloc_release_metadata(inode, inode->i_size);
+#ifdef DEBUG
+ btrfs_err(root->fs_info,
+ "failed to write free ino cache for root %llu",
+ root->root_key.objectid);
+#endif
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/*
+ * Use this if you need to make a bitmap or extent entry specifically, it
+ * doesn't do any of the merging that add_free_space does, this acts a lot like
+ * how the free space cache loading stuff works, so you can get really weird
+ * configurations.
+ */
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+ u64 offset, u64 bytes, bool bitmap)
+{
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *info = NULL, *bitmap_info;
+ void *map = NULL;
+ u64 bytes_added;
+ int ret;
+
+again:
+ if (!info) {
+ info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+ if (!info)
+ return -ENOMEM;
+ }
+
+ if (!bitmap) {
+ spin_lock(&ctl->tree_lock);
+ info->offset = offset;
+ info->bytes = bytes;
+ ret = link_free_space(ctl, info);
+ spin_unlock(&ctl->tree_lock);
+ if (ret)
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ return ret;
+ }
+
+ if (!map) {
+ map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!map) {
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ return -ENOMEM;
+ }
+ }
+
+ spin_lock(&ctl->tree_lock);
+ bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!bitmap_info) {
+ info->bitmap = map;
+ map = NULL;
+ add_new_bitmap(ctl, info, offset);
+ bitmap_info = info;
+ info = NULL;
+ }
+
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+ bytes -= bytes_added;
+ offset += bytes_added;
+ spin_unlock(&ctl->tree_lock);
+
+ if (bytes)
+ goto again;
+
+ if (info)
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ if (map)
+ kfree(map);
+ return 0;
+}
+
+/*
+ * Checks to see if the given range is in the free space cache. This is really
+ * just used to check the absence of space, so if there is free space in the
+ * range at all we will return 1.
+ */
+int test_check_exists(struct btrfs_block_group_cache *cache,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *info;
+ int ret = 0;
+
+ spin_lock(&ctl->tree_lock);
+ info = tree_search_offset(ctl, offset, 0, 0);
+ if (!info) {
+ info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+ 1, 0);
+ if (!info)
+ goto out;
+ }
+
+have_info:
+ if (info->bitmap) {
+ u64 bit_off, bit_bytes;
+ struct rb_node *n;
+ struct btrfs_free_space *tmp;
+
+ bit_off = offset;
+ bit_bytes = ctl->unit;
+ ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+ if (!ret) {
+ if (bit_off == offset) {
+ ret = 1;
+ goto out;
+ } else if (bit_off > offset &&
+ offset + bytes > bit_off) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+ n = rb_prev(&info->offset_index);
+ while (n) {
+ tmp = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (tmp->offset + tmp->bytes < offset)
+ break;
+ if (offset + bytes < tmp->offset) {
+ n = rb_prev(&info->offset_index);
+ continue;
+ }
+ info = tmp;
+ goto have_info;
+ }
+
+ n = rb_next(&info->offset_index);
+ while (n) {
+ tmp = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (offset + bytes < tmp->offset)
+ break;
+ if (tmp->offset + tmp->bytes < offset) {
+ n = rb_next(&info->offset_index);
+ continue;
+ }
+ info = tmp;
+ goto have_info;
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ if (info->offset == offset) {
+ ret = 1;
+ goto out;
+ }
+
+ if (offset > info->offset && offset < info->offset + info->bytes)
+ ret = 1;
+out:
+ spin_unlock(&ctl->tree_lock);
+ return ret;
+}
+#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000..a16a029ad
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_CACHE
+#define __BTRFS_FREE_SPACE_CACHE
+
+struct btrfs_free_space {
+ struct rb_node offset_index;
+ u64 offset;
+ u64 bytes;
+ unsigned long *bitmap;
+ struct list_head list;
+};
+
+struct btrfs_free_space_ctl {
+ spinlock_t tree_lock;
+ struct rb_root free_space_offset;
+ u64 free_space;
+ int extents_thresh;
+ int free_extents;
+ int total_bitmaps;
+ int unit;
+ u64 start;
+ struct btrfs_free_space_op *op;
+ void *private;
+ struct mutex cache_writeout_mutex;
+ struct list_head trimming_ranges;
+};
+
+struct btrfs_free_space_op {
+ void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
+ bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
+};
+
+struct btrfs_io_ctl;
+
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_block_group_cache
+ *block_group, struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct inode *inode);
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+int btrfs_wait_cache_io(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path, u64 offset);
+int btrfs_write_out_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_path *path);
+int create_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path);
+int load_free_ino_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct inode *inode);
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+ u64 bytenr, u64 size);
+static inline int
+btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size)
+{
+ return __btrfs_add_free_space(block_group->free_space_ctl,
+ bytenr, size);
+}
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size);
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+ *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes, u64 empty_size,
+ u64 *max_extent_size);
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytes);
+int btrfs_find_space_cluster(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 empty_size);
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster, u64 bytes,
+ u64 min_start, u64 *max_extent_size);
+int btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster);
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen);
+
+/* Support functions for runnint our sanity tests */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+ u64 offset, u64 bytes, bool bitmap);
+int test_check_exists(struct btrfs_block_group_cache *cache,
+ u64 offset, u64 bytes);
+#endif
+
+#endif
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
new file mode 100644
index 000000000..aae520b2a
--- /dev/null
+++ b/fs/btrfs/hash.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include "hash.h"
+
+static struct crypto_shash *tfm;
+
+int __init btrfs_hash_init(void)
+{
+ tfm = crypto_alloc_shash("crc32c", 0, 0);
+
+ return PTR_ERR_OR_ZERO(tfm);
+}
+
+void btrfs_hash_exit(void)
+{
+ crypto_free_shash(tfm);
+}
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
+{
+ SHASH_DESC_ON_STACK(shash, tfm);
+ u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ int err;
+
+ shash->tfm = tfm;
+ shash->flags = 0;
+ *ctx = crc;
+
+ err = crypto_shash_update(shash, address, length);
+ BUG_ON(err);
+
+ return *ctx;
+}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000..118a2316e
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __HASH__
+#define __HASH__
+
+int __init btrfs_hash_init(void);
+
+void btrfs_hash_exit(void);
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
+
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return btrfs_crc32c((u32)~1, name, len);
+}
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+ int len)
+{
+ return (u64) btrfs_crc32c(parent_objectid, name, len);
+}
+
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000..265e03c73
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+#include "print-tree.h"
+
+static int find_name_in_backref(struct btrfs_path *path, const char *name,
+ int name_len, struct btrfs_inode_ref **ref_ret)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ unsigned long name_ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int len;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ while (cur_offset < item_size) {
+ ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ len = btrfs_inode_ref_name_len(leaf, ref);
+ name_ptr = (unsigned long)(ref + 1);
+ cur_offset += len + sizeof(*ref);
+ if (len != name_len)
+ continue;
+ if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
+ *ref_ret = ref;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+ const char *name, int name_len,
+ struct btrfs_inode_extref **extref_ret)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_inode_extref *extref;
+ unsigned long ptr;
+ unsigned long name_ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int ref_name_len;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ /*
+ * Search all extended backrefs in this item. We're only
+ * looking through any collisions so most of the time this is
+ * just going to compare against one buffer. If all is well,
+ * we'll return success and the inode ref object.
+ */
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+ name_ptr = (unsigned long)(&extref->name);
+ ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (ref_name_len == name_len &&
+ btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+ (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
+ if (extref_ret)
+ *extref_ret = extref;
+ return 1;
+ }
+
+ cur_offset += ref_name_len + sizeof(*extref);
+ }
+ return 0;
+}
+
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_inode_extref *extref;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+ return NULL;
+ return extref;
+}
+
+static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid,
+ u64 *index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ int ret;
+ int del_len = name_len + sizeof(*extref);
+ unsigned long ptr;
+ unsigned long item_start;
+ u32 item_size;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Sanity check - did we find the right item for this name?
+ * This should always succeed so error here will make the FS
+ * readonly.
+ */
+ if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, name_len, &extref)) {
+ btrfs_std_error(root->fs_info, -ENOENT);
+ ret = -EROFS;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (index)
+ *index = btrfs_inode_extref_index(leaf, extref);
+
+ if (del_len == item_size) {
+ /*
+ * Common case only one ref in the item, remove the
+ * whole item.
+ */
+ ret = btrfs_del_item(trans, root, path);
+ goto out;
+ }
+
+ ptr = (unsigned long)extref;
+ item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ memmove_extent_buffer(leaf, ptr, ptr + del_len,
+ item_size - (ptr + del_len - item_start));
+
+ btrfs_truncate_item(root, path, item_size - del_len, 1);
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+ unsigned long item_start;
+ u32 item_size;
+ u32 sub_item_len;
+ int ret;
+ int search_ext_refs = 0;
+ int del_len = name_len + sizeof(*ref);
+
+ key.objectid = inode_objectid;
+ key.offset = ref_objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = -ENOENT;
+ search_ext_refs = 1;
+ goto out;
+ } else if (ret < 0) {
+ goto out;
+ }
+ if (!find_name_in_backref(path, name, name_len, &ref)) {
+ ret = -ENOENT;
+ search_ext_refs = 1;
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+
+ if (index)
+ *index = btrfs_inode_ref_index(leaf, ref);
+
+ if (del_len == item_size) {
+ ret = btrfs_del_item(trans, root, path);
+ goto out;
+ }
+ ptr = (unsigned long)ref;
+ sub_item_len = name_len + sizeof(*ref);
+ item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+ item_size - (ptr + sub_item_len - item_start));
+ btrfs_truncate_item(root, path, item_size - sub_item_len, 1);
+out:
+ btrfs_free_path(path);
+
+ if (search_ext_refs) {
+ /*
+ * No refs were found, or we could not find the
+ * name in our ref array. Find and remove the extended
+ * inode ref then.
+ */
+ return btrfs_del_inode_extref(trans, root, name, name_len,
+ inode_objectid, ref_objectid, index);
+ }
+
+ return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+ struct btrfs_inode_extref *extref;
+ int ret;
+ int ins_len = name_len + sizeof(*extref);
+ unsigned long ptr;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ ins_len);
+ if (ret == -EEXIST) {
+ if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, name_len, NULL))
+ goto out;
+
+ btrfs_extend_item(root, path, ins_len);
+ ret = 0;
+ }
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_nr(path->slots[0]);
+ ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+ ptr += btrfs_item_size(leaf, item) - ins_len;
+ extref = (struct btrfs_inode_extref *)ptr;
+
+ btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+ btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+ btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+ ptr = (unsigned long)&extref->name;
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ int ret;
+ int ins_len = name_len + sizeof(*ref);
+
+ key.objectid = inode_objectid;
+ key.offset = ref_objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+ path->skip_release_on_error = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ ins_len);
+ if (ret == -EEXIST) {
+ u32 old_size;
+
+ if (find_name_in_backref(path, name, name_len, &ref))
+ goto out;
+
+ old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ btrfs_extend_item(root, path, ins_len);
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+ ptr = (unsigned long)(ref + 1);
+ ret = 0;
+ } else if (ret < 0) {
+ if (ret == -EOVERFLOW) {
+ if (find_name_in_backref(path, name, name_len, &ref))
+ ret = -EEXIST;
+ else
+ ret = -EMLINK;
+ }
+ goto out;
+ } else {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+ ptr = (unsigned long)(ref + 1);
+ }
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+out:
+ btrfs_free_path(path);
+
+ if (ret == -EMLINK) {
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+ /* We ran out of space in the ref array. Need to
+ * add an extended ref. */
+ if (btrfs_super_incompat_flags(disk_super)
+ & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+ ret = btrfs_insert_inode_extref(trans, root, name,
+ name_len,
+ inode_objectid,
+ ref_objectid, index);
+ }
+
+ return ret;
+}
+
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid)
+{
+ struct btrfs_key key;
+ int ret;
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_inode_item));
+ return ret;
+}
+
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod)
+{
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key found_key;
+
+ ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
+ if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
+ location->offset == (u64)-1 && path->slots[0] != 0) {
+ slot = path->slots[0] - 1;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid == location->objectid &&
+ found_key.type == location->type) {
+ path->slots[0]--;
+ return 0;
+ }
+ }
+ return ret;
+}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000..d4a582ac3
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
+#include "transaction.h"
+
+static int caching_kthread(void *data)
+{
+ struct btrfs_root *root = data;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ u64 last = (u64)-1;
+ int slot;
+ int ret;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Since the commit root is read-only, we can safely skip locking. */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = 2;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.offset = 0;
+ key.type = BTRFS_INODE_ITEM_KEY;
+again:
+ /* need to make sure the commit_root doesn't disappear */
+ down_read(&fs_info->commit_root_sem);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ if (btrfs_fs_closing(fs_info))
+ goto out;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+
+ if (need_resched() ||
+ btrfs_transaction_in_commit(fs_info)) {
+ leaf = path->nodes[0];
+
+ if (WARN_ON(btrfs_header_nritems(leaf) == 0))
+ break;
+
+ /*
+ * Save the key so we can advances forward
+ * in the next search.
+ */
+ btrfs_item_key_to_cpu(leaf, &key, 0);
+ btrfs_release_path(path);
+ root->ino_cache_progress = last;
+ up_read(&fs_info->commit_root_sem);
+ schedule_timeout(1);
+ goto again;
+ } else
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.type != BTRFS_INODE_ITEM_KEY)
+ goto next;
+
+ if (key.objectid >= root->highest_objectid)
+ break;
+
+ if (last != (u64)-1 && last + 1 != key.objectid) {
+ __btrfs_add_free_space(ctl, last + 1,
+ key.objectid - last - 1);
+ wake_up(&root->ino_cache_wait);
+ }
+
+ last = key.objectid;
+next:
+ path->slots[0]++;
+ }
+
+ if (last < root->highest_objectid - 1) {
+ __btrfs_add_free_space(ctl, last + 1,
+ root->highest_objectid - last - 1);
+ }
+
+ spin_lock(&root->ino_cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->ino_cache_lock);
+
+ root->ino_cache_progress = (u64)-1;
+ btrfs_unpin_free_ino(root);
+out:
+ wake_up(&root->ino_cache_wait);
+ up_read(&fs_info->commit_root_sem);
+
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+static void start_caching(struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct task_struct *tsk;
+ int ret;
+ u64 objectid;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return;
+
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state != BTRFS_CACHE_NO) {
+ spin_unlock(&root->ino_cache_lock);
+ return;
+ }
+
+ root->ino_cache_state = BTRFS_CACHE_STARTED;
+ spin_unlock(&root->ino_cache_lock);
+
+ ret = load_free_ino_cache(root->fs_info, root);
+ if (ret == 1) {
+ spin_lock(&root->ino_cache_lock);
+ root->ino_cache_state = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->ino_cache_lock);
+ return;
+ }
+
+ /*
+ * It can be quite time-consuming to fill the cache by searching
+ * through the extent tree, and this can keep ino allocation path
+ * waiting. Therefore at start we quickly find out the highest
+ * inode number and we know we can use inode numbers which fall in
+ * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
+ */
+ ret = btrfs_find_free_objectid(root, &objectid);
+ if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
+ __btrfs_add_free_space(ctl, objectid,
+ BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+ }
+
+ tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
+ root->root_key.objectid);
+ if (IS_ERR(tsk)) {
+ btrfs_warn(root->fs_info, "failed to start inode caching task");
+ btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
+ "disabling inode map caching");
+ }
+}
+
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
+{
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return btrfs_find_free_objectid(root, objectid);
+
+again:
+ *objectid = btrfs_find_ino_for_alloc(root);
+
+ if (*objectid != 0)
+ return 0;
+
+ start_caching(root);
+
+ wait_event(root->ino_cache_wait,
+ root->ino_cache_state == BTRFS_CACHE_FINISHED ||
+ root->free_ino_ctl->free_space > 0);
+
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
+ root->free_ino_ctl->free_space == 0)
+ return -ENOSPC;
+ else
+ goto again;
+}
+
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return;
+again:
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
+ __btrfs_add_free_space(pinned, objectid, 1);
+ } else {
+ down_write(&root->fs_info->commit_root_sem);
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
+ spin_unlock(&root->ino_cache_lock);
+ up_write(&root->fs_info->commit_root_sem);
+ goto again;
+ }
+ spin_unlock(&root->ino_cache_lock);
+
+ start_caching(root);
+
+ __btrfs_add_free_space(pinned, objectid, 1);
+
+ up_write(&root->fs_info->commit_root_sem);
+ }
+}
+
+/*
+ * When a transaction is committed, we'll move those inode numbers which are
+ * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
+ * others will just be dropped, because the commit root we were searching has
+ * changed.
+ *
+ * Must be called with root->fs_info->commit_root_sem held
+ */
+void btrfs_unpin_free_ino(struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
+ spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ u64 count;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return;
+
+ while (1) {
+ bool add_to_ctl = true;
+
+ spin_lock(rbroot_lock);
+ n = rb_first(rbroot);
+ if (!n) {
+ spin_unlock(rbroot_lock);
+ break;
+ }
+
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ BUG_ON(info->bitmap); /* Logic error */
+
+ if (info->offset > root->ino_cache_progress)
+ add_to_ctl = false;
+ else if (info->offset + info->bytes > root->ino_cache_progress)
+ count = root->ino_cache_progress - info->offset + 1;
+ else
+ count = info->bytes;
+
+ rb_erase(&info->offset_index, rbroot);
+ spin_unlock(rbroot_lock);
+ if (add_to_ctl)
+ __btrfs_add_free_space(ctl, info->offset, count);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ }
+}
+
+#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+
+/*
+ * The goal is to keep the memory used by the free_ino tree won't
+ * exceed the memory if we use bitmaps only.
+ */
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int max_ino;
+ int max_bitmaps;
+
+ n = rb_last(&ctl->free_space_offset);
+ if (!n) {
+ ctl->extents_thresh = INIT_THRESHOLD;
+ return;
+ }
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+
+ /*
+ * Find the maximum inode number in the filesystem. Note we
+ * ignore the fact that this can be a bitmap, because we are
+ * not doing precise calculation.
+ */
+ max_ino = info->bytes - 1;
+
+ max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
+ if (max_bitmaps <= ctl->total_bitmaps) {
+ ctl->extents_thresh = 0;
+ return;
+ }
+
+ ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
+ PAGE_CACHE_SIZE / sizeof(*info);
+}
+
+/*
+ * We don't fall back to bitmap, if we are below the extents threshold
+ * or this chunk of inode numbers is a big one.
+ */
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ if (ctl->free_extents < ctl->extents_thresh ||
+ info->bytes > INODES_PER_BITMAP / 10)
+ return false;
+
+ return true;
+}
+
+static struct btrfs_free_space_op free_ino_op = {
+ .recalc_thresholds = recalculate_thresholds,
+ .use_bitmap = use_bitmap,
+};
+
+static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+}
+
+static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ /*
+ * We always use extents for two reasons:
+ *
+ * - The pinned tree is only used during the process of caching
+ * work.
+ * - Make code simpler. See btrfs_unpin_free_ino().
+ */
+ return false;
+}
+
+static struct btrfs_free_space_op pinned_free_ino_op = {
+ .recalc_thresholds = pinned_recalc_thresholds,
+ .use_bitmap = pinned_use_bitmap,
+};
+
+void btrfs_init_free_ino_ctl(struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+ spin_lock_init(&ctl->tree_lock);
+ ctl->unit = 1;
+ ctl->start = 0;
+ ctl->private = NULL;
+ ctl->op = &free_ino_op;
+ INIT_LIST_HEAD(&ctl->trimming_ranges);
+ mutex_init(&ctl->cache_writeout_mutex);
+
+ /*
+ * Initially we allow to use 16K of ram to cache chunks of
+ * inode numbers before we resort to bitmaps. This is somewhat
+ * arbitrary, but it will be adjusted in runtime.
+ */
+ ctl->extents_thresh = INIT_THRESHOLD;
+
+ spin_lock_init(&pinned->tree_lock);
+ pinned->unit = 1;
+ pinned->start = 0;
+ pinned->private = NULL;
+ pinned->extents_thresh = 0;
+ pinned->op = &pinned_free_ino_op;
+}
+
+int btrfs_save_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_path *path;
+ struct inode *inode;
+ struct btrfs_block_rsv *rsv;
+ u64 num_bytes;
+ u64 alloc_hint = 0;
+ int ret;
+ int prealloc;
+ bool retry = false;
+
+ /* only fs tree and subvol/snap needs ino cache */
+ if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
+ (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
+ return 0;
+
+ /* Don't save inode cache if we are deleting this root */
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 0;
+
+ if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ rsv = trans->block_rsv;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+ num_bytes = trans->bytes_reserved;
+ /*
+ * 1 item for inode item insertion if need
+ * 4 items for inode item update (in the worst case)
+ * 1 items for slack space if we need do truncation
+ * 1 item for free space object
+ * 3 items for pre-allocation
+ */
+ trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
+ ret = btrfs_block_rsv_add(root, trans->block_rsv,
+ trans->bytes_reserved,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (ret)
+ goto out;
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+ trans->transid, trans->bytes_reserved, 1);
+again:
+ inode = lookup_free_ino_inode(root, path);
+ if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
+ ret = PTR_ERR(inode);
+ goto out_release;
+ }
+
+ if (IS_ERR(inode)) {
+ BUG_ON(retry); /* Logic error */
+ retry = true;
+
+ ret = create_free_ino_inode(root, trans, path);
+ if (ret)
+ goto out_release;
+ goto again;
+ }
+
+ BTRFS_I(inode)->generation = 0;
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
+
+ if (i_size_read(inode) > 0) {
+ ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
+ if (ret) {
+ if (ret != -ENOSPC)
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
+ }
+
+ spin_lock(&root->ino_cache_lock);
+ if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
+ ret = -1;
+ spin_unlock(&root->ino_cache_lock);
+ goto out_put;
+ }
+ spin_unlock(&root->ino_cache_lock);
+
+ spin_lock(&ctl->tree_lock);
+ prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
+ prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
+ prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
+ spin_unlock(&ctl->tree_lock);
+
+ /* Just to make sure we have enough space */
+ prealloc += 8 * PAGE_CACHE_SIZE;
+
+ ret = btrfs_delalloc_reserve_space(inode, prealloc);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
+ prealloc, prealloc, &alloc_hint);
+ if (ret) {
+ btrfs_delalloc_release_space(inode, prealloc);
+ goto out_put;
+ }
+ btrfs_free_reserved_data_space(inode, prealloc);
+
+ ret = btrfs_write_out_ino_cache(root, trans, path, inode);
+out_put:
+ iput(inode);
+out_release:
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+ trans->transid, trans->bytes_reserved, 0);
+ btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
+out:
+ trans->block_rsv = rsv;
+ trans->bytes_reserved = num_bytes;
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *l;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ int slot;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+ search_key.type = -1;
+ search_key.offset = (u64)-1;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0); /* Corruption */
+ if (path->slots[0] > 0) {
+ slot = path->slots[0] - 1;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ *objectid = max_t(u64, found_key.objectid,
+ BTRFS_FIRST_FREE_OBJECTID - 1);
+ } else {
+ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
+{
+ int ret;
+ mutex_lock(&root->objectid_mutex);
+
+ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_find_highest_objectid(root,
+ &root->highest_objectid);
+ if (ret)
+ goto out;
+ }
+
+ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ *objectid = ++root->highest_objectid;
+ ret = 0;
+out:
+ mutex_unlock(&root->objectid_mutex);
+ return ret;
+}
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
new file mode 100644
index 000000000..ddb347bfe
--- /dev/null
+++ b/fs/btrfs/inode-map.h
@@ -0,0 +1,13 @@
+#ifndef __BTRFS_INODE_MAP
+#define __BTRFS_INODE_MAP
+
+void btrfs_init_free_ino_ctl(struct btrfs_root *root);
+void btrfs_unpin_free_ino(struct btrfs_root *root);
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
+int btrfs_save_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans);
+
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+
+#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000..8bb013672
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,9907 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/falloc.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/mount.h>
+#include <linux/btrfs.h>
+#include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "ordered-data.h"
+#include "xattr.h"
+#include "tree-log.h"
+#include "volumes.h"
+#include "compression.h"
+#include "locking.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
+#include "backref.h"
+#include "hash.h"
+#include "props.h"
+#include "qgroup.h"
+
+struct btrfs_iget_args {
+ struct btrfs_key *location;
+ struct btrfs_root *root;
+};
+
+static const struct inode_operations btrfs_dir_inode_operations;
+static const struct inode_operations btrfs_symlink_inode_operations;
+static const struct inode_operations btrfs_dir_ro_inode_operations;
+static const struct inode_operations btrfs_special_inode_operations;
+static const struct inode_operations btrfs_file_inode_operations;
+static const struct address_space_operations btrfs_aops;
+static const struct address_space_operations btrfs_symlink_aops;
+static const struct file_operations btrfs_dir_file_operations;
+static struct extent_io_ops btrfs_extent_io_ops;
+
+static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
+struct kmem_cache *btrfs_trans_handle_cachep;
+struct kmem_cache *btrfs_transaction_cachep;
+struct kmem_cache *btrfs_path_cachep;
+struct kmem_cache *btrfs_free_space_cachep;
+
+#define S_SHIFT 12
+static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
+};
+
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
+static int btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
+static noinline int cow_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written, int unlock);
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+ u64 len, u64 orig_start,
+ u64 block_start, u64 block_len,
+ u64 orig_block_len, u64 ram_bytes,
+ int type);
+
+static int btrfs_dirty_inode(struct inode *inode);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode)
+{
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+}
+#endif
+
+static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ int err;
+
+ err = btrfs_init_acl(trans, inode, dir);
+ if (!err)
+ err = btrfs_xattr_security_init(trans, inode, dir, qstr);
+ return err;
+}
+
+/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree. The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ */
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, int extent_inserted,
+ struct btrfs_root *root, struct inode *inode,
+ u64 start, size_t size, size_t compressed_size,
+ int compress_type,
+ struct page **compressed_pages)
+{
+ struct extent_buffer *leaf;
+ struct page *page = NULL;
+ char *kaddr;
+ unsigned long ptr;
+ struct btrfs_file_extent_item *ei;
+ int err = 0;
+ int ret;
+ size_t cur_size = size;
+ unsigned long offset;
+
+ if (compressed_size && compressed_pages)
+ cur_size = compressed_size;
+
+ inode_add_bytes(inode, size);
+
+ if (!extent_inserted) {
+ struct btrfs_key key;
+ size_t datasize;
+
+ key.objectid = btrfs_ino(inode);
+ key.offset = start;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+
+ datasize = btrfs_file_extent_calc_inline_size(cur_size);
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+ btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+ ptr = btrfs_file_extent_inline_start(ei);
+
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ struct page *cpage;
+ int i = 0;
+ while (compressed_size > 0) {
+ cpage = compressed_pages[i];
+ cur_size = min_t(unsigned long, compressed_size,
+ PAGE_CACHE_SIZE);
+
+ kaddr = kmap_atomic(cpage);
+ write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ kunmap_atomic(kaddr);
+
+ i++;
+ ptr += cur_size;
+ compressed_size -= cur_size;
+ }
+ btrfs_set_file_extent_compression(leaf, ei,
+ compress_type);
+ } else {
+ page = find_get_page(inode->i_mapping,
+ start >> PAGE_CACHE_SHIFT);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ kaddr = kmap_atomic(page);
+ offset = start & (PAGE_CACHE_SIZE - 1);
+ write_extent_buffer(leaf, kaddr + offset, ptr, size);
+ kunmap_atomic(kaddr);
+ page_cache_release(page);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ /*
+ * we're an inline extent, so nobody can
+ * extend the file past i_size without locking
+ * a page we already have locked.
+ *
+ * We must do any isize and inode updates
+ * before we unlock the pages. Otherwise we
+ * could end up racing with unlink.
+ */
+ BTRFS_I(inode)->disk_i_size = inode->i_size;
+ ret = btrfs_update_inode(trans, root, inode);
+
+ return ret;
+fail:
+ return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file. This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.
+ */
+static noinline int cow_file_range_inline(struct btrfs_root *root,
+ struct inode *inode, u64 start,
+ u64 end, size_t compressed_size,
+ int compress_type,
+ struct page **compressed_pages)
+{
+ struct btrfs_trans_handle *trans;
+ u64 isize = i_size_read(inode);
+ u64 actual_end = min(end + 1, isize);
+ u64 inline_len = actual_end - start;
+ u64 aligned_end = ALIGN(end, root->sectorsize);
+ u64 data_len = inline_len;
+ int ret;
+ struct btrfs_path *path;
+ int extent_inserted = 0;
+ u32 extent_item_size;
+
+ if (compressed_size)
+ data_len = compressed_size;
+
+ if (start > 0 ||
+ actual_end > PAGE_CACHE_SIZE ||
+ data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+ (!compressed_size &&
+ (actual_end & (root->sectorsize - 1)) == 0) ||
+ end + 1 < isize ||
+ data_len > root->fs_info->max_inline) {
+ return 1;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ if (compressed_size && compressed_pages)
+ extent_item_size = btrfs_file_extent_calc_inline_size(
+ compressed_size);
+ else
+ extent_item_size = btrfs_file_extent_calc_inline_size(
+ inline_len);
+
+ ret = __btrfs_drop_extents(trans, root, inode, path,
+ start, aligned_end, NULL,
+ 1, 1, extent_item_size, &extent_inserted);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ if (isize > actual_end)
+ inline_len = min_t(u64, isize, actual_end);
+ ret = insert_inline_extent(trans, path, extent_inserted,
+ root, inode, start,
+ inline_len, compressed_size,
+ compress_type, compressed_pages);
+ if (ret && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ } else if (ret == -ENOSPC) {
+ ret = 1;
+ goto out;
+ }
+
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ btrfs_delalloc_release_metadata(inode, end + 1 - start);
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
+out:
+ btrfs_free_path(path);
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+
+struct async_extent {
+ u64 start;
+ u64 ram_size;
+ u64 compressed_size;
+ struct page **pages;
+ unsigned long nr_pages;
+ int compress_type;
+ struct list_head list;
+};
+
+struct async_cow {
+ struct inode *inode;
+ struct btrfs_root *root;
+ struct page *locked_page;
+ u64 start;
+ u64 end;
+ struct list_head extents;
+ struct btrfs_work work;
+};
+
+static noinline int add_async_extent(struct async_cow *cow,
+ u64 start, u64 ram_size,
+ u64 compressed_size,
+ struct page **pages,
+ unsigned long nr_pages,
+ int compress_type)
+{
+ struct async_extent *async_extent;
+
+ async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+ BUG_ON(!async_extent); /* -ENOMEM */
+ async_extent->start = start;
+ async_extent->ram_size = ram_size;
+ async_extent->compressed_size = compressed_size;
+ async_extent->pages = pages;
+ async_extent->nr_pages = nr_pages;
+ async_extent->compress_type = compress_type;
+ list_add_tail(&async_extent->list, &cow->extents);
+ return 0;
+}
+
+static inline int inode_need_compress(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /* force compress */
+ if (btrfs_test_opt(root, FORCE_COMPRESS))
+ return 1;
+ /* bad compression ratios */
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ return 0;
+ if (btrfs_test_opt(root, COMPRESS) ||
+ BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
+ BTRFS_I(inode)->force_compress)
+ return 1;
+ return 0;
+}
+
+/*
+ * we create compressed extents in two phases. The first
+ * phase compresses a range of pages that have already been
+ * locked (both pages and state bits are locked).
+ *
+ * This is done inside an ordered work queue, and the compression
+ * is spread across many cpus. The actual IO submission is step
+ * two, and the ordered work queue takes care of making sure that
+ * happens in the same order things were put onto the queue by
+ * writepages and friends.
+ *
+ * If this code finds it can't get good compression, it puts an
+ * entry onto the work queue to write the uncompressed bytes. This
+ * makes sure that both compressed inodes and uncompressed inodes
+ * are written in the same order that the flusher thread sent them
+ * down.
+ */
+static noinline void compress_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end,
+ struct async_cow *async_cow,
+ int *num_added)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 num_bytes;
+ u64 blocksize = root->sectorsize;
+ u64 actual_end;
+ u64 isize = i_size_read(inode);
+ int ret = 0;
+ struct page **pages = NULL;
+ unsigned long nr_pages;
+ unsigned long nr_pages_ret = 0;
+ unsigned long total_compressed = 0;
+ unsigned long total_in = 0;
+ unsigned long max_compressed = 128 * 1024;
+ unsigned long max_uncompressed = 128 * 1024;
+ int i;
+ int will_compress;
+ int compress_type = root->fs_info->compress_type;
+ int redirty = 0;
+
+ /* if this is a small write inside eof, kick off a defrag */
+ if ((end - start + 1) < 16 * 1024 &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ btrfs_add_inode_defrag(NULL, inode);
+
+ actual_end = min_t(u64, isize, end + 1);
+again:
+ will_compress = 0;
+ nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+ nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+
+ /*
+ * we don't want to send crud past the end of i_size through
+ * compression, that's just a waste of CPU time. So, if the
+ * end of the file is before the start of our current
+ * requested range of bytes, we bail out to the uncompressed
+ * cleanup code that can deal with all of this.
+ *
+ * It isn't really the fastest way to fix things, but this is a
+ * very uncommon corner.
+ */
+ if (actual_end <= start)
+ goto cleanup_and_bail_uncompressed;
+
+ total_compressed = actual_end - start;
+
+ /*
+ * skip compression for a small file range(<=blocksize) that
+ * isn't an inline extent, since it dosen't save disk space at all.
+ */
+ if (total_compressed <= blocksize &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ goto cleanup_and_bail_uncompressed;
+
+ /* we want to make sure that amount of ram required to uncompress
+ * an extent is reasonable, so we limit the total size in ram
+ * of a compressed extent to 128k. This is a crucial number
+ * because it also controls how easily we can spread reads across
+ * cpus for decompression.
+ *
+ * We also want to make sure the amount of IO required to do
+ * a random read is reasonably small, so we limit the size of
+ * a compressed extent to 128k.
+ */
+ total_compressed = min(total_compressed, max_uncompressed);
+ num_bytes = ALIGN(end - start + 1, blocksize);
+ num_bytes = max(blocksize, num_bytes);
+ total_in = 0;
+ ret = 0;
+
+ /*
+ * we do compression for mount -o compress and when the
+ * inode has not been flagged as nocompress. This flag can
+ * change at any time if we discover bad compression ratios.
+ */
+ if (inode_need_compress(inode)) {
+ WARN_ON(pages);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages) {
+ /* just bail out to the uncompressed code */
+ goto cont;
+ }
+
+ if (BTRFS_I(inode)->force_compress)
+ compress_type = BTRFS_I(inode)->force_compress;
+
+ /*
+ * we need to call clear_page_dirty_for_io on each
+ * page in the range. Otherwise applications with the file
+ * mmap'd can wander in and change the page contents while
+ * we are compressing them.
+ *
+ * If the compression fails for any reason, we set the pages
+ * dirty again later on.
+ */
+ extent_range_clear_dirty_for_io(inode, start, end);
+ redirty = 1;
+ ret = btrfs_compress_pages(compress_type,
+ inode->i_mapping, start,
+ total_compressed, pages,
+ nr_pages, &nr_pages_ret,
+ &total_in,
+ &total_compressed,
+ max_compressed);
+
+ if (!ret) {
+ unsigned long offset = total_compressed &
+ (PAGE_CACHE_SIZE - 1);
+ struct page *page = pages[nr_pages_ret - 1];
+ char *kaddr;
+
+ /* zero the tail end of the last page, we might be
+ * sending it down to disk
+ */
+ if (offset) {
+ kaddr = kmap_atomic(page);
+ memset(kaddr + offset, 0,
+ PAGE_CACHE_SIZE - offset);
+ kunmap_atomic(kaddr);
+ }
+ will_compress = 1;
+ }
+ }
+cont:
+ if (start == 0) {
+ /* lets try to make an inline extent */
+ if (ret || total_in < (actual_end - start)) {
+ /* we didn't compress the entire range, try
+ * to make an uncompressed inline extent.
+ */
+ ret = cow_file_range_inline(root, inode, start, end,
+ 0, 0, NULL);
+ } else {
+ /* try making a compressed inline extent */
+ ret = cow_file_range_inline(root, inode, start, end,
+ total_compressed,
+ compress_type, pages);
+ }
+ if (ret <= 0) {
+ unsigned long clear_flags = EXTENT_DELALLOC |
+ EXTENT_DEFRAG;
+ unsigned long page_error_op;
+
+ clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+ page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
+
+ /*
+ * inline extent creation worked or returned error,
+ * we don't need to create any more async work items.
+ * Unlock and free up our temp pages.
+ */
+ extent_clear_unlock_delalloc(inode, start, end, NULL,
+ clear_flags, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK |
+ page_error_op |
+ PAGE_END_WRITEBACK);
+ goto free_pages_out;
+ }
+ }
+
+ if (will_compress) {
+ /*
+ * we aren't doing an inline extent round the compressed size
+ * up to a block size boundary so the allocator does sane
+ * things
+ */
+ total_compressed = ALIGN(total_compressed, blocksize);
+
+ /*
+ * one last check to make sure the compression is really a
+ * win, compare the page count read with the blocks on disk
+ */
+ total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
+ if (total_compressed >= total_in) {
+ will_compress = 0;
+ } else {
+ num_bytes = total_in;
+ }
+ }
+ if (!will_compress && pages) {
+ /*
+ * the compression code ran but failed to make things smaller,
+ * free any pages it allocated and our page pointer array
+ */
+ for (i = 0; i < nr_pages_ret; i++) {
+ WARN_ON(pages[i]->mapping);
+ page_cache_release(pages[i]);
+ }
+ kfree(pages);
+ pages = NULL;
+ total_compressed = 0;
+ nr_pages_ret = 0;
+
+ /* flag the file so we don't compress in the future */
+ if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+ !(BTRFS_I(inode)->force_compress)) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ }
+ }
+ if (will_compress) {
+ *num_added += 1;
+
+ /* the async work queues will take care of doing actual
+ * allocation on disk for these compressed pages,
+ * and will submit them to the elevator.
+ */
+ add_async_extent(async_cow, start, num_bytes,
+ total_compressed, pages, nr_pages_ret,
+ compress_type);
+
+ if (start + num_bytes < end) {
+ start += num_bytes;
+ pages = NULL;
+ cond_resched();
+ goto again;
+ }
+ } else {
+cleanup_and_bail_uncompressed:
+ /*
+ * No compression, but we still need to write the pages in
+ * the file we've been given so far. redirty the locked
+ * page if it corresponds to our extent and set things up
+ * for the async work queue to run cow_file_range to do
+ * the normal delalloc dance
+ */
+ if (page_offset(locked_page) >= start &&
+ page_offset(locked_page) <= end) {
+ __set_page_dirty_nobuffers(locked_page);
+ /* unlocked later on in the async handlers */
+ }
+ if (redirty)
+ extent_range_redirty_for_io(inode, start, end);
+ add_async_extent(async_cow, start, end - start + 1,
+ 0, NULL, 0, BTRFS_COMPRESS_NONE);
+ *num_added += 1;
+ }
+
+ return;
+
+free_pages_out:
+ for (i = 0; i < nr_pages_ret; i++) {
+ WARN_ON(pages[i]->mapping);
+ page_cache_release(pages[i]);
+ }
+ kfree(pages);
+}
+
+static void free_async_extent_pages(struct async_extent *async_extent)
+{
+ int i;
+
+ if (!async_extent->pages)
+ return;
+
+ for (i = 0; i < async_extent->nr_pages; i++) {
+ WARN_ON(async_extent->pages[i]->mapping);
+ page_cache_release(async_extent->pages[i]);
+ }
+ kfree(async_extent->pages);
+ async_extent->nr_pages = 0;
+ async_extent->pages = NULL;
+}
+
+/*
+ * phase two of compressed writeback. This is the ordered portion
+ * of the code, which only gets called in the order the work was
+ * queued. We walk all the async extents created by compress_file_range
+ * and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct inode *inode,
+ struct async_cow *async_cow)
+{
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ struct btrfs_key ins;
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree;
+ int ret = 0;
+
+again:
+ while (!list_empty(&async_cow->extents)) {
+ async_extent = list_entry(async_cow->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+retry:
+ /* did the compression code fall back to uncompressed IO? */
+ if (!async_extent->pages) {
+ int page_started = 0;
+ unsigned long nr_written = 0;
+
+ lock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1);
+
+ /* allocate blocks */
+ ret = cow_file_range(inode, async_cow->locked_page,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ &page_started, &nr_written, 0);
+
+ /* JDM XXX */
+
+ /*
+ * if page_started, cow_file_range inserted an
+ * inline extent and took care of all the unlocking
+ * and IO for us. Otherwise, we need to submit
+ * all those pages down to the drive.
+ */
+ if (!page_started && !ret)
+ extent_write_locked_range(io_tree,
+ inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ btrfs_get_extent,
+ WB_SYNC_ALL);
+ else if (ret)
+ unlock_page(async_cow->locked_page);
+ kfree(async_extent);
+ cond_resched();
+ continue;
+ }
+
+ lock_extent(io_tree, async_extent->start,
+ async_extent->start + async_extent->ram_size - 1);
+
+ ret = btrfs_reserve_extent(root,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, alloc_hint, &ins, 1, 1);
+ if (ret) {
+ free_async_extent_pages(async_extent);
+
+ if (ret == -ENOSPC) {
+ unlock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1);
+
+ /*
+ * we need to redirty the pages if we decide to
+ * fallback to uncompressed IO, otherwise we
+ * will not submit these pages down to lower
+ * layers.
+ */
+ extent_range_redirty_for_io(inode,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1);
+
+ goto retry;
+ }
+ goto out_free;
+ }
+ /*
+ * here we're doing allocation and writeback of the
+ * compressed pages
+ */
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
+
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto out_free_reserve;
+ }
+ em->start = async_extent->start;
+ em->len = async_extent->ram_size;
+ em->orig_start = em->start;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
+ em->ram_bytes = async_extent->ram_size;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->compress_type = async_extent->compress_type;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->generation = -1;
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
+ }
+
+ if (ret)
+ goto out_free_reserve;
+
+ ret = btrfs_add_ordered_extent_compress(inode,
+ async_extent->start,
+ ins.objectid,
+ async_extent->ram_size,
+ ins.offset,
+ BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
+ goto out_free_reserve;
+ }
+
+ /*
+ * clear dirty, set writeback and unlock the pages.
+ */
+ extent_clear_unlock_delalloc(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK);
+ ret = btrfs_submit_compressed_write(inode,
+ async_extent->start,
+ async_extent->ram_size,
+ ins.objectid,
+ ins.offset, async_extent->pages,
+ async_extent->nr_pages);
+ if (ret) {
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct page *p = async_extent->pages[0];
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ p->mapping = inode->i_mapping;
+ tree->ops->writepage_end_io_hook(p, start, end,
+ NULL, 0);
+ p->mapping = NULL;
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK |
+ PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
+ alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ cond_resched();
+ }
+ return;
+out_free_reserve:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+out_free:
+ extent_clear_unlock_delalloc(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
+ PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+ PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ kfree(async_extent);
+ goto again;
+}
+
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+ u64 num_bytes)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ u64 alloc_hint = 0;
+
+ read_lock(&em_tree->lock);
+ em = search_extent_mapping(em_tree, start, num_bytes);
+ if (em) {
+ /*
+ * if block start isn't an actual block number then find the
+ * first block in this inode and use that as a hint. If that
+ * block is also bogus then just don't worry about it.
+ */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ em = search_extent_mapping(em_tree, 0, 0);
+ if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ if (em)
+ free_extent_map(em);
+ } else {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
+ }
+ read_unlock(&em_tree->lock);
+
+ return alloc_hint;
+}
+
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code. The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already. We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it. It may be clean and already done with
+ * IO when we return.
+ */
+static noinline int cow_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written,
+ int unlock)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 alloc_hint = 0;
+ u64 num_bytes;
+ unsigned long ram_size;
+ u64 disk_num_bytes;
+ u64 cur_alloc_size;
+ u64 blocksize = root->sectorsize;
+ struct btrfs_key ins;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ int ret = 0;
+
+ if (btrfs_is_free_space_inode(inode)) {
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ num_bytes = ALIGN(end - start + 1, blocksize);
+ num_bytes = max(blocksize, num_bytes);
+ disk_num_bytes = num_bytes;
+
+ /* if this is a small write inside eof, kick off defrag */
+ if (num_bytes < 64 * 1024 &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ btrfs_add_inode_defrag(NULL, inode);
+
+ if (start == 0) {
+ /* lets try to make an inline extent */
+ ret = cow_file_range_inline(root, inode, start, end, 0, 0,
+ NULL);
+ if (ret == 0) {
+ extent_clear_unlock_delalloc(inode, start, end, NULL,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DEFRAG, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK);
+
+ *nr_written = *nr_written +
+ (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+ *page_started = 1;
+ goto out;
+ } else if (ret < 0) {
+ goto out_unlock;
+ }
+ }
+
+ BUG_ON(disk_num_bytes >
+ btrfs_super_total_bytes(root->fs_info->super_copy));
+
+ alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
+ btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+
+ while (disk_num_bytes > 0) {
+ unsigned long op;
+
+ cur_alloc_size = disk_num_bytes;
+ ret = btrfs_reserve_extent(root, cur_alloc_size,
+ root->sectorsize, 0, alloc_hint,
+ &ins, 1, 1);
+ if (ret < 0)
+ goto out_unlock;
+
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto out_reserve;
+ }
+ em->start = start;
+ em->orig_start = em->start;
+ ram_size = ins.offset;
+ em->len = ins.offset;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
+ em->ram_bytes = ram_size;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->generation = -1;
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, start,
+ start + ram_size - 1, 0);
+ }
+ if (ret)
+ goto out_reserve;
+
+ cur_alloc_size = ins.offset;
+ ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
+ ram_size, cur_alloc_size, 0);
+ if (ret)
+ goto out_drop_extent_cache;
+
+ if (root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ ret = btrfs_reloc_clone_csums(inode, start,
+ cur_alloc_size);
+ if (ret)
+ goto out_drop_extent_cache;
+ }
+
+ if (disk_num_bytes < cur_alloc_size)
+ break;
+
+ /* we're not doing compressed IO, don't unlock the first
+ * page (which the caller expects to stay locked), don't
+ * clear any dirty bits and don't set any writeback bits
+ *
+ * Do set the Private2 bit so we know this page was properly
+ * setup for writepage
+ */
+ op = unlock ? PAGE_UNLOCK : 0;
+ op |= PAGE_SET_PRIVATE2;
+
+ extent_clear_unlock_delalloc(inode, start,
+ start + ram_size - 1, locked_page,
+ EXTENT_LOCKED | EXTENT_DELALLOC,
+ op);
+ disk_num_bytes -= cur_alloc_size;
+ num_bytes -= cur_alloc_size;
+ alloc_hint = ins.objectid + ins.offset;
+ start += cur_alloc_size;
+ }
+out:
+ return ret;
+
+out_drop_extent_cache:
+ btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
+out_reserve:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+out_unlock:
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DELALLOC | EXTENT_DEFRAG,
+ PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+ goto out;
+}
+
+/*
+ * work queue call back to started compression on a file and pages
+ */
+static noinline void async_cow_start(struct btrfs_work *work)
+{
+ struct async_cow *async_cow;
+ int num_added = 0;
+ async_cow = container_of(work, struct async_cow, work);
+
+ compress_file_range(async_cow->inode, async_cow->locked_page,
+ async_cow->start, async_cow->end, async_cow,
+ &num_added);
+ if (num_added == 0) {
+ btrfs_add_delayed_iput(async_cow->inode);
+ async_cow->inode = NULL;
+ }
+}
+
+/*
+ * work queue call back to submit previously compressed pages
+ */
+static noinline void async_cow_submit(struct btrfs_work *work)
+{
+ struct async_cow *async_cow;
+ struct btrfs_root *root;
+ unsigned long nr_pages;
+
+ async_cow = container_of(work, struct async_cow, work);
+
+ root = async_cow->root;
+ nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+
+ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
+ 5 * 1024 * 1024 &&
+ waitqueue_active(&root->fs_info->async_submit_wait))
+ wake_up(&root->fs_info->async_submit_wait);
+
+ if (async_cow->inode)
+ submit_compressed_extents(async_cow->inode, async_cow);
+}
+
+static noinline void async_cow_free(struct btrfs_work *work)
+{
+ struct async_cow *async_cow;
+ async_cow = container_of(work, struct async_cow, work);
+ if (async_cow->inode)
+ btrfs_add_delayed_iput(async_cow->inode);
+ kfree(async_cow);
+}
+
+static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ struct async_cow *async_cow;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long nr_pages;
+ u64 cur_end;
+ int limit = 10 * 1024 * 1024;
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
+ 1, 0, NULL, GFP_NOFS);
+ while (start < end) {
+ async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+ BUG_ON(!async_cow); /* -ENOMEM */
+ async_cow->inode = igrab(inode);
+ async_cow->root = root;
+ async_cow->locked_page = locked_page;
+ async_cow->start = start;
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+ !btrfs_test_opt(root, FORCE_COMPRESS))
+ cur_end = end;
+ else
+ cur_end = min(end, start + 512 * 1024 - 1);
+
+ async_cow->end = cur_end;
+ INIT_LIST_HEAD(&async_cow->extents);
+
+ btrfs_init_work(&async_cow->work,
+ btrfs_delalloc_helper,
+ async_cow_start, async_cow_submit,
+ async_cow_free);
+
+ nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+ atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
+
+ btrfs_queue_work(root->fs_info->delalloc_workers,
+ &async_cow->work);
+
+ if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->async_delalloc_pages) <
+ limit));
+ }
+
+ while (atomic_read(&root->fs_info->async_submit_draining) &&
+ atomic_read(&root->fs_info->async_delalloc_pages)) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->async_delalloc_pages) ==
+ 0));
+ }
+
+ *nr_written += nr_pages;
+ start = cur_end + 1;
+ }
+ *page_started = 1;
+ return 0;
+}
+
+static noinline int csum_exist_in_range(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_ordered_sum *sums;
+ LIST_HEAD(list);
+
+ ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
+ bytenr + num_bytes - 1, &list, 0);
+ if (ret == 0 && list_empty(&list))
+ return 0;
+
+ while (!list_empty(&list)) {
+ sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ return 1;
+}
+
+/*
+ * when nowcow writeback call back. This checks for snapshots or COW copies
+ * of the extents that exist in the file, and COWs the file as required.
+ *
+ * If no cow copies or snapshots exist, we write directly to the existing
+ * blocks on disk
+ */
+static noinline int run_delalloc_nocow(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started, int force,
+ unsigned long *nr_written)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key found_key;
+ u64 cow_start;
+ u64 cur_offset;
+ u64 extent_end;
+ u64 extent_offset;
+ u64 disk_bytenr;
+ u64 num_bytes;
+ u64 disk_num_bytes;
+ u64 ram_bytes;
+ int extent_type;
+ int ret, err;
+ int type;
+ int nocow;
+ int check_prev = 1;
+ bool nolock;
+ u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ return -ENOMEM;
+ }
+
+ nolock = btrfs_is_free_space_inode(inode);
+
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root);
+ else
+ trans = btrfs_join_transaction(root);
+
+ if (IS_ERR(trans)) {
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ cow_start = (u64)-1;
+ cur_offset = start;
+ while (1) {
+ ret = btrfs_lookup_file_extent(trans, root, path, ino,
+ cur_offset, 0);
+ if (ret < 0)
+ goto error;
+ if (ret > 0 && path->slots[0] > 0 && check_prev) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key,
+ path->slots[0] - 1);
+ if (found_key.objectid == ino &&
+ found_key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+ check_prev = 0;
+next_slot:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto error;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ nocow = 0;
+ disk_bytenr = 0;
+ num_bytes = 0;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid > ino ||
+ found_key.type > BTRFS_EXTENT_DATA_KEY ||
+ found_key.offset > end)
+ break;
+
+ if (found_key.offset > cur_offset) {
+ extent_end = found_key.offset;
+ extent_type = 0;
+ goto out_check;
+ }
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ extent_offset = btrfs_file_extent_offset(leaf, fi);
+ extent_end = found_key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ disk_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
+ if (extent_end <= start) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (disk_bytenr == 0)
+ goto out_check;
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out_check;
+ if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
+ goto out_check;
+ if (btrfs_extent_readonly(root, disk_bytenr))
+ goto out_check;
+ if (btrfs_cross_ref_exist(trans, root, ino,
+ found_key.offset -
+ extent_offset, disk_bytenr))
+ goto out_check;
+ disk_bytenr += extent_offset;
+ disk_bytenr += cur_offset - found_key.offset;
+ num_bytes = min(end + 1, extent_end) - cur_offset;
+ /*
+ * if there are pending snapshots for this root,
+ * we fall into common COW way.
+ */
+ if (!nolock) {
+ err = btrfs_start_write_no_snapshoting(root);
+ if (!err)
+ goto out_check;
+ }
+ /*
+ * force cow if csum exists in the range.
+ * this ensure that csum for a given extent are
+ * either valid or do not exist.
+ */
+ if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ goto out_check;
+ nocow = 1;
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ extent_end = found_key.offset +
+ btrfs_file_extent_inline_len(leaf,
+ path->slots[0], fi);
+ extent_end = ALIGN(extent_end, root->sectorsize);
+ } else {
+ BUG_ON(1);
+ }
+out_check:
+ if (extent_end <= start) {
+ path->slots[0]++;
+ if (!nolock && nocow)
+ btrfs_end_write_no_snapshoting(root);
+ goto next_slot;
+ }
+ if (!nocow) {
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = extent_end;
+ if (cur_offset > end)
+ break;
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ btrfs_release_path(path);
+ if (cow_start != (u64)-1) {
+ ret = cow_file_range(inode, locked_page,
+ cow_start, found_key.offset - 1,
+ page_started, nr_written, 1);
+ if (ret) {
+ if (!nolock && nocow)
+ btrfs_end_write_no_snapshoting(root);
+ goto error;
+ }
+ cow_start = (u64)-1;
+ }
+
+ if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ struct extent_map *em;
+ struct extent_map_tree *em_tree;
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ em = alloc_extent_map();
+ BUG_ON(!em); /* -ENOMEM */
+ em->start = cur_offset;
+ em->orig_start = found_key.offset - extent_offset;
+ em->len = num_bytes;
+ em->block_len = num_bytes;
+ em->block_start = disk_bytenr;
+ em->orig_block_len = disk_num_bytes;
+ em->ram_bytes = ram_bytes;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ set_bit(EXTENT_FLAG_FILLING, &em->flags);
+ em->generation = -1;
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ }
+ type = BTRFS_ORDERED_PREALLOC;
+ } else {
+ type = BTRFS_ORDERED_NOCOW;
+ }
+
+ ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
+ num_bytes, num_bytes, type);
+ BUG_ON(ret); /* -ENOMEM */
+
+ if (root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ ret = btrfs_reloc_clone_csums(inode, cur_offset,
+ num_bytes);
+ if (ret) {
+ if (!nolock && nocow)
+ btrfs_end_write_no_snapshoting(root);
+ goto error;
+ }
+ }
+
+ extent_clear_unlock_delalloc(inode, cur_offset,
+ cur_offset + num_bytes - 1,
+ locked_page, EXTENT_LOCKED |
+ EXTENT_DELALLOC, PAGE_UNLOCK |
+ PAGE_SET_PRIVATE2);
+ if (!nolock && nocow)
+ btrfs_end_write_no_snapshoting(root);
+ cur_offset = extent_end;
+ if (cur_offset > end)
+ break;
+ }
+ btrfs_release_path(path);
+
+ if (cur_offset <= end && cow_start == (u64)-1) {
+ cow_start = cur_offset;
+ cur_offset = end;
+ }
+
+ if (cow_start != (u64)-1) {
+ ret = cow_file_range(inode, locked_page, cow_start, end,
+ page_started, nr_written, 1);
+ if (ret)
+ goto error;
+ }
+
+error:
+ err = btrfs_end_transaction(trans, root);
+ if (!ret)
+ ret = err;
+
+ if (ret && cur_offset < end)
+ extent_clear_unlock_delalloc(inode, cur_offset, end,
+ locked_page, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_DEFRAG |
+ EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
+ PAGE_CLEAR_DIRTY |
+ PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+{
+
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+ return 0;
+
+ /*
+ * @defrag_bytes is a hint value, no spinlock held here,
+ * if is not zero, it means the file is defragging.
+ * Force cow if given extent needs to be defragged.
+ */
+ if (BTRFS_I(inode)->defrag_bytes &&
+ test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+ EXTENT_DEFRAG, 0, NULL))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * extent_io.c call back to do delayed allocation processing
+ */
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ int ret;
+ int force_cow = need_force_cow(inode, start, end);
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+ ret = run_delalloc_nocow(inode, locked_page, start, end,
+ page_started, 1, nr_written);
+ } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ ret = run_delalloc_nocow(inode, locked_page, start, end,
+ page_started, 0, nr_written);
+ } else if (!inode_need_compress(inode)) {
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
+ } else {
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags);
+ ret = cow_file_range_async(inode, locked_page, start, end,
+ page_started, nr_written);
+ }
+ return ret;
+}
+
+static void btrfs_split_extent_hook(struct inode *inode,
+ struct extent_state *orig, u64 split)
+{
+ u64 size;
+
+ /* not delalloc, ignore it */
+ if (!(orig->state & EXTENT_DELALLOC))
+ return;
+
+ size = orig->end - orig->start + 1;
+ if (size > BTRFS_MAX_EXTENT_SIZE) {
+ u64 num_extents;
+ u64 new_size;
+
+ /*
+ * See the explanation in btrfs_merge_extent_hook, the same
+ * applies here, just in reverse.
+ */
+ new_size = orig->end - split + 1;
+ num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ new_size = split - orig->start;
+ num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) >= num_extents)
+ return;
+ }
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+/*
+ * extent_io.c merge_extent_hook, used to track merged delayed allocation
+ * extents so we can keep track of new extents that are just merged onto old
+ * extents, such as when we are doing sequential writes, so we can properly
+ * account for the metadata space we'll need.
+ */
+static void btrfs_merge_extent_hook(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other)
+{
+ u64 new_size, old_size;
+ u64 num_extents;
+
+ /* not delalloc, ignore it */
+ if (!(other->state & EXTENT_DELALLOC))
+ return;
+
+ if (new->start > other->start)
+ new_size = new->end - other->start + 1;
+ else
+ new_size = other->end - new->start + 1;
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return;
+ }
+
+ /*
+ * We have to add up either side to figure out how many extents were
+ * accounted for before we merged into one big extent. If the number of
+ * extents we accounted for is <= the amount we need for the new range
+ * then we can return, otherwise drop. Think of it like this
+ *
+ * [ 4k][MAX_SIZE]
+ *
+ * So we've grown the extent by a MAX_SIZE extent, this would mean we
+ * need 2 outstanding extents, on one side we have 1 and the other side
+ * we have 1 so they are == and we can return. But in this case
+ *
+ * [MAX_SIZE+4k][MAX_SIZE+4k]
+ *
+ * Each range on their own accounts for 2 extents, but merged together
+ * they are only 3 extents worth of accounting, so we need to drop in
+ * this case.
+ */
+ old_size = other->end - other->start + 1;
+ num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ old_size = new->end - new->start + 1;
+ num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+
+ if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) >= num_extents)
+ return;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+ struct inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+ &root->delalloc_inodes);
+ set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ root->nr_delalloc_inodes++;
+ if (root->nr_delalloc_inodes == 1) {
+ spin_lock(&root->fs_info->delalloc_root_lock);
+ BUG_ON(!list_empty(&root->delalloc_root));
+ list_add_tail(&root->delalloc_root,
+ &root->fs_info->delalloc_roots);
+ spin_unlock(&root->fs_info->delalloc_root_lock);
+ }
+ }
+ spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+ struct inode *inode)
+{
+ spin_lock(&root->delalloc_lock);
+ if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags);
+ root->nr_delalloc_inodes--;
+ if (!root->nr_delalloc_inodes) {
+ spin_lock(&root->fs_info->delalloc_root_lock);
+ BUG_ON(list_empty(&root->delalloc_root));
+ list_del_init(&root->delalloc_root);
+ spin_unlock(&root->fs_info->delalloc_root_lock);
+ }
+ }
+ spin_unlock(&root->delalloc_lock);
+}
+
+/*
+ * extent_io.c set_bit_hook, used to track delayed allocation
+ * bytes in this file, and to maintain the list of inodes that
+ * have pending delalloc work to be done.
+ */
+static void btrfs_set_bit_hook(struct inode *inode,
+ struct extent_state *state, unsigned *bits)
+{
+
+ if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+ WARN_ON(1);
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testing for the DELALLOC
+ * bit, which is only set or cleared with irqs on
+ */
+ if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 len = state->end + 1 - state->start;
+ bool do_list = !btrfs_is_free_space_inode(inode);
+
+ if (*bits & EXTENT_FIRST_DELALLOC) {
+ *bits &= ~EXTENT_FIRST_DELALLOC;
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+
+ /* For sanity tests */
+ if (btrfs_test_is_dummy_root(root))
+ return;
+
+ __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+ root->fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->delalloc_bytes += len;
+ if (*bits & EXTENT_DEFRAG)
+ BTRFS_I(inode)->defrag_bytes += len;
+ if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_add_delalloc_inodes(root, inode);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+}
+
+/*
+ * extent_io.c clear_bit_hook, see set_bit_hook for why
+ */
+static void btrfs_clear_bit_hook(struct inode *inode,
+ struct extent_state *state,
+ unsigned *bits)
+{
+ u64 len = state->end + 1 - state->start;
+ u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
+ BTRFS_MAX_EXTENT_SIZE);
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+ BTRFS_I(inode)->defrag_bytes -= len;
+ spin_unlock(&BTRFS_I(inode)->lock);
+
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testing for the DELALLOC
+ * bit, which is only set or cleared with irqs on
+ */
+ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ bool do_list = !btrfs_is_free_space_inode(inode);
+
+ if (*bits & EXTENT_FIRST_DELALLOC) {
+ *bits &= ~EXTENT_FIRST_DELALLOC;
+ } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents -= num_extents;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+
+ /*
+ * We don't reserve metadata space for space cache inodes so we
+ * don't need to call dellalloc_release_metadata if there is an
+ * error.
+ */
+ if (*bits & EXTENT_DO_ACCOUNTING &&
+ root != root->fs_info->tree_root)
+ btrfs_delalloc_release_metadata(inode, len);
+
+ /* For sanity tests. */
+ if (btrfs_test_is_dummy_root(root))
+ return;
+
+ if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && do_list && !(state->state & EXTENT_NORESERVE))
+ btrfs_free_reserved_data_space(inode, len);
+
+ __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+ root->fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->delalloc_bytes -= len;
+ if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
+ test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_del_delalloc_inode(root, inode);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+}
+
+/*
+ * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
+ * we don't create bios that span stripes or chunks
+ */
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags)
+{
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 length = 0;
+ u64 map_length;
+ int ret;
+
+ if (bio_flags & EXTENT_BIO_COMPRESSED)
+ return 0;
+
+ length = bio->bi_iter.bi_size;
+ map_length = length;
+ ret = btrfs_map_block(root->fs_info, rw, logical,
+ &map_length, NULL, 0);
+ /* Will always return 0 with map_multi == NULL */
+ BUG_ON(ret < 0);
+ if (map_length < length + size)
+ return 1;
+ return 0;
+}
+
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time. All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static int __btrfs_submit_bio_start(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
+ u64 bio_offset)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+ BUG_ON(ret); /* -ENOMEM */
+ return 0;
+}
+
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time. All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+ if (ret)
+ bio_endio(bio, ret);
+ return ret;
+}
+
+/*
+ * extent_io.c submission hook. This does the right thing for csum calculation
+ * on write, or reading the csums from the tree before a read
+ */
+static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+ int skip_sum;
+ int metadata = 0;
+ int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
+ skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ if (btrfs_is_free_space_inode(inode))
+ metadata = 2;
+
+ if (!(rw & REQ_WRITE)) {
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+ if (ret)
+ goto out;
+
+ if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ ret = btrfs_submit_compressed_read(inode, bio,
+ mirror_num,
+ bio_flags);
+ goto out;
+ } else if (!skip_sum) {
+ ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
+ if (ret)
+ goto out;
+ }
+ goto mapit;
+ } else if (async && !skip_sum) {
+ /* csum items have already been cloned */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ goto mapit;
+ /* we're doing a write, do the async checksumming */
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num,
+ bio_flags, bio_offset,
+ __btrfs_submit_bio_start,
+ __btrfs_submit_bio_done);
+ goto out;
+ } else if (!skip_sum) {
+ ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+ if (ret)
+ goto out;
+ }
+
+mapit:
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+ if (ret < 0)
+ bio_endio(bio, ret);
+ return ret;
+}
+
+/*
+ * given a list of ordered sums record them in the inode. This happens
+ * at IO completion time based on sums calculated at bio submission time.
+ */
+static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 file_offset,
+ struct list_head *list)
+{
+ struct btrfs_ordered_sum *sum;
+
+ list_for_each_entry(sum, list, list) {
+ trans->adding_csums = 1;
+ btrfs_csum_file_blocks(trans,
+ BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ trans->adding_csums = 0;
+ }
+ return 0;
+}
+
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+ return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
+ cached_state, GFP_NOFS);
+}
+
+/* see btrfs_writepage_start_hook for details on why this is required */
+struct btrfs_writepage_fixup {
+ struct page *page;
+ struct btrfs_work work;
+};
+
+static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+{
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ struct inode *inode;
+ u64 page_start;
+ u64 page_end;
+ int ret;
+
+ fixup = container_of(work, struct btrfs_writepage_fixup, work);
+ page = fixup->page;
+again:
+ lock_page(page);
+ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+ ClearPageChecked(page);
+ goto out_page;
+ }
+
+ inode = page->mapping->host;
+ page_start = page_offset(page);
+ page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+ &cached_state);
+
+ /* already ordered? We're done */
+ if (PagePrivate2(page))
+ goto out;
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+ page_end, &cached_state, GFP_NOFS);
+ unlock_page(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ if (ret) {
+ mapping_set_error(page->mapping, ret);
+ end_extent_writepage(page, ret, page_start, page_end);
+ ClearPageChecked(page);
+ goto out;
+ }
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
+ ClearPageChecked(page);
+ set_page_dirty(page);
+out:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
+out_page:
+ unlock_page(page);
+ page_cache_release(page);
+ kfree(fixup);
+}
+
+/*
+ * There are a few paths in the higher layers of the kernel that directly
+ * set the page dirty bit without asking the filesystem if it is a
+ * good idea. This causes problems because we want to make sure COW
+ * properly happens and the data=ordered rules are followed.
+ *
+ * In our case any range that doesn't have the ORDERED bit set
+ * hasn't been properly setup for IO. We kick off an async process
+ * to fix it up. The async helper will wait for ordered extents, set
+ * the delalloc bit and make it safe to write the page.
+ */
+static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /* this page is properly in the ordered list */
+ if (TestClearPagePrivate2(page))
+ return 0;
+
+ if (PageChecked(page))
+ return -EAGAIN;
+
+ fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+ if (!fixup)
+ return -EAGAIN;
+
+ SetPageChecked(page);
+ page_cache_get(page);
+ btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+ btrfs_writepage_fixup_worker, NULL, NULL);
+ fixup->page = page;
+ btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
+ return -EBUSY;
+}
+
+static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 file_pos,
+ u64 disk_bytenr, u64 disk_num_bytes,
+ u64 num_bytes, u64 ram_bytes,
+ u8 compression, u8 encryption,
+ u16 other_encoding, int extent_type)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ int extent_inserted = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * we may be replacing one extent in the tree with another.
+ * The new extent is pinned in the extent map, and we don't want
+ * to drop it from the cache until it is completely in the btree.
+ *
+ * So, tell btrfs_drop_extents to leave this extent in the cache.
+ * the caller is expected to unpin it and allow it to be merged
+ * with the others.
+ */
+ ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
+ file_pos + num_bytes, NULL, 0,
+ 1, sizeof(*fi), &extent_inserted);
+ if (ret)
+ goto out;
+
+ if (!extent_inserted) {
+ ins.objectid = btrfs_ino(inode);
+ ins.offset = file_pos;
+ ins.type = BTRFS_EXTENT_DATA_KEY;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &ins,
+ sizeof(*fi));
+ if (ret)
+ goto out;
+ }
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_type(leaf, fi, extent_type);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+ btrfs_set_file_extent_compression(leaf, fi, compression);
+ btrfs_set_file_extent_encryption(leaf, fi, encryption);
+ btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ inode_add_bytes(inode, num_bytes);
+
+ ins.objectid = disk_bytenr;
+ ins.offset = disk_num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ root->root_key.objectid,
+ btrfs_ino(inode), file_pos, &ins);
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+/* snapshot-aware defrag */
+struct sa_defrag_extent_backref {
+ struct rb_node node;
+ struct old_sa_defrag_extent *old;
+ u64 root_id;
+ u64 inum;
+ u64 file_pos;
+ u64 extent_offset;
+ u64 num_bytes;
+ u64 generation;
+};
+
+struct old_sa_defrag_extent {
+ struct list_head list;
+ struct new_sa_defrag_extent *new;
+
+ u64 extent_offset;
+ u64 bytenr;
+ u64 offset;
+ u64 len;
+ int count;
+};
+
+struct new_sa_defrag_extent {
+ struct rb_root root;
+ struct list_head head;
+ struct btrfs_path *path;
+ struct inode *inode;
+ u64 file_pos;
+ u64 len;
+ u64 bytenr;
+ u64 disk_len;
+ u8 compress_type;
+};
+
+static int backref_comp(struct sa_defrag_extent_backref *b1,
+ struct sa_defrag_extent_backref *b2)
+{
+ if (b1->root_id < b2->root_id)
+ return -1;
+ else if (b1->root_id > b2->root_id)
+ return 1;
+
+ if (b1->inum < b2->inum)
+ return -1;
+ else if (b1->inum > b2->inum)
+ return 1;
+
+ if (b1->file_pos < b2->file_pos)
+ return -1;
+ else if (b1->file_pos > b2->file_pos)
+ return 1;
+
+ /*
+ * [------------------------------] ===> (a range of space)
+ * |<--->| |<---->| =============> (fs/file tree A)
+ * |<---------------------------->| ===> (fs/file tree B)
+ *
+ * A range of space can refer to two file extents in one tree while
+ * refer to only one file extent in another tree.
+ *
+ * So we may process a disk offset more than one time(two extents in A)
+ * and locate at the same extent(one extent in B), then insert two same
+ * backrefs(both refer to the extent in B).
+ */
+ return 0;
+}
+
+static void backref_insert(struct rb_root *root,
+ struct sa_defrag_extent_backref *backref)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sa_defrag_extent_backref *entry;
+ int ret;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
+
+ ret = backref_comp(backref, entry);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(&backref->node, parent, p);
+ rb_insert_color(&backref->node, root);
+}
+
+/*
+ * Note the backref might has changed, and in this case we just return 0.
+ */
+static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
+ void *ctx)
+{
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_fs_info *fs_info;
+ struct old_sa_defrag_extent *old = ctx;
+ struct new_sa_defrag_extent *new = old->new;
+ struct btrfs_path *path = new->path;
+ struct btrfs_key key;
+ struct btrfs_root *root;
+ struct sa_defrag_extent_backref *backref;
+ struct extent_buffer *leaf;
+ struct inode *inode = new->inode;
+ int slot;
+ int ret;
+ u64 extent_offset;
+ u64 num_bytes;
+
+ if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
+ inum == btrfs_ino(inode))
+ return 0;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ if (PTR_ERR(root) == -ENOENT)
+ return 0;
+ WARN_ON(1);
+ pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
+ inum, offset, root_id);
+ return PTR_ERR(root);
+ }
+
+ key.objectid = inum;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ if (offset > (u64)-1 << 32)
+ key.offset = 0;
+ else
+ key.offset = offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = 0;
+
+ while (1) {
+ cond_resched();
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ continue;
+ }
+
+ path->slots[0]++;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.objectid > inum)
+ goto out;
+
+ if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
+ continue;
+
+ /*
+ * 'offset' refers to the exact key.offset,
+ * NOT the 'offset' field in btrfs_extent_data_ref, ie.
+ * (key.offset - extent_offset).
+ */
+ if (key.offset != offset)
+ continue;
+
+ extent_offset = btrfs_file_extent_offset(leaf, extent);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+
+ if (extent_offset >= old->extent_offset + old->offset +
+ old->len || extent_offset + num_bytes <=
+ old->extent_offset + old->offset)
+ continue;
+ break;
+ }
+
+ backref = kmalloc(sizeof(*backref), GFP_NOFS);
+ if (!backref) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ backref->root_id = root_id;
+ backref->inum = inum;
+ backref->file_pos = offset;
+ backref->num_bytes = num_bytes;
+ backref->extent_offset = extent_offset;
+ backref->generation = btrfs_file_extent_generation(leaf, extent);
+ backref->old = old;
+ backref_insert(&new->root, backref);
+ old->count++;
+out:
+ btrfs_release_path(path);
+ WARN_ON(ret);
+ return ret;
+}
+
+static noinline bool record_extent_backrefs(struct btrfs_path *path,
+ struct new_sa_defrag_extent *new)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+ struct old_sa_defrag_extent *old, *tmp;
+ int ret;
+
+ new->path = path;
+
+ list_for_each_entry_safe(old, tmp, &new->head, list) {
+ ret = iterate_inodes_from_logical(old->bytenr +
+ old->extent_offset, fs_info,
+ path, record_one_backref,
+ old);
+ if (ret < 0 && ret != -ENOENT)
+ return false;
+
+ /* no backref to be processed for this extent */
+ if (!old->count) {
+ list_del(&old->list);
+ kfree(old);
+ }
+ }
+
+ if (list_empty(&new->head))
+ return false;
+
+ return true;
+}
+
+static int relink_is_mergable(struct extent_buffer *leaf,
+ struct btrfs_file_extent_item *fi,
+ struct new_sa_defrag_extent *new)
+{
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
+ return 0;
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
+ if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
+ return 0;
+
+ if (btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Note the backref might has changed, and in this case we just return 0.
+ */
+static noinline int relink_extent_backref(struct btrfs_path *path,
+ struct sa_defrag_extent_backref *prev,
+ struct sa_defrag_extent_backref *backref)
+{
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_file_extent_item *item;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct old_sa_defrag_extent *old = backref->old;
+ struct new_sa_defrag_extent *new = old->new;
+ struct inode *src_inode = new->inode;
+ struct inode *inode;
+ struct extent_state *cached = NULL;
+ int ret = 0;
+ u64 start;
+ u64 len;
+ u64 lock_start;
+ u64 lock_end;
+ bool merge = false;
+ int index;
+
+ if (prev && prev->root_id == backref->root_id &&
+ prev->inum == backref->inum &&
+ prev->file_pos + prev->num_bytes == backref->file_pos)
+ merge = true;
+
+ /* step 1: get root */
+ key.objectid = backref->root_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ fs_info = BTRFS_I(src_inode)->root->fs_info;
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ if (PTR_ERR(root) == -ENOENT)
+ return 0;
+ return PTR_ERR(root);
+ }
+
+ if (btrfs_root_readonly(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
+ /* step 2: get inode */
+ key.objectid = backref->inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ if (IS_ERR(inode)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ /* step 3: relink backref */
+ lock_start = backref->file_pos;
+ lock_end = backref->file_pos + backref->num_bytes - 1;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+ 0, &cached);
+
+ ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ goto out_unlock;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ key.objectid = backref->inum;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = backref->file_pos;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out_free_path;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out_free_path;
+ }
+
+ extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_generation(path->nodes[0], extent) !=
+ backref->generation)
+ goto out_free_path;
+
+ btrfs_release_path(path);
+
+ start = backref->file_pos;
+ if (backref->extent_offset < old->extent_offset + old->offset)
+ start += old->extent_offset + old->offset -
+ backref->extent_offset;
+
+ len = min(backref->extent_offset + backref->num_bytes,
+ old->extent_offset + old->offset + old->len);
+ len -= max(backref->extent_offset, old->extent_offset + old->offset);
+
+ ret = btrfs_drop_extents(trans, root, inode, start,
+ start + len, 1);
+ if (ret)
+ goto out_free_path;
+again:
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ path->leave_spinning = 1;
+ if (merge) {
+ struct btrfs_file_extent_item *fi;
+ u64 extent_len;
+ struct btrfs_key found_key;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out_free_path;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+
+ if (extent_len + found_key.offset == start &&
+ relink_is_mergable(leaf, fi, new)) {
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_len + len);
+ btrfs_mark_buffer_dirty(leaf);
+ inode_add_bytes(inode, len);
+
+ ret = 1;
+ goto out_free_path;
+ } else {
+ merge = false;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*extent));
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_free_path;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
+ btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
+ btrfs_set_file_extent_num_bytes(leaf, item, len);
+ btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
+ btrfs_set_file_extent_generation(leaf, item, trans->transid);
+ btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_compression(leaf, item, new->compress_type);
+ btrfs_set_file_extent_encryption(leaf, item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, item, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+ inode_add_bytes(inode, len);
+ btrfs_release_path(path);
+
+ ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
+ new->disk_len, 0,
+ backref->root_id, backref->inum,
+ new->file_pos, 0); /* start - extent_offset */
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_free_path;
+ }
+
+ ret = 1;
+out_free_path:
+ btrfs_release_path(path);
+ path->leave_spinning = 0;
+ btrfs_end_transaction(trans, root);
+out_unlock:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+ &cached, GFP_NOFS);
+ iput(inode);
+ return ret;
+}
+
+static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
+{
+ struct old_sa_defrag_extent *old, *tmp;
+
+ if (!new)
+ return;
+
+ list_for_each_entry_safe(old, tmp, &new->head, list) {
+ list_del(&old->list);
+ kfree(old);
+ }
+ kfree(new);
+}
+
+static void relink_file_extents(struct new_sa_defrag_extent *new)
+{
+ struct btrfs_path *path;
+ struct sa_defrag_extent_backref *backref;
+ struct sa_defrag_extent_backref *prev = NULL;
+ struct inode *inode;
+ struct btrfs_root *root;
+ struct rb_node *node;
+ int ret;
+
+ inode = new->inode;
+ root = BTRFS_I(inode)->root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return;
+
+ if (!record_extent_backrefs(path, new)) {
+ btrfs_free_path(path);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ while (1) {
+ node = rb_first(&new->root);
+ if (!node)
+ break;
+ rb_erase(node, &new->root);
+
+ backref = rb_entry(node, struct sa_defrag_extent_backref, node);
+
+ ret = relink_extent_backref(path, prev, backref);
+ WARN_ON(ret < 0);
+
+ kfree(prev);
+
+ if (ret == 1)
+ prev = backref;
+ else
+ prev = NULL;
+ cond_resched();
+ }
+ kfree(prev);
+
+ btrfs_free_path(path);
+out:
+ free_sa_defrag_extent(new);
+
+ atomic_dec(&root->fs_info->defrag_running);
+ wake_up(&root->fs_info->transaction_wait);
+}
+
+static struct new_sa_defrag_extent *
+record_old_file_extents(struct inode *inode,
+ struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct old_sa_defrag_extent *old;
+ struct new_sa_defrag_extent *new;
+ int ret;
+
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new)
+ return NULL;
+
+ new->inode = inode;
+ new->file_pos = ordered->file_offset;
+ new->len = ordered->len;
+ new->bytenr = ordered->start;
+ new->disk_len = ordered->disk_len;
+ new->compress_type = ordered->compress_type;
+ new->root = RB_ROOT;
+ INIT_LIST_HEAD(&new->head);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out_kfree;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = new->file_pos;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out_free_path;
+ if (ret > 0 && path->slots[0] > 0)
+ path->slots[0]--;
+
+ /* find out all the old extents for the file range */
+ while (1) {
+ struct btrfs_file_extent_item *extent;
+ struct extent_buffer *l;
+ int slot;
+ u64 num_bytes;
+ u64 offset;
+ u64 end;
+ u64 disk_bytenr;
+ u64 extent_offset;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out_free_path;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid != btrfs_ino(inode))
+ break;
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+ if (key.offset >= new->file_pos + new->len)
+ break;
+
+ extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
+
+ num_bytes = btrfs_file_extent_num_bytes(l, extent);
+ if (key.offset + num_bytes < new->file_pos)
+ goto next;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
+ if (!disk_bytenr)
+ goto next;
+
+ extent_offset = btrfs_file_extent_offset(l, extent);
+
+ old = kmalloc(sizeof(*old), GFP_NOFS);
+ if (!old)
+ goto out_free_path;
+
+ offset = max(new->file_pos, key.offset);
+ end = min(new->file_pos + new->len, key.offset + num_bytes);
+
+ old->bytenr = disk_bytenr;
+ old->extent_offset = extent_offset;
+ old->offset = offset - key.offset;
+ old->len = end - offset;
+ old->new = new;
+ old->count = 0;
+ list_add_tail(&old->list, &new->head);
+next:
+ path->slots[0]++;
+ cond_resched();
+ }
+
+ btrfs_free_path(path);
+ atomic_inc(&root->fs_info->defrag_running);
+
+ return new;
+
+out_free_path:
+ btrfs_free_path(path);
+out_kfree:
+ free_sa_defrag_extent(new);
+ return NULL;
+}
+
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, start);
+ ASSERT(cache);
+
+ spin_lock(&cache->lock);
+ cache->delalloc_bytes -= len;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+}
+
+/* as ordered data IO finishes, this gets called so we can finish
+ * an ordered extent if the range of bytes in the file it covers are
+ * fully written.
+ */
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+{
+ struct inode *inode = ordered_extent->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans = NULL;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_state *cached_state = NULL;
+ struct new_sa_defrag_extent *new = NULL;
+ int compress_type = 0;
+ int ret = 0;
+ u64 logical_len = ordered_extent->len;
+ bool nolock;
+ bool truncated = false;
+
+ nolock = btrfs_is_free_space_inode(inode);
+
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1);
+
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
+ truncated = true;
+ logical_len = ordered_extent->truncated_len;
+ /* Truncated the entire extent, don't bother adding */
+ if (!logical_len)
+ goto out;
+ }
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+ btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root);
+ else
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ lock_extent_bits(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ 0, &cached_state);
+
+ ret = test_range_bit(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ EXTENT_DEFRAG, 1, cached_state);
+ if (ret) {
+ u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+ if (0 && last_snapshot >= BTRFS_I(inode)->generation)
+ /* the inode is shared */
+ new = record_old_file_extents(inode, ordered_extent);
+
+ clear_extent_bit(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+ }
+
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root);
+ else
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out_unlock;
+ }
+
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+ compress_type = ordered_extent->compress_type;
+ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+ BUG_ON(compress_type);
+ ret = btrfs_mark_extent_written(trans, inode,
+ ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ logical_len);
+ } else {
+ BUG_ON(root == root->fs_info->tree_root);
+ ret = insert_reserved_file_extent(trans, inode,
+ ordered_extent->file_offset,
+ ordered_extent->start,
+ ordered_extent->disk_len,
+ logical_len, logical_len,
+ compress_type, 0, 0,
+ BTRFS_FILE_EXTENT_REG);
+ if (!ret)
+ btrfs_release_delalloc_bytes(root,
+ ordered_extent->start,
+ ordered_extent->disk_len);
+ }
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered_extent->file_offset, ordered_extent->len,
+ trans->transid);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
+ }
+
+ add_pending_csums(trans, inode, ordered_extent->file_offset,
+ &ordered_extent->list);
+
+ btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ ret = btrfs_update_inode_fallback(trans, root, inode);
+ if (ret) { /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_unlock;
+ }
+ ret = 0;
+out_unlock:
+ unlock_extent_cached(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1, &cached_state, GFP_NOFS);
+out:
+ if (root != root->fs_info->tree_root)
+ btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+ if (trans)
+ btrfs_end_transaction(trans, root);
+
+ if (ret || truncated) {
+ u64 start, end;
+
+ if (truncated)
+ start = ordered_extent->file_offset + logical_len;
+ else
+ start = ordered_extent->file_offset;
+ end = ordered_extent->file_offset + ordered_extent->len - 1;
+ clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
+
+ /* Drop the cache for the part of the extent we didn't write. */
+ btrfs_drop_extent_cache(inode, start, end, 0);
+
+ /*
+ * If the ordered extent had an IOERR or something else went
+ * wrong we need to return the space for this ordered extent
+ * back to the allocator. We only free the extent in the
+ * truncated case if we didn't write out the extent at all.
+ */
+ if ((ret || !logical_len) &&
+ !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+ btrfs_free_reserved_extent(root, ordered_extent->start,
+ ordered_extent->disk_len, 1);
+ }
+
+
+ /*
+ * This needs to be done to make sure anybody waiting knows we are done
+ * updating everything for this ordered extent.
+ */
+ btrfs_remove_ordered_extent(inode, ordered_extent);
+
+ /* for snapshot-aware defrag */
+ if (new) {
+ if (ret) {
+ free_sa_defrag_extent(new);
+ atomic_dec(&root->fs_info->defrag_running);
+ } else {
+ relink_file_extents(new);
+ }
+ }
+
+ /* once for us */
+ btrfs_put_ordered_extent(ordered_extent);
+ /* once for the tree */
+ btrfs_put_ordered_extent(ordered_extent);
+
+ return ret;
+}
+
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered_extent;
+ ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+ btrfs_finish_ordered_io(ordered_extent);
+}
+
+static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state, int uptodate)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered_extent = NULL;
+ struct btrfs_workqueue *wq;
+ btrfs_work_func_t func;
+
+ trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
+
+ ClearPagePrivate2(page);
+ if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+ end - start + 1, uptodate))
+ return 0;
+
+ if (btrfs_is_free_space_inode(inode)) {
+ wq = root->fs_info->endio_freespace_worker;
+ func = btrfs_freespace_write_helper;
+ } else {
+ wq = root->fs_info->endio_write_workers;
+ func = btrfs_endio_write_helper;
+ }
+
+ btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
+ NULL);
+ btrfs_queue_work(wq, &ordered_extent->work);
+
+ return 0;
+}
+
+static int __readpage_endio_check(struct inode *inode,
+ struct btrfs_io_bio *io_bio,
+ int icsum, struct page *page,
+ int pgoff, u64 start, size_t len)
+{
+ char *kaddr;
+ u32 csum_expected;
+ u32 csum = ~(u32)0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+ kaddr = kmap_atomic(page);
+ csum = btrfs_csum_data(kaddr + pgoff, csum, len);
+ btrfs_csum_final(csum, (char *)&csum);
+ if (csum != csum_expected)
+ goto zeroit;
+
+ kunmap_atomic(kaddr);
+ return 0;
+zeroit:
+ if (__ratelimit(&_rs))
+ btrfs_warn(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_ino(inode), start, csum, csum_expected);
+ memset(kaddr + pgoff, 1, len);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ if (csum_expected == 0)
+ return 0;
+ return -EIO;
+}
+
+/*
+ * when reads are done, we need to check csums to verify the data is correct
+ * if there's a match, we allow the bio to finish. If not, the code in
+ * extent_io.c will try to find good copies for us.
+ */
+static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+ u64 phy_offset, struct page *page,
+ u64 start, u64 end, int mirror)
+{
+ size_t offset = start - page_offset(page);
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (PageChecked(page)) {
+ ClearPageChecked(page);
+ return 0;
+ }
+
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ return 0;
+
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
+ clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
+ GFP_NOFS);
+ return 0;
+ }
+
+ phy_offset >>= inode->i_sb->s_blocksize_bits;
+ return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+ start, (size_t)(end - start + 1));
+}
+
+struct delayed_iput {
+ struct list_head list;
+ struct inode *inode;
+};
+
+/* JDM: If this is fs-wide, why can't we add a pointer to
+ * btrfs_inode instead and avoid the allocation? */
+void btrfs_add_delayed_iput(struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct delayed_iput *delayed;
+
+ if (atomic_add_unless(&inode->i_count, -1, 1))
+ return;
+
+ delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+ delayed->inode = inode;
+
+ spin_lock(&fs_info->delayed_iput_lock);
+ list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+ spin_unlock(&fs_info->delayed_iput_lock);
+}
+
+void btrfs_run_delayed_iputs(struct btrfs_root *root)
+{
+ LIST_HEAD(list);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct delayed_iput *delayed;
+ int empty;
+
+ spin_lock(&fs_info->delayed_iput_lock);
+ empty = list_empty(&fs_info->delayed_iputs);
+ spin_unlock(&fs_info->delayed_iput_lock);
+ if (empty)
+ return;
+
+ down_read(&fs_info->delayed_iput_sem);
+
+ spin_lock(&fs_info->delayed_iput_lock);
+ list_splice_init(&fs_info->delayed_iputs, &list);
+ spin_unlock(&fs_info->delayed_iput_lock);
+
+ while (!list_empty(&list)) {
+ delayed = list_entry(list.next, struct delayed_iput, list);
+ list_del(&delayed->list);
+ iput(delayed->inode);
+ kfree(delayed);
+ }
+
+ up_read(&root->fs_info->delayed_iput_sem);
+}
+
+/*
+ * This is called in transaction commit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ if (atomic_read(&root->orphan_inodes) ||
+ root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+ return;
+
+ spin_lock(&root->orphan_lock);
+ if (atomic_read(&root->orphan_inodes)) {
+ spin_unlock(&root->orphan_lock);
+ return;
+ }
+
+ if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
+ spin_unlock(&root->orphan_lock);
+ return;
+ }
+
+ block_rsv = root->orphan_block_rsv;
+ root->orphan_block_rsv = NULL;
+ spin_unlock(&root->orphan_lock);
+
+ if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+ root->root_key.objectid);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ else
+ clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ &root->state);
+ }
+
+ if (block_rsv) {
+ WARN_ON(block_rsv->size > 0);
+ btrfs_free_block_rsv(root, block_rsv);
+ }
+}
+
+/*
+ * This creates an orphan entry for the given inode in case something goes
+ * wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ * this function.
+ */
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *block_rsv = NULL;
+ int reserve = 0;
+ int insert = 0;
+ int ret;
+
+ if (!root->orphan_block_rsv) {
+ block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!block_rsv)
+ return -ENOMEM;
+ }
+
+ spin_lock(&root->orphan_lock);
+ if (!root->orphan_block_rsv) {
+ root->orphan_block_rsv = block_rsv;
+ } else if (block_rsv) {
+ btrfs_free_block_rsv(root, block_rsv);
+ block_rsv = NULL;
+ }
+
+ if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags)) {
+#if 0
+ /*
+ * For proper ENOSPC handling, we should do orphan
+ * cleanup when mounting. But this introduces backward
+ * compatibility issue.
+ */
+ if (!xchg(&root->orphan_item_inserted, 1))
+ insert = 2;
+ else
+ insert = 1;
+#endif
+ insert = 1;
+ atomic_inc(&root->orphan_inodes);
+ }
+
+ if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
+ reserve = 1;
+ spin_unlock(&root->orphan_lock);
+
+ /* grab metadata reservation from transaction handle */
+ if (reserve) {
+ ret = btrfs_orphan_reserve_metadata(trans, inode);
+ BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
+ }
+
+ /* insert an orphan item to track this unlinked/truncated file */
+ if (insert >= 1) {
+ ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
+ if (ret) {
+ atomic_dec(&root->orphan_inodes);
+ if (reserve) {
+ clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags);
+ btrfs_orphan_release_metadata(inode);
+ }
+ if (ret != -EEXIST) {
+ clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ }
+ ret = 0;
+ }
+
+ /* insert an orphan item to track subvolume contains orphan files */
+ if (insert >= 2) {
+ ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ root->root_key.objectid);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * We have done the truncate/delete so we can go ahead and remove the orphan
+ * item for this particular inode.
+ */
+static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int delete_item = 0;
+ int release_rsv = 0;
+ int ret = 0;
+
+ spin_lock(&root->orphan_lock);
+ if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags))
+ delete_item = 1;
+
+ if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
+ release_rsv = 1;
+ spin_unlock(&root->orphan_lock);
+
+ if (delete_item) {
+ atomic_dec(&root->orphan_inodes);
+ if (trans)
+ ret = btrfs_del_orphan_item(trans, root,
+ btrfs_ino(inode));
+ }
+
+ if (release_rsv)
+ btrfs_orphan_release_metadata(inode);
+
+ return ret;
+}
+
+/*
+ * this cleans up any orphans that may be left on the list from the last use
+ * of this root.
+ */
+int btrfs_orphan_cleanup(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key, found_key;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode;
+ u64 last_objectid = 0;
+ int ret = 0, nr_unlink = 0, nr_truncate = 0;
+
+ if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ path->reada = -1;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * if ret == 0 means we found what we were searching for, which
+ * is weird, but possible, so only screw with path if we didn't
+ * find the key and see if we have stuff that matches
+ */
+ if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ /* pull out the item */
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ /* make sure the item matches what we want */
+ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
+ break;
+ if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ /* release the path since we're done with it */
+ btrfs_release_path(path);
+
+ /*
+ * this is where we are basically btrfs_lookup, without the
+ * crossing root thing. we store the inode number in the
+ * offset of the orphan item.
+ */
+
+ if (found_key.offset == last_objectid) {
+ btrfs_err(root->fs_info,
+ "Error removing orphan entry, stopping orphan cleanup");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ last_objectid = found_key.offset;
+
+ found_key.objectid = found_key.offset;
+ found_key.type = BTRFS_INODE_ITEM_KEY;
+ found_key.offset = 0;
+ inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
+ ret = PTR_ERR_OR_ZERO(inode);
+ if (ret && ret != -ESTALE)
+ goto out;
+
+ if (ret == -ESTALE && root == root->fs_info->tree_root) {
+ struct btrfs_root *dead_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int is_dead_root = 0;
+
+ /*
+ * this is an orphan in the tree root. Currently these
+ * could come from 2 sources:
+ * a) a snapshot deletion in progress
+ * b) a free space cache inode
+ * We need to distinguish those two, as the snapshot
+ * orphan must not get deleted.
+ * find_dead_roots already ran before us, so if this
+ * is a snapshot deletion, we should find the root
+ * in the dead_roots list
+ */
+ spin_lock(&fs_info->trans_lock);
+ list_for_each_entry(dead_root, &fs_info->dead_roots,
+ root_list) {
+ if (dead_root->root_key.objectid ==
+ found_key.objectid) {
+ is_dead_root = 1;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (is_dead_root) {
+ /* prevent this orphan from being found again */
+ key.offset = found_key.objectid - 1;
+ continue;
+ }
+ }
+ /*
+ * Inode is already gone but the orphan item is still there,
+ * kill the orphan item.
+ */
+ if (ret == -ESTALE) {
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ btrfs_debug(root->fs_info, "auto deleting %Lu",
+ found_key.objectid);
+ ret = btrfs_del_orphan_item(trans, root,
+ found_key.objectid);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+ continue;
+ }
+
+ /*
+ * add this inode to the orphan list so btrfs_orphan_del does
+ * the proper thing when we hit it
+ */
+ set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
+ atomic_inc(&root->orphan_inodes);
+
+ /* if we have links, this was a truncate, lets do that */
+ if (inode->i_nlink) {
+ if (WARN_ON(!S_ISREG(inode->i_mode))) {
+ iput(inode);
+ continue;
+ }
+ nr_truncate++;
+
+ /* 1 for the orphan item deletion. */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ iput(inode);
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret) {
+ iput(inode);
+ goto out;
+ }
+
+ ret = btrfs_truncate(inode);
+ if (ret)
+ btrfs_orphan_del(NULL, inode);
+ } else {
+ nr_unlink++;
+ }
+
+ /* this will do delete_inode and everything for us */
+ iput(inode);
+ if (ret)
+ goto out;
+ }
+ /* release the path since we're done with it */
+ btrfs_release_path(path);
+
+ root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+
+ if (root->orphan_block_rsv)
+ btrfs_block_rsv_release(root, root->orphan_block_rsv,
+ (u64)-1);
+
+ if (root->orphan_block_rsv ||
+ test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
+ trans = btrfs_join_transaction(root);
+ if (!IS_ERR(trans))
+ btrfs_end_transaction(trans, root);
+ }
+
+ if (nr_unlink)
+ btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
+ if (nr_truncate)
+ btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
+
+out:
+ if (ret)
+ btrfs_err(root->fs_info,
+ "could not do orphan cleanup %d", ret);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * very simple check to peek ahead in the leaf looking for xattrs. If we
+ * don't find any xattrs, we know there can't be any acls.
+ *
+ * slot is the slot the inode is in, objectid is the objectid of the inode
+ */
+static noinline int acls_after_inode_item(struct extent_buffer *leaf,
+ int slot, u64 objectid,
+ int *first_xattr_slot)
+{
+ u32 nritems = btrfs_header_nritems(leaf);
+ struct btrfs_key found_key;
+ static u64 xattr_access = 0;
+ static u64 xattr_default = 0;
+ int scanned = 0;
+
+ if (!xattr_access) {
+ xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+ strlen(POSIX_ACL_XATTR_ACCESS));
+ xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+ strlen(POSIX_ACL_XATTR_DEFAULT));
+ }
+
+ slot++;
+ *first_xattr_slot = -1;
+ while (slot < nritems) {
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /* we found a different objectid, there must not be acls */
+ if (found_key.objectid != objectid)
+ return 0;
+
+ /* we found an xattr, assume we've got an acl */
+ if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (*first_xattr_slot == -1)
+ *first_xattr_slot = slot;
+ if (found_key.offset == xattr_access ||
+ found_key.offset == xattr_default)
+ return 1;
+ }
+
+ /*
+ * we found a key greater than an xattr key, there can't
+ * be any acls later on
+ */
+ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+ return 0;
+
+ slot++;
+ scanned++;
+
+ /*
+ * it goes inode, inode backrefs, xattrs, extents,
+ * so if there are a ton of hard links to an inode there can
+ * be a lot of backrefs. Don't waste time searching too hard,
+ * this is just an optimization
+ */
+ if (scanned >= 8)
+ break;
+ }
+ /* we hit the end of the leaf before we found an xattr or
+ * something larger than an xattr. We have to assume the inode
+ * has acls
+ */
+ if (*first_xattr_slot == -1)
+ *first_xattr_slot = slot;
+ return 1;
+}
+
+/*
+ * read an inode from the btree into the in-memory inode
+ */
+static void btrfs_read_locked_inode(struct inode *inode)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key location;
+ unsigned long ptr;
+ int maybe_acls;
+ u32 rdev;
+ int ret;
+ bool filled = false;
+ int first_xattr_slot;
+
+ ret = btrfs_fill_inode(inode, &rdev);
+ if (!ret)
+ filled = true;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto make_bad;
+
+ memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+
+ ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
+ if (ret)
+ goto make_bad;
+
+ leaf = path->nodes[0];
+
+ if (filled)
+ goto cache_index;
+
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+ inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+ set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
+ i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
+ i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
+ btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+
+ inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+
+ inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
+
+ inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
+
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_timespec_sec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_timespec_nsec(leaf, &inode_item->otime);
+
+ inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
+ BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+ BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+ inode->i_version = btrfs_inode_sequence(leaf, inode_item);
+ inode->i_generation = BTRFS_I(inode)->generation;
+ inode->i_rdev = 0;
+ rdev = btrfs_inode_rdev(leaf, inode_item);
+
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+ BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+cache_index:
+ /*
+ * If we were modified in the current generation and evicted from memory
+ * and then re-read we need to do a full sync since we don't have any
+ * idea about which extents were modified before we were evicted from
+ * cache.
+ *
+ * This is required for both inode re-read from disk and delayed inode
+ * in delayed_nodes_tree.
+ */
+ if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ path->slots[0]++;
+ if (inode->i_nlink != 1 ||
+ path->slots[0] >= btrfs_header_nritems(leaf))
+ goto cache_acl;
+
+ btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
+ if (location.objectid != btrfs_ino(inode))
+ goto cache_acl;
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ if (location.type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ptr;
+ BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+ } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)ptr;
+ BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
+ extref);
+ }
+cache_acl:
+ /*
+ * try to precache a NULL acl entry for files that don't have
+ * any xattrs or acls
+ */
+ maybe_acls = acls_after_inode_item(leaf, path->slots[0],
+ btrfs_ino(inode), &first_xattr_slot);
+ if (first_xattr_slot != -1) {
+ path->slots[0] = first_xattr_slot;
+ ret = btrfs_load_inode_props(inode, path);
+ if (ret)
+ btrfs_err(root->fs_info,
+ "error loading props for ino %llu (root %llu): %d",
+ btrfs_ino(inode),
+ root->root_key.objectid, ret);
+ }
+ btrfs_free_path(path);
+
+ if (!maybe_acls)
+ cache_no_acl(inode);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_mapping->a_ops = &btrfs_aops;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ break;
+ case S_IFDIR:
+ inode->i_fop = &btrfs_dir_file_operations;
+ if (root == root->fs_info->tree_root)
+ inode->i_op = &btrfs_dir_ro_inode_operations;
+ else
+ inode->i_op = &btrfs_dir_inode_operations;
+ break;
+ case S_IFLNK:
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ break;
+ default:
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ break;
+ }
+
+ btrfs_update_iflags(inode);
+ return;
+
+make_bad:
+ btrfs_free_path(path);
+ make_bad_inode(inode);
+}
+
+/*
+ * given a leaf and an inode, copy the inode fields into the leaf
+ */
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+ struct extent_buffer *leaf,
+ struct btrfs_inode_item *item,
+ struct inode *inode)
+{
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+ btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+ btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
+ &token);
+ btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+ btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+
+ btrfs_set_token_timespec_sec(leaf, &item->atime,
+ inode->i_atime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->atime,
+ inode->i_atime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, &item->mtime,
+ inode->i_mtime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->mtime,
+ inode->i_mtime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, &item->ctime,
+ inode->i_ctime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->ctime,
+ inode->i_ctime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec, &token);
+
+ btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+ &token);
+ btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
+ &token);
+ btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+ btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+ btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+ btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+}
+
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+ ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
+ 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto failed;
+ }
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+
+ fill_inode_item(trans, leaf, inode_item, inode);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_set_inode_last_trans(trans, inode);
+ ret = 0;
+failed:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ int ret;
+
+ /*
+ * If the inode is a free space inode, we can deadlock during commit
+ * if we put it into the delayed code.
+ *
+ * The data relocation inode should also be directly updated
+ * without delay
+ */
+ if (!btrfs_is_free_space_inode(inode)
+ && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !root->fs_info->log_root_recovering) {
+ btrfs_update_root_times(trans, root);
+
+ ret = btrfs_delayed_update_inode(trans, root, inode);
+ if (!ret)
+ btrfs_set_inode_last_trans(trans, inode);
+ return ret;
+ }
+
+ return btrfs_update_inode_item(trans, root, inode);
+}
+
+noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ int ret;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret == -ENOSPC)
+ return btrfs_update_inode_item(trans, root, inode);
+ return ret;
+}
+
+/*
+ * unlink helper that gets used here in inode.c and in the tree logging
+ * recovery code. It remove a link in a directory with a given name, and
+ * also drops the back refs in the inode to the directory
+ */
+static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len)
+{
+ struct btrfs_path *path;
+ int ret = 0;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 index;
+ u64 ino = btrfs_ino(inode);
+ u64 dir_ino = btrfs_ino(dir);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ path->leave_spinning = 1;
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
+ name, name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto err;
+ }
+ if (!di) {
+ ret = -ENOENT;
+ goto err;
+ }
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto err;
+ btrfs_release_path(path);
+
+ /*
+ * If we don't have dir index, we have to get it by looking up
+ * the inode ref, since we get the inode ref, remove it directly,
+ * it is unnecessary to do delayed deletion.
+ *
+ * But if we have dir index, needn't search inode ref to get it.
+ * Since the inode ref is close to the inode item, it is better
+ * that we delay to delete it, and just do this deletion when
+ * we update the inode item.
+ */
+ if (BTRFS_I(inode)->dir_index) {
+ ret = btrfs_delayed_delete_inode_ref(inode);
+ if (!ret) {
+ index = BTRFS_I(inode)->dir_index;
+ goto skip_backref;
+ }
+ }
+
+ ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
+ dir_ino, &index);
+ if (ret) {
+ btrfs_info(root->fs_info,
+ "failed to delete reference to %.*s, inode %llu parent %llu",
+ name_len, name, ino, dir_ino);
+ btrfs_abort_transaction(trans, root, ret);
+ goto err;
+ }
+skip_backref:
+ ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto err;
+ }
+
+ ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
+ inode, dir_ino);
+ if (ret != 0 && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto err;
+ }
+
+ ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
+ dir, index);
+ if (ret == -ENOENT)
+ ret = 0;
+ else if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+err:
+ btrfs_free_path(path);
+ if (ret)
+ goto out;
+
+ btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode_inc_iversion(inode);
+ inode_inc_iversion(dir);
+ inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, dir);
+out:
+ return ret;
+}
+
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len)
+{
+ int ret;
+ ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ if (!ret) {
+ drop_nlink(inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ }
+ return ret;
+}
+
+/*
+ * helper to start transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ int ret;
+
+ /*
+ * 1 for the possible orphan item
+ * 1 for the dir item
+ * 1 for the dir index
+ * 1 for the inode ref
+ * 1 for the inode
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+ return trans;
+
+ if (PTR_ERR(trans) == -ENOSPC) {
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return trans;
+ ret = btrfs_cond_migrate_bytes(root->fs_info,
+ &root->fs_info->trans_block_rsv,
+ num_bytes, 5);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ERR_PTR(ret);
+ }
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->bytes_reserved = num_bytes;
+ }
+ return trans;
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ trans = __unlink_start_trans(dir);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0);
+
+ ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
+ dentry->d_name.name, dentry->d_name.len);
+ if (ret)
+ goto out;
+
+ if (inode->i_nlink == 0) {
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret)
+ goto out;
+ }
+
+out:
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+ return ret;
+}
+
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 index;
+ int ret;
+ u64 dir_ino = btrfs_ino(dir);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
+ name, name_len, -1);
+ if (IS_ERR_OR_NULL(di)) {
+ if (!di)
+ ret = -ENOENT;
+ else
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+ objectid, root->root_key.objectid,
+ dir_ino, &index, name, name_len);
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ di = btrfs_search_dir_index_item(root, path, dir_ino,
+ name, name_len);
+ if (IS_ERR_OR_NULL(di)) {
+ if (!di)
+ ret = -ENOENT;
+ else
+ ret = PTR_ERR(di);
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(path);
+ index = key.offset;
+ }
+ btrfs_release_path(path);
+
+ ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode_inc_iversion(dir);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode_fallback(trans, root, dir);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ int err = 0;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_trans_handle *trans;
+
+ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ return -ENOTEMPTY;
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+ return -EPERM;
+
+ trans = __unlink_start_trans(dir);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ err = btrfs_unlink_subvol(trans, root, dir,
+ BTRFS_I(inode)->location.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ goto out;
+ }
+
+ err = btrfs_orphan_add(trans, inode);
+ if (err)
+ goto out;
+
+ /* now the directory is empty */
+ err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
+ dentry->d_name.name, dentry->d_name.len);
+ if (!err)
+ btrfs_i_size_write(inode, 0);
+out:
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+
+ return err;
+}
+
+static int truncate_space_check(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytes_deleted)
+{
+ int ret;
+
+ bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
+ ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
+ bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->bytes_reserved += bytes_deleted;
+ return ret;
+
+}
+
+/*
+ * this can truncate away extent items, csum items and directory items.
+ * It starts at a high offset and removes keys until it can't find
+ * any higher than new_size
+ *
+ * csum items that cross the new i_size are truncated to the new size
+ * as well.
+ *
+ * min_type is the minimum key type to truncate down to. If set to 0, this
+ * will kill all the items on this inode, including the INODE_ITEM_KEY.
+ */
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ u64 new_size, u32 min_type)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 extent_start = 0;
+ u64 extent_num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 item_end = 0;
+ u64 last_size = (u64)-1;
+ u32 found_type = (u8)-1;
+ int found_extent;
+ int del_item;
+ int pending_del_nr = 0;
+ int pending_del_slot = 0;
+ int extent_type = -1;
+ int ret;
+ int err = 0;
+ u64 ino = btrfs_ino(inode);
+ u64 bytes_deleted = 0;
+ bool be_nice = 0;
+ bool should_throttle = 0;
+ bool should_end = 0;
+
+ BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+
+ /*
+ * for non-free space inodes and ref cows, we want to back off from
+ * time to time
+ */
+ if (!btrfs_is_free_space_inode(inode) &&
+ test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ be_nice = 1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = -1;
+
+ /*
+ * We want to drop from the next block forward in case this new size is
+ * not block aligned since we will be keeping the last block of the
+ * extent just the way it is.
+ */
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)
+ btrfs_drop_extent_cache(inode, ALIGN(new_size,
+ root->sectorsize), (u64)-1, 0);
+
+ /*
+ * This function is also used to drop the items in the log tree before
+ * we relog the inode, so if root != BTRFS_I(inode)->root, it means
+ * it is used to drop the loged items. So we shouldn't kill the delayed
+ * items.
+ */
+ if (min_type == 0 && root == BTRFS_I(inode)->root)
+ btrfs_kill_delayed_inode_items(inode);
+
+ key.objectid = ino;
+ key.offset = (u64)-1;
+ key.type = (u8)-1;
+
+search_again:
+ /*
+ * with a 16K leaf size and 128MB extents, you can actually queue
+ * up a huge file in a single leaf. Most of the time that
+ * bytes_deleted is > 0, it will be huge by the time we get here
+ */
+ if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (btrfs_should_end_transaction(trans, root)) {
+ err = -EAGAIN;
+ goto error;
+ }
+ }
+
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ if (ret > 0) {
+ /* there are no items in the tree for us to truncate, we're
+ * done
+ */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (1) {
+ fi = NULL;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = found_key.type;
+
+ if (found_key.objectid != ino)
+ break;
+
+ if (found_type < min_type)
+ break;
+
+ item_end = found_key.offset;
+ if (found_type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ item_end +=
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ item_end += btrfs_file_extent_inline_len(leaf,
+ path->slots[0], fi);
+ }
+ item_end--;
+ }
+ if (found_type > min_type) {
+ del_item = 1;
+ } else {
+ if (item_end < new_size)
+ break;
+ if (found_key.offset >= new_size)
+ del_item = 1;
+ else
+ del_item = 0;
+ }
+ found_extent = 0;
+ /* FIXME, shrink the extent if the ref count is only 1 */
+ if (found_type != BTRFS_EXTENT_DATA_KEY)
+ goto delete;
+
+ if (del_item)
+ last_size = found_key.offset;
+ else
+ last_size = new_size;
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ u64 num_dec;
+ extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (!del_item) {
+ u64 orig_num_bytes =
+ btrfs_file_extent_num_bytes(leaf, fi);
+ extent_num_bytes = ALIGN(new_size -
+ found_key.offset,
+ root->sectorsize);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_num_bytes);
+ num_dec = (orig_num_bytes -
+ extent_num_bytes);
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state) &&
+ extent_start != 0)
+ inode_sub_bytes(inode, num_dec);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ extent_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf,
+ fi);
+ extent_offset = found_key.offset -
+ btrfs_file_extent_offset(leaf, fi);
+
+ /* FIXME blocksize != 4096 */
+ num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_start != 0) {
+ found_extent = 1;
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
+ inode_sub_bytes(inode, num_dec);
+ }
+ }
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ /*
+ * we can't truncate inline items that have had
+ * special encodings
+ */
+ if (!del_item &&
+ btrfs_file_extent_compression(leaf, fi) == 0 &&
+ btrfs_file_extent_encryption(leaf, fi) == 0 &&
+ btrfs_file_extent_other_encoding(leaf, fi) == 0) {
+ u32 size = new_size - found_key.offset;
+
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ inode_sub_bytes(inode, item_end + 1 -
+ new_size);
+
+ /*
+ * update the ram bytes to properly reflect
+ * the new size of our item
+ */
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+ size =
+ btrfs_file_extent_calc_inline_size(size);
+ btrfs_truncate_item(root, path, size, 1);
+ } else if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state)) {
+ inode_sub_bytes(inode, item_end + 1 -
+ found_key.offset);
+ }
+ }
+delete:
+ if (del_item) {
+ if (!pending_del_nr) {
+ /* no pending yet, add ourselves */
+ pending_del_slot = path->slots[0];
+ pending_del_nr = 1;
+ } else if (pending_del_nr &&
+ path->slots[0] + 1 == pending_del_slot) {
+ /* hop on the pending chunk */
+ pending_del_nr++;
+ pending_del_slot = path->slots[0];
+ } else {
+ BUG();
+ }
+ } else {
+ break;
+ }
+ should_throttle = 0;
+
+ if (found_extent &&
+ (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)) {
+ btrfs_set_path_blocking(path);
+ bytes_deleted += extent_num_bytes;
+ ret = btrfs_free_extent(trans, root, extent_start,
+ extent_num_bytes, 0,
+ btrfs_header_owner(leaf),
+ ino, extent_offset, 0);
+ BUG_ON(ret);
+ if (btrfs_should_throttle_delayed_refs(trans, root))
+ btrfs_async_run_delayed_refs(root,
+ trans->delayed_ref_updates * 2, 0);
+ if (be_nice) {
+ if (truncate_space_check(trans, root,
+ extent_num_bytes)) {
+ should_end = 1;
+ }
+ if (btrfs_should_throttle_delayed_refs(trans,
+ root)) {
+ should_throttle = 1;
+ }
+ }
+ }
+
+ if (found_type == BTRFS_INODE_ITEM_KEY)
+ break;
+
+ if (path->slots[0] == 0 ||
+ path->slots[0] != pending_del_slot ||
+ should_throttle || should_end) {
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ if (ret) {
+ btrfs_abort_transaction(trans,
+ root, ret);
+ goto error;
+ }
+ pending_del_nr = 0;
+ }
+ btrfs_release_path(path);
+ if (should_throttle) {
+ unsigned long updates = trans->delayed_ref_updates;
+ if (updates) {
+ trans->delayed_ref_updates = 0;
+ ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+ if (ret && !err)
+ err = ret;
+ }
+ }
+ /*
+ * if we failed to refill our space rsv, bail out
+ * and let the transaction restart
+ */
+ if (should_end) {
+ err = -EAGAIN;
+ goto error;
+ }
+ goto search_again;
+ } else {
+ path->slots[0]--;
+ }
+ }
+out:
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path, pending_del_slot,
+ pending_del_nr);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ }
+error:
+ if (last_size != (u64)-1 &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ btrfs_ordered_update_i_size(inode, last_size, NULL);
+
+ btrfs_free_path(path);
+
+ if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ unsigned long updates = trans->delayed_ref_updates;
+ if (updates) {
+ trans->delayed_ref_updates = 0;
+ ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+ if (ret && !err)
+ err = ret;
+ }
+ }
+ return err;
+}
+
+/*
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range respective to the
+ * offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero. This is used with truncate and hole punching.
+ */
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int front)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ char *kaddr;
+ u32 blocksize = root->sectorsize;
+ pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ struct page *page;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ int ret = 0;
+ u64 page_start;
+ u64 page_end;
+
+ if ((offset & (blocksize - 1)) == 0 &&
+ (!len || ((len & (blocksize - 1)) == 0)))
+ goto out;
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto out;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page) {
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ if (!PageUptodate(page)) {
+ ret = btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ set_page_extent_mapped(page);
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent_cached(io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, &cached_state, GFP_NOFS);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+ &cached_state);
+ if (ret) {
+ unlock_extent_cached(io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
+ goto out_unlock;
+ }
+
+ if (offset != PAGE_CACHE_SIZE) {
+ if (!len)
+ len = PAGE_CACHE_SIZE - offset;
+ kaddr = kmap(page);
+ if (front)
+ memset(kaddr, 0, offset);
+ else
+ memset(kaddr + offset, 0, len);
+ flush_dcache_page(page);
+ kunmap(page);
+ }
+ ClearPageChecked(page);
+ set_page_dirty(page);
+ unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+ GFP_NOFS);
+
+out_unlock:
+ if (ret)
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ return ret;
+}
+
+static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+ u64 offset, u64 len)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /*
+ * Still need to make sure the inode looks like it's been updated so
+ * that any holes get logged if we fsync.
+ */
+ if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
+ BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_sub_trans = root->log_transid;
+ BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+ return 0;
+ }
+
+ /*
+ * 1 - for the one we're dropping
+ * 1 - for the one we're adding
+ * 1 - for updating the inode.
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+ 0, 0, len, 0, len, 0, 0, 0);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ else
+ btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+
+/*
+ * This function puts in dummy file extents for the area we're creating a hole
+ * for. So if we are truncating this file to a larger size we need to insert
+ * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
+ * the range between oldsize and size
+ */
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em = NULL;
+ struct extent_state *cached_state = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 hole_start = ALIGN(oldsize, root->sectorsize);
+ u64 block_end = ALIGN(size, root->sectorsize);
+ u64 last_byte;
+ u64 cur_offset;
+ u64 hole_size;
+ int err = 0;
+
+ /*
+ * If our size started in the middle of a page we need to zero out the
+ * rest of the page before we expand the i_size, otherwise we could
+ * expose stale data.
+ */
+ err = btrfs_truncate_page(inode, oldsize, 0, 0);
+ if (err)
+ return err;
+
+ if (size <= hole_start)
+ return 0;
+
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+ &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, hole_start,
+ block_end - hole_start);
+ if (!ordered)
+ break;
+ unlock_extent_cached(io_tree, hole_start, block_end - 1,
+ &cached_state, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ cur_offset = hole_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ block_end - cur_offset, 0);
+ if (IS_ERR(em)) {
+ err = PTR_ERR(em);
+ em = NULL;
+ break;
+ }
+ last_byte = min(extent_map_end(em), block_end);
+ last_byte = ALIGN(last_byte , root->sectorsize);
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ struct extent_map *hole_em;
+ hole_size = last_byte - cur_offset;
+
+ err = maybe_insert_hole(root, inode, cur_offset,
+ hole_size);
+ if (err)
+ break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + hole_size - 1, 0);
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ goto next;
+ }
+ hole_em->start = cur_offset;
+ hole_em->len = hole_size;
+ hole_em->orig_start = cur_offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
+ hole_em->ram_bytes = hole_size;
+ hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = root->fs_info->generation;
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ err = add_extent_mapping(em_tree, hole_em, 1);
+ write_unlock(&em_tree->lock);
+ if (err != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset +
+ hole_size - 1, 0);
+ }
+ free_extent_map(hole_em);
+ }
+next:
+ free_extent_map(em);
+ em = NULL;
+ cur_offset = last_byte;
+ if (cur_offset >= block_end)
+ break;
+ }
+ free_extent_map(em);
+ unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+ GFP_NOFS);
+ return err;
+}
+
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+ schedule();
+ return 0;
+}
+
+static void wait_for_snapshot_creation(struct btrfs_root *root)
+{
+ while (true) {
+ int ret;
+
+ ret = btrfs_start_write_no_snapshoting(root);
+ if (ret)
+ break;
+ wait_on_atomic_t(&root->will_be_snapshoted,
+ wait_snapshoting_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+ }
+}
+
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ loff_t oldsize = i_size_read(inode);
+ loff_t newsize = attr->ia_size;
+ int mask = attr->ia_valid;
+ int ret;
+
+ /*
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+ * special case where we need to update the times despite not having
+ * these flags set. For all other operations the VFS set these flags
+ * explicitly if it wants a timestamp update.
+ */
+ if (newsize != oldsize) {
+ inode_inc_iversion(inode);
+ if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+ inode->i_ctime = inode->i_mtime =
+ current_fs_time(inode->i_sb);
+ }
+
+ if (newsize > oldsize) {
+ truncate_pagecache(inode, newsize);
+ /*
+ * Don't do an expanding truncate while snapshoting is ongoing.
+ * This is to ensure the snapshot captures a fully consistent
+ * state of this file - if the snapshot captures this expanding
+ * truncation, it must capture all writes that happened before
+ * this truncation.
+ */
+ wait_for_snapshot_creation(root);
+ ret = btrfs_cont_expand(inode, oldsize, newsize);
+ if (ret) {
+ btrfs_end_write_no_snapshoting(root);
+ return ret;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_end_write_no_snapshoting(root);
+ return PTR_ERR(trans);
+ }
+
+ i_size_write(inode, newsize);
+ btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ btrfs_end_write_no_snapshoting(root);
+ btrfs_end_transaction(trans, root);
+ } else {
+
+ /*
+ * We're truncating a file that used to have good data down to
+ * zero. Make sure it gets into the ordered flush list so that
+ * any new writes get down to disk quickly.
+ */
+ if (newsize == 0)
+ set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags);
+
+ /*
+ * 1 for the orphan item we're going to add
+ * 1 for the orphan item deletion.
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * We need to do this in case we fail at _any_ point during the
+ * actual truncate. Once we do the truncate_setsize we could
+ * invalidate pages which forces any outstanding ordered io to
+ * be instantly completed which will give us extents that need
+ * to be truncated. If we fail to get an orphan inode down we
+ * could have left over extents that were never meant to live,
+ * so we need to garuntee from this point on that everything
+ * will be consistent.
+ */
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ return ret;
+
+ /* we don't support swapfiles, so vmtruncate shouldn't fail */
+ truncate_setsize(inode, newsize);
+
+ /* Disable nonlocked read DIO to avoid the end less truncate */
+ btrfs_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ btrfs_inode_resume_unlocked_dio(inode);
+
+ ret = btrfs_truncate(inode);
+ if (ret && inode->i_nlink) {
+ int err;
+
+ /*
+ * failed to truncate, disk_i_size is only adjusted down
+ * as we remove extents, so it should represent the true
+ * size of the inode, so reset the in memory size and
+ * delete our orphan entry.
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_orphan_del(NULL, inode);
+ return ret;
+ }
+ i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+ err = btrfs_orphan_del(trans, inode);
+ if (err)
+ btrfs_abort_transaction(trans, root, err);
+ btrfs_end_transaction(trans, root);
+ }
+ }
+
+ return ret;
+}
+
+static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int err;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ err = inode_change_ok(inode, attr);
+ if (err)
+ return err;
+
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ err = btrfs_setsize(inode, attr);
+ if (err)
+ return err;
+ }
+
+ if (attr->ia_valid) {
+ setattr_copy(inode, attr);
+ inode_inc_iversion(inode);
+ err = btrfs_dirty_inode(inode);
+
+ if (!err && attr->ia_valid & ATTR_MODE)
+ err = posix_acl_chmod(inode, inode->i_mode);
+ }
+
+ return err;
+}
+
+/*
+ * While truncating the inode pages during eviction, we get the VFS calling
+ * btrfs_invalidatepage() against each page of the inode. This is slow because
+ * the calls to btrfs_invalidatepage() result in a huge amount of calls to
+ * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
+ * extent_state structures over and over, wasting lots of time.
+ *
+ * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
+ * those expensive operations on a per page basis and do only the ordered io
+ * finishing, while we release here the extent_map and extent_state structures,
+ * without the excessive merging and splitting.
+ */
+static void evict_inode_truncate_pages(struct inode *inode)
+{
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
+ struct rb_node *node;
+
+ ASSERT(inode->i_state & I_FREEING);
+ truncate_inode_pages_final(&inode->i_data);
+
+ write_lock(&map_tree->lock);
+ while (!RB_EMPTY_ROOT(&map_tree->map)) {
+ struct extent_map *em;
+
+ node = rb_first(&map_tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ remove_extent_mapping(map_tree, em);
+ free_extent_map(em);
+ if (need_resched()) {
+ write_unlock(&map_tree->lock);
+ cond_resched();
+ write_lock(&map_tree->lock);
+ }
+ }
+ write_unlock(&map_tree->lock);
+
+ spin_lock(&io_tree->lock);
+ while (!RB_EMPTY_ROOT(&io_tree->state)) {
+ struct extent_state *state;
+ struct extent_state *cached_state = NULL;
+
+ node = rb_first(&io_tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ atomic_inc(&state->refs);
+ spin_unlock(&io_tree->lock);
+
+ lock_extent_bits(io_tree, state->start, state->end,
+ 0, &cached_state);
+ clear_extent_bit(io_tree, state->start, state->end,
+ EXTENT_LOCKED | EXTENT_DIRTY |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 1,
+ &cached_state, GFP_NOFS);
+ free_extent_state(state);
+
+ cond_resched();
+ spin_lock(&io_tree->lock);
+ }
+ spin_unlock(&io_tree->lock);
+}
+
+void btrfs_evict_inode(struct inode *inode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *rsv, *global_rsv;
+ int steal_from_global = 0;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ int ret;
+
+ trace_btrfs_inode_evict(inode);
+
+ evict_inode_truncate_pages(inode);
+
+ if (inode->i_nlink &&
+ ((btrfs_root_refs(&root->root_item) != 0 &&
+ root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+ btrfs_is_free_space_inode(inode)))
+ goto no_delete;
+
+ if (is_bad_inode(inode)) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+ /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+
+ btrfs_free_io_failure_record(inode, 0, (u64)-1);
+
+ if (root->fs_info->log_root_recovering) {
+ BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags));
+ goto no_delete;
+ }
+
+ if (inode->i_nlink > 0) {
+ BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
+ root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
+ goto no_delete;
+ }
+
+ ret = btrfs_commit_inode_delayed_inode(inode);
+ if (ret) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+ rsv->size = min_size;
+ rsv->failfast = 1;
+ global_rsv = &root->fs_info->global_block_rsv;
+
+ btrfs_i_size_write(inode, 0);
+
+ /*
+ * This is a bit simpler than btrfs_truncate since we've already
+ * reserved our space for our orphan item in the unlink, so we just
+ * need to reserve some slack space in case we add bytes and update
+ * inode item when doing the truncate.
+ */
+ while (1) {
+ ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ BTRFS_RESERVE_FLUSH_LIMIT);
+
+ /*
+ * Try and steal from the global reserve since we will
+ * likely not use this space anyway, we want to try as
+ * hard as possible to get this to work.
+ */
+ if (ret)
+ steal_from_global++;
+ else
+ steal_from_global = 0;
+ ret = 0;
+
+ /*
+ * steal_from_global == 0: we reserved stuff, hooray!
+ * steal_from_global == 1: we didn't reserve stuff, boo!
+ * steal_from_global == 2: we've committed, still not a lot of
+ * room but maybe we'll have room in the global reserve this
+ * time.
+ * steal_from_global == 3: abandon all hope!
+ */
+ if (steal_from_global > 2) {
+ btrfs_warn(root->fs_info,
+ "Could not get space for a delete, will truncate on mount %d",
+ ret);
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
+ }
+
+ /*
+ * We can't just steal from the global reserve, we need tomake
+ * sure there is room to do it, if not we need to commit and try
+ * again.
+ */
+ if (steal_from_global) {
+ if (!btrfs_check_space_for_delayed_refs(trans, root))
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv,
+ min_size);
+ else
+ ret = -ENOSPC;
+ }
+
+ /*
+ * Couldn't steal from the global reserve, we have too much
+ * pending stuff built up, commit the transaction and try it
+ * again.
+ */
+ if (ret) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret) {
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
+ }
+ continue;
+ } else {
+ steal_from_global = 0;
+ }
+
+ trans->block_rsv = rsv;
+
+ ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+ if (ret != -ENOSPC && ret != -EAGAIN)
+ break;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ btrfs_end_transaction(trans, root);
+ trans = NULL;
+ btrfs_btree_balance_dirty(root);
+ }
+
+ btrfs_free_block_rsv(root, rsv);
+
+ /*
+ * Errors here aren't a big deal, it just means we leave orphan items
+ * in the tree. They will be cleaned up on the next mount.
+ */
+ if (ret == 0) {
+ trans->block_rsv = root->orphan_block_rsv;
+ btrfs_orphan_del(trans, inode);
+ } else {
+ btrfs_orphan_del(NULL, inode);
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ if (!(root == root->fs_info->tree_root ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
+ btrfs_return_ino(root, btrfs_ino(inode));
+
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+no_delete:
+ btrfs_remove_delayed_node(inode);
+ clear_inode(inode);
+ return;
+}
+
+/*
+ * this returns the key found in the dir entry in the location pointer.
+ * If no dir entries were found, location->objectid is 0.
+ */
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+ struct btrfs_key *location)
+{
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
+ namelen, 0);
+ if (IS_ERR(di))
+ ret = PTR_ERR(di);
+
+ if (IS_ERR_OR_NULL(di))
+ goto out_err;
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
+out:
+ btrfs_free_path(path);
+ return ret;
+out_err:
+ location->objectid = 0;
+ goto out;
+}
+
+/*
+ * when we hit a tree root in a directory, the btrfs part of the inode
+ * needs to be changed to reflect the root directory of the tree root. This
+ * is kind of like crossing a mount point.
+ */
+static int fixup_tree_root_location(struct btrfs_root *root,
+ struct inode *dir,
+ struct dentry *dentry,
+ struct btrfs_key *location,
+ struct btrfs_root **sub_root)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *new_root;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = -ENOENT;
+ key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = location->objectid;
+
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
+ 0, 0);
+ if (ret) {
+ if (ret < 0)
+ err = ret;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
+ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+ goto out;
+
+ ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
+ (unsigned long)(ref + 1),
+ dentry->d_name.len);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+ if (IS_ERR(new_root)) {
+ err = PTR_ERR(new_root);
+ goto out;
+ }
+
+ *sub_root = new_root;
+ location->objectid = btrfs_root_dirid(&new_root->root_item);
+ location->type = BTRFS_INODE_ITEM_KEY;
+ location->offset = 0;
+ err = 0;
+out:
+ btrfs_free_path(path);
+ return err;
+}
+
+static void inode_tree_add(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *entry;
+ struct rb_node **p;
+ struct rb_node *parent;
+ struct rb_node *new = &BTRFS_I(inode)->rb_node;
+ u64 ino = btrfs_ino(inode);
+
+ if (inode_unhashed(inode))
+ return;
+ parent = NULL;
+ spin_lock(&root->inode_lock);
+ p = &root->inode_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_inode, rb_node);
+
+ if (ino < btrfs_ino(&entry->vfs_inode))
+ p = &parent->rb_left;
+ else if (ino > btrfs_ino(&entry->vfs_inode))
+ p = &parent->rb_right;
+ else {
+ WARN_ON(!(entry->vfs_inode.i_state &
+ (I_WILL_FREE | I_FREEING)));
+ rb_replace_node(parent, new, &root->inode_tree);
+ RB_CLEAR_NODE(parent);
+ spin_unlock(&root->inode_lock);
+ return;
+ }
+ }
+ rb_link_node(new, parent, p);
+ rb_insert_color(new, &root->inode_tree);
+ spin_unlock(&root->inode_lock);
+}
+
+static void inode_tree_del(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int empty = 0;
+
+ spin_lock(&root->inode_lock);
+ if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
+ rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ }
+ spin_unlock(&root->inode_lock);
+
+ if (empty && btrfs_root_refs(&root->root_item) == 0) {
+ synchronize_srcu(&root->fs_info->subvol_srcu);
+ spin_lock(&root->inode_lock);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ spin_unlock(&root->inode_lock);
+ if (empty)
+ btrfs_add_dead_root(root);
+ }
+}
+
+void btrfs_invalidate_inodes(struct btrfs_root *root)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+ u64 objectid = 0;
+
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < btrfs_ino(&entry->vfs_inode))
+ node = node->rb_left;
+ else if (objectid > btrfs_ino(&entry->vfs_inode))
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= btrfs_ino(&entry->vfs_inode)) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ objectid = btrfs_ino(&entry->vfs_inode) + 1;
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ if (atomic_read(&inode->i_count) > 1)
+ d_prune_aliases(inode);
+ /*
+ * btrfs_drop_inode will have it removed from
+ * the inode cache when its usage count
+ * hits zero.
+ */
+ iput(inode);
+ cond_resched();
+ spin_lock(&root->inode_lock);
+ goto again;
+ }
+
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+}
+
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+ struct btrfs_iget_args *args = p;
+ inode->i_ino = args->location->objectid;
+ memcpy(&BTRFS_I(inode)->location, args->location,
+ sizeof(*args->location));
+ BTRFS_I(inode)->root = args->root;
+ return 0;
+}
+
+static int btrfs_find_actor(struct inode *inode, void *opaque)
+{
+ struct btrfs_iget_args *args = opaque;
+ return args->location->objectid == BTRFS_I(inode)->location.objectid &&
+ args->root == BTRFS_I(inode)->root;
+}
+
+static struct inode *btrfs_iget_locked(struct super_block *s,
+ struct btrfs_key *location,
+ struct btrfs_root *root)
+{
+ struct inode *inode;
+ struct btrfs_iget_args args;
+ unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+
+ args.location = location;
+ args.root = root;
+
+ inode = iget5_locked(s, hashval, btrfs_find_actor,
+ btrfs_init_locked_inode,
+ (void *)&args);
+ return inode;
+}
+
+/* Get an inode object given its location and corresponding root.
+ * Returns in *is_new if the inode was read from disk
+ */
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *new)
+{
+ struct inode *inode;
+
+ inode = btrfs_iget_locked(s, location, root);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (inode->i_state & I_NEW) {
+ btrfs_read_locked_inode(inode);
+ if (!is_bad_inode(inode)) {
+ inode_tree_add(inode);
+ unlock_new_inode(inode);
+ if (new)
+ *new = 1;
+ } else {
+ unlock_new_inode(inode);
+ iput(inode);
+ inode = ERR_PTR(-ESTALE);
+ }
+ }
+
+ return inode;
+}
+
+static struct inode *new_simple_dir(struct super_block *s,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
+{
+ struct inode *inode = new_inode(s);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ BTRFS_I(inode)->root = root;
+ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+ set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+
+ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ inode->i_op = &btrfs_dir_ro_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ inode->i_mtime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+ return inode;
+}
+
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *sub_root = root;
+ struct btrfs_key location;
+ int index;
+ int ret = 0;
+
+ if (dentry->d_name.len > BTRFS_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ ret = btrfs_inode_by_name(dir, dentry, &location);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (location.objectid == 0)
+ return ERR_PTR(-ENOENT);
+
+ if (location.type == BTRFS_INODE_ITEM_KEY) {
+ inode = btrfs_iget(dir->i_sb, &location, root, NULL);
+ return inode;
+ }
+
+ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
+
+ index = srcu_read_lock(&root->fs_info->subvol_srcu);
+ ret = fixup_tree_root_location(root, dir, dentry,
+ &location, &sub_root);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ inode = ERR_PTR(ret);
+ else
+ inode = new_simple_dir(dir->i_sb, &location, sub_root);
+ } else {
+ inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
+ }
+ srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+
+ if (!IS_ERR(inode) && root != sub_root) {
+ down_read(&root->fs_info->cleanup_work_sem);
+ if (!(inode->i_sb->s_flags & MS_RDONLY))
+ ret = btrfs_orphan_cleanup(sub_root);
+ up_read(&root->fs_info->cleanup_work_sem);
+ if (ret) {
+ iput(inode);
+ inode = ERR_PTR(ret);
+ }
+ }
+
+ return inode;
+}
+
+static int btrfs_dentry_delete(const struct dentry *dentry)
+{
+ struct btrfs_root *root;
+ struct inode *inode = d_inode(dentry);
+
+ if (!inode && !IS_ROOT(dentry))
+ inode = d_inode(dentry->d_parent);
+
+ if (inode) {
+ root = BTRFS_I(inode)->root;
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 1;
+
+ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return 1;
+ }
+ return 0;
+}
+
+static void btrfs_dentry_release(struct dentry *dentry)
+{
+ kfree(dentry->d_fsdata);
+}
+
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode;
+
+ inode = btrfs_lookup_dentry(dir, dentry);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) == -ENOENT)
+ inode = NULL;
+ else
+ return ERR_CAST(inode);
+ }
+
+ return d_splice_alias(inode, dentry);
+}
+
+unsigned char btrfs_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_item *item;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct list_head ins_list;
+ struct list_head del_list;
+ int ret;
+ struct extent_buffer *leaf;
+ int slot;
+ unsigned char d_type;
+ int over = 0;
+ u32 di_cur;
+ u32 di_total;
+ u32 di_len;
+ int key_type = BTRFS_DIR_INDEX_KEY;
+ char tmp_name[32];
+ char *name_ptr;
+ int name_len;
+ int is_curr = 0; /* ctx->pos points to the current index? */
+
+ /* FIXME, use a real flag for deciding about the key type */
+ if (root->fs_info->tree_root == root)
+ key_type = BTRFS_DIR_ITEM_KEY;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 1;
+
+ if (key_type == BTRFS_DIR_INDEX_KEY) {
+ INIT_LIST_HEAD(&ins_list);
+ INIT_LIST_HEAD(&del_list);
+ btrfs_get_delayed_items(inode, &ins_list, &del_list);
+ }
+
+ key.type = key_type;
+ key.offset = ctx->pos;
+ key.objectid = btrfs_ino(inode);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto err;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ item = btrfs_item_nr(slot);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid != key.objectid)
+ break;
+ if (found_key.type != key_type)
+ break;
+ if (found_key.offset < ctx->pos)
+ goto next;
+ if (key_type == BTRFS_DIR_INDEX_KEY &&
+ btrfs_should_delete_dir_index(&del_list,
+ found_key.offset))
+ goto next;
+
+ ctx->pos = found_key.offset;
+ is_curr = 1;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ di_cur = 0;
+ di_total = btrfs_item_size(leaf, item);
+
+ while (di_cur < di_total) {
+ struct btrfs_key location;
+
+ if (verify_dir_item(root, leaf, di))
+ break;
+
+ name_len = btrfs_dir_name_len(leaf, di);
+ if (name_len <= sizeof(tmp_name)) {
+ name_ptr = tmp_name;
+ } else {
+ name_ptr = kmalloc(name_len, GFP_NOFS);
+ if (!name_ptr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+ read_extent_buffer(leaf, name_ptr,
+ (unsigned long)(di + 1), name_len);
+
+ d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+ btrfs_dir_item_key_to_cpu(leaf, di, &location);
+
+
+ /* is this a reference to our own snapshot? If so
+ * skip it.
+ *
+ * In contrast to old kernels, we insert the snapshot's
+ * dir item and dir index after it has been created, so
+ * we won't find a reference to our own snapshot. We
+ * still keep the following code for backward
+ * compatibility.
+ */
+ if (location.type == BTRFS_ROOT_ITEM_KEY &&
+ location.objectid == root->root_key.objectid) {
+ over = 0;
+ goto skip;
+ }
+ over = !dir_emit(ctx, name_ptr, name_len,
+ location.objectid, d_type);
+
+skip:
+ if (name_ptr != tmp_name)
+ kfree(name_ptr);
+
+ if (over)
+ goto nopos;
+ di_len = btrfs_dir_name_len(leaf, di) +
+ btrfs_dir_data_len(leaf, di) + sizeof(*di);
+ di_cur += di_len;
+ di = (struct btrfs_dir_item *)((char *)di + di_len);
+ }
+next:
+ path->slots[0]++;
+ }
+
+ if (key_type == BTRFS_DIR_INDEX_KEY) {
+ if (is_curr)
+ ctx->pos++;
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+ if (ret)
+ goto nopos;
+ }
+
+ /* Reached end of directory/root. Bump pos past the last item. */
+ ctx->pos++;
+
+ /*
+ * Stop new entries from being returned after we return the last
+ * entry.
+ *
+ * New directory entries are assigned a strictly increasing
+ * offset. This means that new entries created during readdir
+ * are *guaranteed* to be seen in the future by that readdir.
+ * This has broken buggy programs which operate on names as
+ * they're returned by readdir. Until we re-use freed offsets
+ * we have this hack to stop new entries from being returned
+ * under the assumption that they'll never reach this huge
+ * offset.
+ *
+ * This is being careful not to overflow 32bit loff_t unless the
+ * last entry requires it because doing so has broken 32bit apps
+ * in the past.
+ */
+ if (key_type == BTRFS_DIR_INDEX_KEY) {
+ if (ctx->pos >= INT_MAX)
+ ctx->pos = LLONG_MAX;
+ else
+ ctx->pos = INT_MAX;
+ }
+nopos:
+ ret = 0;
+err:
+ if (key_type == BTRFS_DIR_INDEX_KEY)
+ btrfs_put_delayed_items(&ins_list, &del_list);
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+ bool nolock = false;
+
+ if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
+ return 0;
+
+ if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
+ nolock = true;
+
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root);
+ else
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_commit_transaction(trans, root);
+ }
+ return ret;
+}
+
+/*
+ * This is somewhat expensive, updating the tree every time the
+ * inode changes. But, it is most likely to find the inode in cache.
+ * FIXME, needs more benchmarking...there are no reasons other than performance
+ * to keep or drop this code.
+ */
+static int btrfs_dirty_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
+ return 0;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && ret == -ENOSPC) {
+ /* whoops, lets try again with the full transaction */
+ btrfs_end_transaction(trans, root);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ }
+ btrfs_end_transaction(trans, root);
+ if (BTRFS_I(inode)->delayed_node)
+ btrfs_balance_delayed_items(root);
+
+ return ret;
+}
+
+/*
+ * This is a copy of file_update_time. We need this so we can return error on
+ * ENOSPC for updating the inode in the case of file write and mmap writes.
+ */
+static int btrfs_update_time(struct inode *inode, struct timespec *now,
+ int flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ if (flags & S_VERSION)
+ inode_inc_iversion(inode);
+ if (flags & S_CTIME)
+ inode->i_ctime = *now;
+ if (flags & S_MTIME)
+ inode->i_mtime = *now;
+ if (flags & S_ATIME)
+ inode->i_atime = *now;
+ return btrfs_dirty_inode(inode);
+}
+
+/*
+ * find the highest existing sequence number in a directory
+ * and then set the in-memory index_cnt variable to reflect
+ * free sequence numbers
+ */
+static int btrfs_set_inode_index_count(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key key, found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ /* FIXME: we should be able to handle this */
+ if (ret == 0)
+ goto out;
+ ret = 0;
+
+ /*
+ * MAGIC NUMBER EXPLANATION:
+ * since we search a directory based on f_pos we have to start at 2
+ * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
+ * else has to start at 2
+ */
+ if (path->slots[0] == 0) {
+ BTRFS_I(inode)->index_cnt = 2;
+ goto out;
+ }
+
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != btrfs_ino(inode) ||
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
+ BTRFS_I(inode)->index_cnt = 2;
+ goto out;
+ }
+
+ BTRFS_I(inode)->index_cnt = found_key.offset + 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper to find a free sequence number in a given directory. This current
+ * code is very simple, later versions will do smarter things in the btree
+ */
+int btrfs_set_inode_index(struct inode *dir, u64 *index)
+{
+ int ret = 0;
+
+ if (BTRFS_I(dir)->index_cnt == (u64)-1) {
+ ret = btrfs_inode_delayed_dir_index_count(dir);
+ if (ret) {
+ ret = btrfs_set_inode_index_count(dir);
+ if (ret)
+ return ret;
+ }
+ }
+
+ *index = BTRFS_I(dir)->index_cnt;
+ BTRFS_I(dir)->index_cnt++;
+
+ return ret;
+}
+
+static int btrfs_insert_inode_locked(struct inode *inode)
+{
+ struct btrfs_iget_args args;
+ args.location = &BTRFS_I(inode)->location;
+ args.root = BTRFS_I(inode)->root;
+
+ return insert_inode_locked4(inode,
+ btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
+ btrfs_find_actor, &args);
+}
+
+static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir,
+ const char *name, int name_len,
+ u64 ref_objectid, u64 objectid,
+ umode_t mode, u64 *index)
+{
+ struct inode *inode;
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_key *location;
+ struct btrfs_path *path;
+ struct btrfs_inode_ref *ref;
+ struct btrfs_key key[2];
+ u32 sizes[2];
+ int nitems = name ? 2 : 1;
+ unsigned long ptr;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ inode = new_inode(root->fs_info->sb);
+ if (!inode) {
+ btrfs_free_path(path);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * O_TMPFILE, set link count to 0, so that after this point,
+ * we fill in an inode item with the correct link count.
+ */
+ if (!name)
+ set_nlink(inode, 0);
+
+ /*
+ * we have to initialize this early, so we can reclaim the inode
+ * number if we fail afterwards in this function.
+ */
+ inode->i_ino = objectid;
+
+ if (dir && name) {
+ trace_btrfs_inode_request(dir);
+
+ ret = btrfs_set_inode_index(dir, index);
+ if (ret) {
+ btrfs_free_path(path);
+ iput(inode);
+ return ERR_PTR(ret);
+ }
+ } else if (dir) {
+ *index = 0;
+ }
+ /*
+ * index_cnt is ignored for everything but a dir,
+ * btrfs_get_inode_index_count has an explanation for the magic
+ * number
+ */
+ BTRFS_I(inode)->index_cnt = 2;
+ BTRFS_I(inode)->dir_index = *index;
+ BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->generation = trans->transid;
+ inode->i_generation = BTRFS_I(inode)->generation;
+
+ /*
+ * We could have gotten an inode number from somebody who was fsynced
+ * and then removed in this same transaction, so let's just set full
+ * sync since it will be a full sync anyway and this will blow away the
+ * old info in the log.
+ */
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
+ key[0].objectid = objectid;
+ key[0].type = BTRFS_INODE_ITEM_KEY;
+ key[0].offset = 0;
+
+ sizes[0] = sizeof(struct btrfs_inode_item);
+
+ if (name) {
+ /*
+ * Start new inodes with an inode_ref. This is slightly more
+ * efficient for small numbers of hard links since they will
+ * be packed into one item. Extended refs will kick in if we
+ * add more hard links than can fit in the ref item.
+ */
+ key[1].objectid = objectid;
+ key[1].type = BTRFS_INODE_REF_KEY;
+ key[1].offset = ref_objectid;
+
+ sizes[1] = name_len + sizeof(*ref);
+ }
+
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ location->type = BTRFS_INODE_ITEM_KEY;
+
+ ret = btrfs_insert_inode_locked(inode);
+ if (ret < 0)
+ goto fail;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
+ if (ret != 0)
+ goto fail_unlock;
+
+ inode_init_owner(inode, dir, mode);
+ inode_set_bytes(inode, 0);
+
+ inode->i_mtime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
+ sizeof(*inode_item));
+ fill_inode_item(trans, path->nodes[0], inode_item, inode);
+
+ if (name) {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ }
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_free_path(path);
+
+ btrfs_inherit_iflags(inode, dir);
+
+ if (S_ISREG(mode)) {
+ if (btrfs_test_opt(root, NODATASUM))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ if (btrfs_test_opt(root, NODATACOW))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
+ }
+
+ inode_tree_add(inode);
+
+ trace_btrfs_inode_new(inode);
+ btrfs_set_inode_last_trans(trans, inode);
+
+ btrfs_update_root_times(trans, root);
+
+ ret = btrfs_inode_inherit_props(trans, inode, dir);
+ if (ret)
+ btrfs_err(root->fs_info,
+ "error inheriting props for ino %llu (root %llu): %d",
+ btrfs_ino(inode), root->root_key.objectid, ret);
+
+ return inode;
+
+fail_unlock:
+ unlock_new_inode(inode);
+fail:
+ if (dir && name)
+ BTRFS_I(dir)->index_cnt--;
+ btrfs_free_path(path);
+ iput(inode);
+ return ERR_PTR(ret);
+}
+
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+ return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+}
+
+/*
+ * utility function to add 'inode' into 'parent_inode' with
+ * a give name and a given sequence number.
+ * if 'add_backref' is true, also insert a backref from the
+ * inode to the parent directory.
+ */
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct inode *parent_inode, struct inode *inode,
+ const char *name, int name_len, int add_backref, u64 index)
+{
+ int ret = 0;
+ struct btrfs_key key;
+ struct btrfs_root *root = BTRFS_I(parent_inode)->root;
+ u64 ino = btrfs_ino(inode);
+ u64 parent_ino = btrfs_ino(parent_inode);
+
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
+ } else {
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ }
+
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ key.objectid, root->root_key.objectid,
+ parent_ino, index, name, name_len);
+ } else if (add_backref) {
+ ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
+ parent_ino, index);
+ }
+
+ /* Nothing to clean up yet */
+ if (ret)
+ return ret;
+
+ ret = btrfs_insert_dir_item(trans, root, name, name_len,
+ parent_inode, &key,
+ btrfs_inode_type(inode), index);
+ if (ret == -EEXIST || ret == -EOVERFLOW)
+ goto fail_dir_item;
+ else if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ name_len * 2);
+ inode_inc_iversion(parent_inode);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, parent_inode);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+
+fail_dir_item:
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ u64 local_index;
+ int err;
+ err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+ key.objectid, root->root_key.objectid,
+ parent_ino, &local_index, name, name_len);
+
+ } else if (add_backref) {
+ u64 local_index;
+ int err;
+
+ err = btrfs_del_inode_ref(trans, root, name, name_len,
+ ino, parent_ino, &local_index);
+ }
+ return ret;
+}
+
+static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct dentry *dentry,
+ struct inode *inode, int backref, u64 index)
+{
+ int err = btrfs_add_link(trans, dir, inode,
+ dentry->d_name.name, dentry->d_name.len,
+ backref, index);
+ if (err > 0)
+ err = -EEXIST;
+ return err;
+}
+
+static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ int err;
+ int drop_inode = 0;
+ u64 objectid;
+ u64 index = 0;
+
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_unlock;
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len, btrfs_ino(dir), objectid,
+ mode, &index);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_unlock;
+ }
+
+ /*
+ * If the active LSM wants to access the inode during
+ * d_instantiate it needs these. Smack checks to see
+ * if the filesystem supports xattrs by looking at the
+ * ops vector.
+ */
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ if (err)
+ goto out_unlock_inode;
+
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+ if (err) {
+ goto out_unlock_inode;
+ } else {
+ btrfs_update_inode(trans, root, inode);
+ unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
+ }
+
+out_unlock:
+ btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ return err;
+
+out_unlock_inode:
+ drop_inode = 1;
+ unlock_new_inode(inode);
+ goto out_unlock;
+
+}
+
+static int btrfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ int drop_inode_on_err = 0;
+ int err;
+ u64 objectid;
+ u64 index = 0;
+
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_unlock;
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len, btrfs_ino(dir), objectid,
+ mode, &index);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_unlock;
+ }
+ drop_inode_on_err = 1;
+ /*
+ * If the active LSM wants to access the inode during
+ * d_instantiate it needs these. Smack checks to see
+ * if the filesystem supports xattrs by looking at the
+ * ops vector.
+ */
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_aops;
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ if (err)
+ goto out_unlock_inode;
+
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ goto out_unlock_inode;
+
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+ if (err)
+ goto out_unlock_inode;
+
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
+
+out_unlock:
+ btrfs_end_transaction(trans, root);
+ if (err && drop_inode_on_err) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
+ return err;
+
+out_unlock_inode:
+ unlock_new_inode(inode);
+ goto out_unlock;
+
+}
+
+static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = d_inode(old_dentry);
+ u64 index;
+ int err;
+ int drop_inode = 0;
+
+ /* do not allow sys_link's with other subvols of the same device */
+ if (root->objectid != BTRFS_I(inode)->root->objectid)
+ return -EXDEV;
+
+ if (inode->i_nlink >= BTRFS_LINK_MAX)
+ return -EMLINK;
+
+ err = btrfs_set_inode_index(dir, &index);
+ if (err)
+ goto fail;
+
+ /*
+ * 2 items for inode and inode ref
+ * 2 items for dir items
+ * 1 item for parent inode
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto fail;
+ }
+
+ /* There are several dir indexes for this inode, clear the cache. */
+ BTRFS_I(inode)->dir_index = 0ULL;
+ inc_nlink(inode);
+ inode_inc_iversion(inode);
+ inode->i_ctime = CURRENT_TIME;
+ ihold(inode);
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
+
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
+
+ if (err) {
+ drop_inode = 1;
+ } else {
+ struct dentry *parent = dentry->d_parent;
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ goto fail;
+ if (inode->i_nlink == 1) {
+ /*
+ * If new hard link count is 1, it's a file created
+ * with open(2) O_TMPFILE flag.
+ */
+ err = btrfs_orphan_del(trans, inode);
+ if (err)
+ goto fail;
+ }
+ d_instantiate(dentry, inode);
+ btrfs_log_new_name(trans, inode, NULL, parent);
+ }
+
+ btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
+fail:
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root);
+ return err;
+}
+
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ int err = 0;
+ int drop_on_err = 0;
+ u64 objectid = 0;
+ u64 index = 0;
+
+ /*
+ * 2 items for inode and ref
+ * 2 items for dir items
+ * 1 for xattr if selinux is on
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_fail;
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len, btrfs_ino(dir), objectid,
+ S_IFDIR | mode, &index);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_fail;
+ }
+
+ drop_on_err = 1;
+ /* these must be set before we unlock the inode */
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ if (err)
+ goto out_fail_inode;
+
+ btrfs_i_size_write(inode, 0);
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ goto out_fail_inode;
+
+ err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
+ dentry->d_name.len, 0, index);
+ if (err)
+ goto out_fail_inode;
+
+ d_instantiate(dentry, inode);
+ /*
+ * mkdir is special. We're unlocking after we call d_instantiate
+ * to avoid a race with nfsd calling d_instantiate.
+ */
+ unlock_new_inode(inode);
+ drop_on_err = 0;
+
+out_fail:
+ btrfs_end_transaction(trans, root);
+ if (drop_on_err) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
+ return err;
+
+out_fail_inode:
+ unlock_new_inode(inode);
+ goto out_fail;
+}
+
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+ struct rb_node *next;
+
+ next = rb_next(&em->rb_node);
+ if (!next)
+ return NULL;
+ return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+ struct rb_node *prev;
+
+ prev = rb_prev(&em->rb_node);
+ if (!prev)
+ return NULL;
+ return container_of(prev, struct extent_map, rb_node);
+}
+
+/* helper for btfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the best fitted new extent into the tree.
+ */
+static int merge_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map *existing,
+ struct extent_map *em,
+ u64 map_start)
+{
+ struct extent_map *prev;
+ struct extent_map *next;
+ u64 start;
+ u64 end;
+ u64 start_diff;
+
+ BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+
+ if (existing->start > map_start) {
+ next = existing;
+ prev = prev_extent_map(next);
+ } else {
+ prev = existing;
+ next = next_extent_map(prev);
+ }
+
+ start = prev ? extent_map_end(prev) : em->start;
+ start = max_t(u64, start, em->start);
+ end = next ? next->start : extent_map_end(em);
+ end = min_t(u64, end, extent_map_end(em));
+ start_diff = start - em->start;
+ em->start = start;
+ em->len = end - start;
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ em->block_start += start_diff;
+ em->block_len -= start_diff;
+ }
+ return add_extent_mapping(em_tree, em, 0);
+}
+
+static noinline int uncompress_inline(struct btrfs_path *path,
+ struct inode *inode, struct page *page,
+ size_t pg_offset, u64 extent_offset,
+ struct btrfs_file_extent_item *item)
+{
+ int ret;
+ struct extent_buffer *leaf = path->nodes[0];
+ char *tmp;
+ size_t max_size;
+ unsigned long inline_size;
+ unsigned long ptr;
+ int compress_type;
+
+ WARN_ON(pg_offset != 0);
+ compress_type = btrfs_file_extent_compression(leaf, item);
+ max_size = btrfs_file_extent_ram_bytes(leaf, item);
+ inline_size = btrfs_file_extent_inline_item_len(leaf,
+ btrfs_item_nr(path->slots[0]));
+ tmp = kmalloc(inline_size, GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ ptr = btrfs_file_extent_inline_start(item);
+
+ read_extent_buffer(leaf, tmp, ptr, inline_size);
+
+ max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+ ret = btrfs_decompress(compress_type, tmp, page,
+ extent_offset, inline_size, max_size);
+ kfree(tmp);
+ return ret;
+}
+
+/*
+ * a bit scary, this does extent mapping from logical file offset to the disk.
+ * the ugly parts come from merging extents from the disk with the in-ram
+ * representation. This gets more complex because of the data=ordered code,
+ * where the in-ram extents might be locked pending data=ordered completion.
+ *
+ * This also copies inline extents directly into the page.
+ */
+
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create)
+{
+ int ret;
+ int err = 0;
+ u64 extent_start = 0;
+ u64 extent_end = 0;
+ u64 objectid = btrfs_ino(inode);
+ u32 found_type;
+ struct btrfs_path *path = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *item;
+ struct extent_buffer *leaf;
+ struct btrfs_key found_key;
+ struct extent_map *em = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_trans_handle *trans = NULL;
+ const bool new_inline = !page || create;
+
+again:
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em)
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ read_unlock(&em_tree->lock);
+
+ if (em) {
+ if (em->start > start || em->start + em->len <= start)
+ free_extent_map(em);
+ else if (em->block_start == EXTENT_MAP_INLINE && page)
+ free_extent_map(em);
+ else
+ goto out;
+ }
+ em = alloc_extent_map();
+ if (!em) {
+ err = -ENOMEM;
+ goto out;
+ }
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->start = EXTENT_MAP_HOLE;
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->len = (u64)-1;
+ em->block_len = (u64)-1;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /*
+ * Chances are we'll be called again, so go ahead and do
+ * readahead
+ */
+ path->reada = 1;
+ }
+
+ ret = btrfs_lookup_file_extent(trans, root, path,
+ objectid, start, trans != NULL);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ if (ret != 0) {
+ if (path->slots[0] == 0)
+ goto not_found;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ /* are we inside the extent that was found? */
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = found_key.type;
+ if (found_key.objectid != objectid ||
+ found_type != BTRFS_EXTENT_DATA_KEY) {
+ /*
+ * If we backup past the first extent we want to move forward
+ * and see if there is an extent in front of us, otherwise we'll
+ * say there is a hole for our whole search range which can
+ * cause problems.
+ */
+ extent_end = start;
+ goto next;
+ }
+
+ found_type = btrfs_file_extent_type(leaf, item);
+ extent_start = found_key.offset;
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_end = extent_start +
+ btrfs_file_extent_num_bytes(leaf, item);
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ size_t size;
+ size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
+ extent_end = ALIGN(extent_start + size, root->sectorsize);
+ }
+next:
+ if (start >= extent_end) {
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0)
+ goto not_found;
+ leaf = path->nodes[0];
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != objectid ||
+ found_key.type != BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+ if (start + len <= found_key.offset)
+ goto not_found;
+ if (start > found_key.offset)
+ goto next;
+ em->start = start;
+ em->orig_start = start;
+ em->len = found_key.offset - start;
+ goto not_found_em;
+ }
+
+ btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ goto insert;
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ unsigned long ptr;
+ char *map;
+ size_t size;
+ size_t extent_offset;
+ size_t copy_size;
+
+ if (new_inline)
+ goto out;
+
+ size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
+ extent_offset = page_offset(page) + pg_offset - extent_start;
+ copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
+ size - extent_offset);
+ em->start = extent_start + extent_offset;
+ em->len = ALIGN(copy_size, root->sectorsize);
+ em->orig_block_len = em->len;
+ em->orig_start = em->start;
+ ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+ if (create == 0 && !PageUptodate(page)) {
+ if (btrfs_file_extent_compression(leaf, item) !=
+ BTRFS_COMPRESS_NONE) {
+ ret = uncompress_inline(path, inode, page,
+ pg_offset,
+ extent_offset, item);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ } else {
+ map = kmap(page);
+ read_extent_buffer(leaf, map + pg_offset, ptr,
+ copy_size);
+ if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+ memset(map + pg_offset + copy_size, 0,
+ PAGE_CACHE_SIZE - pg_offset -
+ copy_size);
+ }
+ kunmap(page);
+ }
+ flush_dcache_page(page);
+ } else if (create && PageUptodate(page)) {
+ BUG();
+ if (!trans) {
+ kunmap(page);
+ free_extent_map(em);
+ em = NULL;
+
+ btrfs_release_path(path);
+ trans = btrfs_join_transaction(root);
+
+ if (IS_ERR(trans))
+ return ERR_CAST(trans);
+ goto again;
+ }
+ map = kmap(page);
+ write_extent_buffer(leaf, map + pg_offset, ptr,
+ copy_size);
+ kunmap(page);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ set_extent_uptodate(io_tree, em->start,
+ extent_map_end(em) - 1, NULL, GFP_NOFS);
+ goto insert;
+ }
+not_found:
+ em->start = start;
+ em->orig_start = start;
+ em->len = len;
+not_found_em:
+ em->block_start = EXTENT_MAP_HOLE;
+ set_bit(EXTENT_FLAG_VACANCY, &em->flags);
+insert:
+ btrfs_release_path(path);
+ if (em->start > start || extent_map_end(em) <= start) {
+ btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
+ em->start, em->len, start, len);
+ err = -EIO;
+ goto out;
+ }
+
+ err = 0;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ /* it is possible that someone inserted the extent into the tree
+ * while we had the lock dropped. It is also possible that
+ * an overlapping map exists in the tree
+ */
+ if (ret == -EEXIST) {
+ struct extent_map *existing;
+
+ ret = 0;
+
+ existing = search_extent_mapping(em_tree, start, len);
+ /*
+ * existing will always be non-NULL, since there must be
+ * extent causing the -EEXIST.
+ */
+ if (start >= extent_map_end(existing) ||
+ start <= existing->start) {
+ /*
+ * The existing extent map is the one nearest to
+ * the [start, start + len) range which overlaps
+ */
+ err = merge_extent_mapping(em_tree, existing,
+ em, start);
+ free_extent_map(existing);
+ if (err) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ } else {
+ free_extent_map(em);
+ em = existing;
+ err = 0;
+ }
+ }
+ write_unlock(&em_tree->lock);
+out:
+
+ trace_btrfs_get_extent(root, em);
+
+ if (path)
+ btrfs_free_path(path);
+ if (trans) {
+ ret = btrfs_end_transaction(trans, root);
+ if (!err)
+ err = ret;
+ }
+ if (err) {
+ free_extent_map(em);
+ return ERR_PTR(err);
+ }
+ BUG_ON(!em); /* Error is always set */
+ return em;
+}
+
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create)
+{
+ struct extent_map *em;
+ struct extent_map *hole_em = NULL;
+ u64 range_start = start;
+ u64 end;
+ u64 found;
+ u64 found_end;
+ int err = 0;
+
+ em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+ if (IS_ERR(em))
+ return em;
+ if (em) {
+ /*
+ * if our em maps to
+ * - a hole or
+ * - a pre-alloc extent,
+ * there might actually be delalloc bytes behind it.
+ */
+ if (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ return em;
+ else
+ hole_em = em;
+ }
+
+ /* check to see if we've wrapped (len == -1 or similar) */
+ end = start + len;
+ if (end < start)
+ end = (u64)-1;
+ else
+ end -= 1;
+
+ em = NULL;
+
+ /* ok, we didn't find anything, lets look for delalloc */
+ found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
+ end, len, EXTENT_DELALLOC, 1);
+ found_end = range_start + found;
+ if (found_end < range_start)
+ found_end = (u64)-1;
+
+ /*
+ * we didn't find anything useful, return
+ * the original results from get_extent()
+ */
+ if (range_start > end || found_end <= start) {
+ em = hole_em;
+ hole_em = NULL;
+ goto out;
+ }
+
+ /* adjust the range_start to make sure it doesn't
+ * go backwards from the start they passed in
+ */
+ range_start = max(start, range_start);
+ found = found_end - range_start;
+
+ if (found > 0) {
+ u64 hole_start = start;
+ u64 hole_len = len;
+
+ em = alloc_extent_map();
+ if (!em) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /*
+ * when btrfs_get_extent can't find anything it
+ * returns one huge hole
+ *
+ * make sure what it found really fits our range, and
+ * adjust to make sure it is based on the start from
+ * the caller
+ */
+ if (hole_em) {
+ u64 calc_end = extent_map_end(hole_em);
+
+ if (calc_end <= start || (hole_em->start > end)) {
+ free_extent_map(hole_em);
+ hole_em = NULL;
+ } else {
+ hole_start = max(hole_em->start, start);
+ hole_len = calc_end - hole_start;
+ }
+ }
+ em->bdev = NULL;
+ if (hole_em && range_start > hole_start) {
+ /* our hole starts before our delalloc, so we
+ * have to return just the parts of the hole
+ * that go until the delalloc starts
+ */
+ em->len = min(hole_len,
+ range_start - hole_start);
+ em->start = hole_start;
+ em->orig_start = hole_start;
+ /*
+ * don't adjust block start at all,
+ * it is fixed at EXTENT_MAP_HOLE
+ */
+ em->block_start = hole_em->block_start;
+ em->block_len = hole_len;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ } else {
+ em->start = range_start;
+ em->len = found;
+ em->orig_start = range_start;
+ em->block_start = EXTENT_MAP_DELALLOC;
+ em->block_len = found;
+ }
+ } else if (hole_em) {
+ return hole_em;
+ }
+out:
+
+ free_extent_map(hole_em);
+ if (err) {
+ free_extent_map(em);
+ return ERR_PTR(err);
+ }
+ return em;
+}
+
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+ u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map *em;
+ struct btrfs_key ins;
+ u64 alloc_hint;
+ int ret;
+
+ alloc_hint = get_extent_allocation_hint(inode, start, len);
+ ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
+ alloc_hint, &ins, 1, 1);
+ if (ret)
+ return ERR_PTR(ret);
+
+ em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+ ins.offset, ins.offset, ins.offset, 0);
+ if (IS_ERR(em)) {
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+ return em;
+ }
+
+ ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+ ins.offset, ins.offset, 0);
+ if (ret) {
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+ free_extent_map(em);
+ return ERR_PTR(ret);
+ }
+
+ return em;
+}
+
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+ u64 *orig_start, u64 *orig_block_len,
+ u64 *ram_bytes)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ u64 backref_offset;
+ u64 extent_end;
+ u64 num_bytes;
+ int slot;
+ int found_type;
+ bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ offset, 0);
+ if (ret < 0)
+ goto out;
+
+ slot = path->slots[0];
+ if (ret == 1) {
+ if (slot == 0) {
+ /* can't find the item, must cow */
+ ret = 0;
+ goto out;
+ }
+ slot--;
+ }
+ ret = 0;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* not our file or wrong item type, must cow */
+ goto out;
+ }
+
+ if (key.offset > offset) {
+ /* Wrong offset, must cow */
+ goto out;
+ }
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (found_type != BTRFS_FILE_EXTENT_REG &&
+ found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+ /* not a regular extent, must cow */
+ goto out;
+ }
+
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_end <= offset)
+ goto out;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (disk_bytenr == 0)
+ goto out;
+
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out;
+
+ backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (orig_start) {
+ *orig_start = key.offset - backref_offset;
+ *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ }
+
+ if (btrfs_extent_readonly(root, disk_bytenr))
+ goto out;
+
+ num_bytes = min(offset + *len, extent_end) - offset;
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 range_end;
+
+ range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+ ret = test_range_bit(io_tree, offset, range_end,
+ EXTENT_DELALLOC, 0, NULL);
+ if (ret) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * look for other files referencing this extent, if we
+ * find any we must cow
+ */
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
+ key.offset - backref_offset, disk_bytenr);
+ btrfs_end_transaction(trans, root);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * adjust disk_bytenr and num_bytes to cover just the bytes
+ * in this extent we are about to write. If there
+ * are any csums in that range we have to cow in order
+ * to keep the csums correct
+ */
+ disk_bytenr += backref_offset;
+ disk_bytenr += offset - key.offset;
+ if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ goto out;
+ /*
+ * all of the above have passed, it is safe to overwrite this extent
+ * without cow
+ */
+ *len = num_bytes;
+ ret = 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
+{
+ struct radix_tree_root *root = &inode->i_mapping->page_tree;
+ int found = false;
+ void **pagep = NULL;
+ struct page *page = NULL;
+ int start_idx;
+ int end_idx;
+
+ start_idx = start >> PAGE_CACHE_SHIFT;
+
+ /*
+ * end is the last byte in the last page. end == start is legal
+ */
+ end_idx = end >> PAGE_CACHE_SHIFT;
+
+ rcu_read_lock();
+
+ /* Most of the code in this while loop is lifted from
+ * find_get_page. It's been modified to begin searching from a
+ * page and return just the first page found in that range. If the
+ * found idx is less than or equal to the end idx then we know that
+ * a page exists. If no pages are found or if those pages are
+ * outside of the range then we're fine (yay!) */
+ while (page == NULL &&
+ radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
+ page = radix_tree_deref_slot(pagep);
+ if (unlikely(!page))
+ break;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ page = NULL;
+ continue;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so return it without
+ * attempting to raise page count.
+ */
+ page = NULL;
+ break; /* TODO: Is this relevant for this use case? */
+ }
+
+ if (!page_cache_get_speculative(page)) {
+ page = NULL;
+ continue;
+ }
+
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != *pagep)) {
+ page_cache_release(page);
+ page = NULL;
+ }
+ }
+
+ if (page) {
+ if (page->index <= end_idx)
+ found = true;
+ page_cache_release(page);
+ }
+
+ rcu_read_unlock();
+ return found;
+}
+
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+ struct extent_state **cached_state, int writing)
+{
+ struct btrfs_ordered_extent *ordered;
+ int ret = 0;
+
+ while (1) {
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, cached_state);
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure theres no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either, we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered &&
+ (!writing ||
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)))
+ break;
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state, GFP_NOFS);
+
+ if (ordered) {
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /* Screw you mmap */
+ ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
+ if (ret)
+ break;
+ ret = filemap_fdatawait_range(inode->i_mapping,
+ lockstart,
+ lockend);
+ if (ret)
+ break;
+
+ /*
+ * If we found a page that couldn't be invalidated just
+ * fall back to buffered.
+ */
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ lockstart >> PAGE_CACHE_SHIFT,
+ lockend >> PAGE_CACHE_SHIFT);
+ if (ret)
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+ u64 len, u64 orig_start,
+ u64 block_start, u64 block_len,
+ u64 orig_block_len, u64 ram_bytes,
+ int type)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ em = alloc_extent_map();
+ if (!em)
+ return ERR_PTR(-ENOMEM);
+
+ em->start = start;
+ em->orig_start = orig_start;
+ em->mod_start = start;
+ em->mod_len = len;
+ em->len = len;
+ em->block_len = block_len;
+ em->block_start = block_start;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->orig_block_len = orig_block_len;
+ em->ram_bytes = ram_bytes;
+ em->generation = -1;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ if (type == BTRFS_ORDERED_PREALLOC)
+ set_bit(EXTENT_FLAG_FILLING, &em->flags);
+
+ do {
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ } while (ret == -EEXIST);
+
+ if (ret) {
+ free_extent_map(em);
+ return ERR_PTR(ret);
+ }
+
+ return em;
+}
+
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
+ u64 start = iblock << inode->i_blkbits;
+ u64 lockstart, lockend;
+ u64 len = bh_result->b_size;
+ u64 *outstanding_extents = NULL;
+ int unlock_bits = EXTENT_LOCKED;
+ int ret = 0;
+
+ if (create)
+ unlock_bits |= EXTENT_DIRTY;
+ else
+ len = min_t(u64, len, root->sectorsize);
+
+ lockstart = start;
+ lockend = start + len - 1;
+
+ if (current->journal_info) {
+ /*
+ * Need to pull our outstanding extents and set journal_info to NULL so
+ * that anything that needs to check if there's a transction doesn't get
+ * confused.
+ */
+ outstanding_extents = current->journal_info;
+ current->journal_info = NULL;
+ }
+
+ /*
+ * If this errors out it's because we couldn't invalidate pagecache for
+ * this range and we need to fallback to buffered.
+ */
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+ return -ENOTBLK;
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
+
+ /*
+ * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+ * io. INLINE is special, and we could probably kludge it in here, but
+ * it's still buffered so for safety lets just fall back to the generic
+ * buffered path.
+ *
+ * For COMPRESSED we _have_ to read the entire extent in so we can
+ * decompress it, so there will be buffering required no matter what we
+ * do, so go ahead and fallback to buffered.
+ *
+ * We return -ENOTBLK because thats what makes DIO go ahead and go back
+ * to buffered IO. Don't blame me, this is the price we pay for using
+ * the generic code.
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ em->block_start == EXTENT_MAP_INLINE) {
+ free_extent_map(em);
+ ret = -ENOTBLK;
+ goto unlock_err;
+ }
+
+ /* Just a good old fashioned hole, return */
+ if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ free_extent_map(em);
+ goto unlock_err;
+ }
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if (!create) {
+ len = min(len, em->len - (start - em->start));
+ lockstart = start + len;
+ goto unlock;
+ }
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->block_start != EXTENT_MAP_HOLE)) {
+ int type;
+ u64 block_start, orig_start, orig_block_len, ram_bytes;
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = em->block_start + (start - em->start);
+
+ if (can_nocow_extent(inode, start, &len, &orig_start,
+ &orig_block_len, &ram_bytes) == 1) {
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ em = create_pinned_em(inode, start, len,
+ orig_start,
+ block_start, len,
+ orig_block_len,
+ ram_bytes, type);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
+ }
+
+ ret = btrfs_add_ordered_extent_dio(inode, start,
+ block_start, len, len, type);
+ if (ret) {
+ free_extent_map(em);
+ goto unlock_err;
+ }
+ goto unlock;
+ }
+ }
+
+ /*
+ * this will cow the extent, reset the len in case we changed
+ * it above
+ */
+ len = bh_result->b_size;
+ free_extent_map(em);
+ em = btrfs_new_extent_direct(inode, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
+ len = min(len, em->len - (start - em->start));
+unlock:
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = em->bdev;
+ set_buffer_mapped(bh_result);
+ if (create) {
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
+
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+
+ /*
+ * If we have an outstanding_extents count still set then we're
+ * within our reservation, otherwise we need to adjust our inode
+ * counter appropriately.
+ */
+ if (*outstanding_extents) {
+ (*outstanding_extents)--;
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+
+ current->journal_info = outstanding_extents;
+ btrfs_free_reserved_data_space(inode, len);
+ }
+
+ /*
+ * In the case of write we need to clear and unlock the entire range,
+ * in the case of read we need to unlock only the end area that we
+ * aren't using if there is any left over space.
+ */
+ if (lockstart < lockend) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, unlock_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ } else {
+ free_extent_state(cached_state);
+ }
+
+ free_extent_map(em);
+
+ return 0;
+
+unlock_err:
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+ if (outstanding_extents)
+ current->journal_info = outstanding_extents;
+ return ret;
+}
+
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int rw, int mirror_num)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ BUG_ON(rw & REQ_WRITE);
+
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DIO_REPAIR);
+ if (ret)
+ goto err;
+
+ ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+ bio_put(bio);
+ return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+ struct bio *failed_bio,
+ struct io_failure_record *failrec,
+ int failed_mirror)
+{
+ int num_copies;
+
+ num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+ failrec->logical, failrec->len);
+ if (num_copies == 1) {
+ /*
+ * we only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. no
+ * matter what the error is, it is very likely to persist.
+ */
+ pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
+ failrec->this_mirror++;
+
+ if (failrec->this_mirror > num_copies) {
+ pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+ num_copies, failrec->this_mirror, failed_mirror);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ int failed_mirror, bio_end_io_t *repair_endio,
+ void *repair_arg)
+{
+ struct io_failure_record *failrec;
+ struct bio *bio;
+ int isector;
+ int read_mode;
+ int ret;
+
+ BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+ ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+ if (ret)
+ return ret;
+
+ ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+ failed_mirror);
+ if (!ret) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ if (failed_bio->bi_vcnt > 1)
+ read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+ else
+ read_mode = READ_SYNC;
+
+ isector = start - btrfs_io_bio(failed_bio)->logical;
+ isector >>= inode->i_sb->s_blocksize_bits;
+ bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+ 0, isector, repair_endio, repair_arg);
+ if (!bio) {
+ free_io_failure(inode, failrec);
+ return -EIO;
+ }
+
+ btrfs_debug(BTRFS_I(inode)->root->fs_info,
+ "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+ read_mode, failrec->this_mirror, failrec->in_validation);
+
+ ret = submit_dio_repair_bio(inode, bio, read_mode,
+ failrec->this_mirror);
+ if (ret) {
+ free_io_failure(inode, failrec);
+ bio_put(bio);
+ }
+
+ return ret;
+}
+
+struct btrfs_retry_complete {
+ struct completion done;
+ struct inode *inode;
+ u64 start;
+ int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+ struct btrfs_retry_complete *done = bio->bi_private;
+ struct bio_vec *bvec;
+ int i;
+
+ if (err)
+ goto end;
+
+ done->uptodate = 1;
+ bio_for_each_segment_all(bvec, bio, i)
+ clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+ complete(&done->done);
+ bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+ struct btrfs_io_bio *io_bio)
+{
+ struct bio_vec *bvec;
+ struct btrfs_retry_complete done;
+ u64 start;
+ int i;
+ int ret;
+
+ start = io_bio->logical;
+ done.inode = inode;
+
+ bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+ done.uptodate = 0;
+ done.start = start;
+ init_completion(&done.done);
+
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+ start + bvec->bv_len - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio_nocsum, &done);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&done.done);
+
+ if (!done.uptodate) {
+ /* We might have another mirror, so try again */
+ goto try_again;
+ }
+
+ start += bvec->bv_len;
+ }
+
+ return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+ struct btrfs_retry_complete *done = bio->bi_private;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct bio_vec *bvec;
+ int uptodate;
+ int ret;
+ int i;
+
+ if (err)
+ goto end;
+
+ uptodate = 1;
+ bio_for_each_segment_all(bvec, bio, i) {
+ ret = __readpage_endio_check(done->inode, io_bio, i,
+ bvec->bv_page, 0,
+ done->start, bvec->bv_len);
+ if (!ret)
+ clean_io_failure(done->inode, done->start,
+ bvec->bv_page, 0);
+ else
+ uptodate = 0;
+ }
+
+ done->uptodate = uptodate;
+end:
+ complete(&done->done);
+ bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, int err)
+{
+ struct bio_vec *bvec;
+ struct btrfs_retry_complete done;
+ u64 start;
+ u64 offset = 0;
+ int i;
+ int ret;
+
+ err = 0;
+ start = io_bio->logical;
+ done.inode = inode;
+
+ bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+ ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+ 0, start, bvec->bv_len);
+ if (likely(!ret))
+ goto next;
+try_again:
+ done.uptodate = 0;
+ done.start = start;
+ init_completion(&done.done);
+
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+ start + bvec->bv_len - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio, &done);
+ if (ret) {
+ err = ret;
+ goto next;
+ }
+
+ wait_for_completion(&done.done);
+
+ if (!done.uptodate) {
+ /* We might have another mirror, so try again */
+ goto try_again;
+ }
+next:
+ offset += bvec->bv_len;
+ start += bvec->bv_len;
+ }
+
+ return err;
+}
+
+static int btrfs_subio_endio_read(struct inode *inode,
+ struct btrfs_io_bio *io_bio, int err)
+{
+ bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ if (skip_csum) {
+ if (unlikely(err))
+ return __btrfs_correct_data_nocsum(inode, io_bio);
+ else
+ return 0;
+ } else {
+ return __btrfs_subio_endio_read(inode, io_bio, err);
+ }
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct bio *dio_bio;
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+ if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+ err = btrfs_subio_endio_read(inode, io_bio, err);
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+ dip->logical_offset + dip->bytes - 1);
+ dio_bio = dip->dio_bio;
+
+ kfree(dip);
+
+ /* If we had a csum failure make sure to clear the uptodate flag */
+ if (err)
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ dio_end_io(dio_bio, err);
+
+ if (io_bio->end_io)
+ io_bio->end_io(io_bio, err);
+ bio_put(bio);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered = NULL;
+ u64 ordered_offset = dip->logical_offset;
+ u64 ordered_bytes = dip->bytes;
+ struct bio *dio_bio;
+ int ret;
+
+ if (err)
+ goto out_done;
+again:
+ ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
+ &ordered_offset,
+ ordered_bytes, !err);
+ if (!ret)
+ goto out_test;
+
+ btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
+ finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(root->fs_info->endio_write_workers,
+ &ordered->work);
+out_test:
+ /*
+ * our bio might span multiple ordered extents. If we haven't
+ * completed the accounting for the whole dio, go back and try again
+ */
+ if (ordered_offset < dip->logical_offset + dip->bytes) {
+ ordered_bytes = dip->logical_offset + dip->bytes -
+ ordered_offset;
+ ordered = NULL;
+ goto again;
+ }
+out_done:
+ dio_bio = dip->dio_bio;
+
+ kfree(dip);
+
+ /* If we had an error make sure to clear the uptodate flag */
+ if (err)
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ dio_end_io(dio_bio, err);
+ bio_put(bio);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags, u64 offset)
+{
+ int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+ BUG_ON(ret); /* -ENOMEM */
+ return 0;
+}
+
+static void btrfs_end_dio_bio(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+
+ if (err)
+ btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+ "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+ btrfs_ino(dip->inode), bio->bi_rw,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, err);
+
+ if (dip->subio_endio)
+ err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
+ if (err) {
+ dip->errors = 1;
+
+ /*
+ * before atomic variable goto zero, we must make sure
+ * dip->errors is perceived to be set.
+ */
+ smp_mb__before_atomic();
+ }
+
+ /* if there are more bios still pending for this dio, just exit */
+ if (!atomic_dec_and_test(&dip->pending_bios))
+ goto out;
+
+ if (dip->errors) {
+ bio_io_error(dip->orig_bio);
+ } else {
+ set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
+ bio_endio(dip->orig_bio, 0);
+ }
+out:
+ bio_put(bio);
+}
+
+static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
+ u64 first_sector, gfp_t gfp_flags)
+{
+ int nr_vecs = bio_get_nr_vecs(bdev);
+ return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+}
+
+static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_dio_private *dip,
+ struct bio *bio,
+ u64 file_offset)
+{
+ struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+ int ret;
+
+ /*
+ * We load all the csum data we need when we submit
+ * the first bio to reduce the csum tree search and
+ * contention.
+ */
+ if (dip->logical_offset == file_offset) {
+ ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
+ file_offset);
+ if (ret)
+ return ret;
+ }
+
+ if (bio == dip->orig_bio)
+ return 0;
+
+ file_offset -= dip->logical_offset;
+ file_offset >>= inode->i_sb->s_blocksize_bits;
+ io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+
+ return 0;
+}
+
+static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+ int rw, u64 file_offset, int skip_sum,
+ int async_submit)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ int write = rw & REQ_WRITE;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (async_submit)
+ async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
+ bio_get(bio);
+
+ if (!write) {
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+ BTRFS_WQ_ENDIO_DATA);
+ if (ret)
+ goto err;
+ }
+
+ if (skip_sum)
+ goto map;
+
+ if (write && async_submit) {
+ ret = btrfs_wq_submit_bio(root->fs_info,
+ inode, rw, bio, 0, 0,
+ file_offset,
+ __btrfs_submit_bio_start_direct_io,
+ __btrfs_submit_bio_done);
+ goto err;
+ } else if (write) {
+ /*
+ * If we aren't doing async submit, calculate the csum of the
+ * bio now.
+ */
+ ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
+ if (ret)
+ goto err;
+ } else {
+ ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
+ file_offset);
+ if (ret)
+ goto err;
+ }
+map:
+ ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
+err:
+ bio_put(bio);
+ return ret;
+}
+
+static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
+ int skip_sum)
+{
+ struct inode *inode = dip->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct bio *bio;
+ struct bio *orig_bio = dip->orig_bio;
+ struct bio_vec *bvec = orig_bio->bi_io_vec;
+ u64 start_sector = orig_bio->bi_iter.bi_sector;
+ u64 file_offset = dip->logical_offset;
+ u64 submit_len = 0;
+ u64 map_length;
+ int nr_pages = 0;
+ int ret;
+ int async_submit = 0;
+
+ map_length = orig_bio->bi_iter.bi_size;
+ ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
+ &map_length, NULL, 0);
+ if (ret)
+ return -EIO;
+
+ if (map_length >= orig_bio->bi_iter.bi_size) {
+ bio = orig_bio;
+ dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
+ goto submit;
+ }
+
+ /* async crcs make it difficult to collect full stripe writes. */
+ if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ async_submit = 0;
+ else
+ async_submit = 1;
+
+ bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_private = dip;
+ bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
+ atomic_inc(&dip->pending_bios);
+
+ while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
+ if (map_length < submit_len + bvec->bv_len ||
+ bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+ bvec->bv_offset) < bvec->bv_len) {
+ /*
+ * inc the count before we submit the bio so
+ * we know the end IO handler won't happen before
+ * we inc the count. Otherwise, the dip might get freed
+ * before we're done setting it up
+ */
+ atomic_inc(&dip->pending_bios);
+ ret = __btrfs_submit_dio_bio(bio, inode, rw,
+ file_offset, skip_sum,
+ async_submit);
+ if (ret) {
+ bio_put(bio);
+ atomic_dec(&dip->pending_bios);
+ goto out_err;
+ }
+
+ start_sector += submit_len >> 9;
+ file_offset += submit_len;
+
+ submit_len = 0;
+ nr_pages = 0;
+
+ bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
+ start_sector, GFP_NOFS);
+ if (!bio)
+ goto out_err;
+ bio->bi_private = dip;
+ bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_io_bio(bio)->logical = file_offset;
+
+ map_length = orig_bio->bi_iter.bi_size;
+ ret = btrfs_map_block(root->fs_info, rw,
+ start_sector << 9,
+ &map_length, NULL, 0);
+ if (ret) {
+ bio_put(bio);
+ goto out_err;
+ }
+ } else {
+ submit_len += bvec->bv_len;
+ nr_pages++;
+ bvec++;
+ }
+ }
+
+submit:
+ ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
+ async_submit);
+ if (!ret)
+ return 0;
+
+ bio_put(bio);
+out_err:
+ dip->errors = 1;
+ /*
+ * before atomic variable goto zero, we must
+ * make sure dip->errors is perceived to be set.
+ */
+ smp_mb__before_atomic();
+ if (atomic_dec_and_test(&dip->pending_bios))
+ bio_io_error(dip->orig_bio);
+
+ /* bio_end_io() will handle error, so we needn't return it */
+ return 0;
+}
+
+static void btrfs_submit_direct(int rw, struct bio *dio_bio,
+ struct inode *inode, loff_t file_offset)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_dio_private *dip;
+ struct bio *io_bio;
+ struct btrfs_io_bio *btrfs_bio;
+ int skip_sum;
+ int write = rw & REQ_WRITE;
+ int ret = 0;
+
+ skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
+ if (!io_bio) {
+ ret = -ENOMEM;
+ goto free_ordered;
+ }
+
+ dip = kzalloc(sizeof(*dip), GFP_NOFS);
+ if (!dip) {
+ ret = -ENOMEM;
+ goto free_io_bio;
+ }
+
+ dip->private = dio_bio->bi_private;
+ dip->inode = inode;
+ dip->logical_offset = file_offset;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
+ io_bio->bi_private = dip;
+ dip->orig_bio = io_bio;
+ dip->dio_bio = dio_bio;
+ atomic_set(&dip->pending_bios, 0);
+ btrfs_bio = btrfs_io_bio(io_bio);
+ btrfs_bio->logical = file_offset;
+
+ if (write) {
+ io_bio->bi_end_io = btrfs_endio_direct_write;
+ } else {
+ io_bio->bi_end_io = btrfs_endio_direct_read;
+ dip->subio_endio = btrfs_subio_endio_read;
+ }
+
+ ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
+ if (!ret)
+ return;
+
+ if (btrfs_bio->end_io)
+ btrfs_bio->end_io(btrfs_bio, ret);
+free_io_bio:
+ bio_put(io_bio);
+
+free_ordered:
+ /*
+ * If this is a write, we need to clean up the reserved space and kill
+ * the ordered extent.
+ */
+ if (write) {
+ struct btrfs_ordered_extent *ordered;
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ btrfs_free_reserved_extent(root, ordered->start,
+ ordered->disk_len, 1);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ }
+ bio_endio(dio_bio, ret);
+}
+
+static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int seg;
+ int i;
+ unsigned blocksize_mask = root->sectorsize - 1;
+ ssize_t retval = -EINVAL;
+
+ if (offset & blocksize_mask)
+ goto out;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ goto out;
+
+ /* If this is a write we don't need to check anymore */
+ if (iov_iter_rw(iter) == WRITE)
+ return 0;
+ /*
+ * Check to make sure we don't have duplicate iov_base's in this
+ * iovec, if so return EINVAL, otherwise we'll get csum errors
+ * when reading back.
+ */
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+ goto out;
+ }
+ }
+ retval = 0;
+out:
+ return retval;
+}
+
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ u64 outstanding_extents = 0;
+ size_t count = 0;
+ int flags = 0;
+ bool wakeup = true;
+ bool relock = false;
+ ssize_t ret;
+
+ if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
+ return 0;
+
+ inode_dio_begin(inode);
+ smp_mb__after_atomic();
+
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so
+ * we need to flush the dirty pages again to make absolutely sure
+ * that any outstanding dirty pages are on disk.
+ */
+ count = iov_iter_count(iter);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_fdatawrite_range(inode->i_mapping, offset,
+ offset + count - 1);
+
+ if (iov_iter_rw(iter) == WRITE) {
+ /*
+ * If the write DIO is beyond the EOF, we need update
+ * the isize, but it is protected by i_mutex. So we can
+ * not unlock the i_mutex at this case.
+ */
+ if (offset + count <= inode->i_size) {
+ mutex_unlock(&inode->i_mutex);
+ relock = true;
+ }
+ ret = btrfs_delalloc_reserve_space(inode, count);
+ if (ret)
+ goto out;
+ outstanding_extents = div64_u64(count +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+
+ /*
+ * We need to know how many extents we reserved so that we can
+ * do the accounting properly if we go over the number we
+ * originally calculated. Abuse current->journal_info for this.
+ */
+ current->journal_info = &outstanding_extents;
+ } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+ &BTRFS_I(inode)->runtime_flags)) {
+ inode_dio_end(inode);
+ flags = DIO_LOCKING | DIO_SKIP_HOLES;
+ wakeup = false;
+ }
+
+ ret = __blockdev_direct_IO(iocb, inode,
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iter, offset, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, flags);
+ if (iov_iter_rw(iter) == WRITE) {
+ current->journal_info = NULL;
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ btrfs_delalloc_release_space(inode, count);
+ else if (ret >= 0 && (size_t)ret < count)
+ btrfs_delalloc_release_space(inode,
+ count - (size_t)ret);
+ }
+out:
+ if (wakeup)
+ inode_dio_end(inode);
+ if (relock)
+ mutex_lock(&inode->i_mutex);
+
+ return ret;
+}
+
+#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
+
+static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ int ret;
+
+ ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+ if (ret)
+ return ret;
+
+ return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
+}
+
+int btrfs_readpage(struct file *file, struct page *page)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_read_full_page(tree, page, btrfs_get_extent, 0);
+}
+
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+
+
+ if (current->flags & PF_MEMALLOC) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+}
+
+static int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+
+ tree = &BTRFS_I(mapping->host)->io_tree;
+ return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
+}
+
+static int
+btrfs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(mapping->host)->io_tree;
+ return extent_readpages(tree, mapping, pages, nr_pages,
+ btrfs_get_extent);
+}
+static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *map;
+ int ret;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ map = &BTRFS_I(page->mapping->host)->extent_tree;
+ ret = try_release_extent_mapping(map, tree, page, gfp_flags);
+ if (ret == 1) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+ return ret;
+}
+
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ if (PageWriteback(page) || PageDirty(page))
+ return 0;
+ return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
+}
+
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *tree;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ u64 page_start = page_offset(page);
+ u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+ int inode_evicting = inode->i_state & I_FREEING;
+
+ /*
+ * we have the page locked, so new writeback can't start,
+ * and the dirty bit won't be cleared while we are here.
+ *
+ * Wait for IO on this page so that we can safely clear
+ * the PagePrivate2 bit and do ordered accounting
+ */
+ wait_on_page_writeback(page);
+
+ tree = &BTRFS_I(inode)->io_tree;
+ if (offset) {
+ btrfs_releasepage(page, GFP_NOFS);
+ return;
+ }
+
+ if (!inode_evicting)
+ lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ /*
+ * IO on this page will never be started, so we need
+ * to account for any ordered extents now
+ */
+ if (!inode_evicting)
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 0, &cached_state,
+ GFP_NOFS);
+ /*
+ * whoever cleared the private bit is responsible
+ * for the finish_ordered_io
+ */
+ if (TestClearPagePrivate2(page)) {
+ struct btrfs_ordered_inode_tree *tree;
+ u64 new_len;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+
+ spin_lock_irq(&tree->lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ new_len = page_start - ordered->file_offset;
+ if (new_len < ordered->truncated_len)
+ ordered->truncated_len = new_len;
+ spin_unlock_irq(&tree->lock);
+
+ if (btrfs_dec_test_ordered_pending(inode, &ordered,
+ page_start,
+ PAGE_CACHE_SIZE, 1))
+ btrfs_finish_ordered_io(ordered);
+ }
+ btrfs_put_ordered_extent(ordered);
+ if (!inode_evicting) {
+ cached_state = NULL;
+ lock_extent_bits(tree, page_start, page_end, 0,
+ &cached_state);
+ }
+ }
+
+ if (!inode_evicting) {
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_LOCKED | EXTENT_DIRTY |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 1,
+ &cached_state, GFP_NOFS);
+
+ __btrfs_releasepage(page, GFP_NOFS);
+ }
+
+ ClearPageChecked(page);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+}
+
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF. Because
+ * vmtruncate() writes the inode size before removing pages, once we have the
+ * page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ char *kaddr;
+ unsigned long zero_start;
+ loff_t size;
+ int ret;
+ int reserved = 0;
+ u64 page_start;
+ u64 page_end;
+
+ sb_start_pagefault(inode->i_sb);
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ if (!ret) {
+ ret = file_update_time(vma->vm_file);
+ reserved = 1;
+ }
+ if (ret) {
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else /* -ENOSPC, -EIO, etc */
+ ret = VM_FAULT_SIGBUS;
+ if (reserved)
+ goto out;
+ goto out_noreserve;
+ }
+
+ ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+again:
+ lock_page(page);
+ size = i_size_read(inode);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ if ((page->mapping != inode->i_mapping) ||
+ (page_start >= size)) {
+ /* page got truncated out from underneath us */
+ goto out_unlock;
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ set_page_extent_mapped(page);
+
+ /*
+ * we can't set the delalloc bits if there are pending ordered
+ * extents. Drop our locks and wait for them to finish
+ */
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent_cached(io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
+ unlock_page(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ /*
+ * XXX - page_mkwrite gets called every time the page is dirtied, even
+ * if it was already dirty, so for space accounting reasons we need to
+ * clear any delalloc bits for the range we are fixing to save. There
+ * is probably a better way to do this, but for now keep consistent with
+ * prepare_pages in the normal write path.
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, &cached_state, GFP_NOFS);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+ &cached_state);
+ if (ret) {
+ unlock_extent_cached(io_tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
+ ret = VM_FAULT_SIGBUS;
+ goto out_unlock;
+ }
+ ret = 0;
+
+ /* page is wholly or partially inside EOF */
+ if (page_start + PAGE_CACHE_SIZE > size)
+ zero_start = size & ~PAGE_CACHE_MASK;
+ else
+ zero_start = PAGE_CACHE_SIZE;
+
+ if (zero_start != PAGE_CACHE_SIZE) {
+ kaddr = kmap(page);
+ memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+ flush_dcache_page(page);
+ kunmap(page);
+ }
+ ClearPageChecked(page);
+ set_page_dirty(page);
+ SetPageUptodate(page);
+
+ BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+
+ unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
+
+out_unlock:
+ if (!ret) {
+ sb_end_pagefault(inode->i_sb);
+ return VM_FAULT_LOCKED;
+ }
+ unlock_page(page);
+out:
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+out_noreserve:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+static int btrfs_truncate(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *rsv;
+ int ret = 0;
+ int err = 0;
+ struct btrfs_trans_handle *trans;
+ u64 mask = root->sectorsize - 1;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+
+ ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+ (u64)-1);
+ if (ret)
+ return ret;
+
+ /*
+ * Yes ladies and gentelment, this is indeed ugly. The fact is we have
+ * 3 things going on here
+ *
+ * 1) We need to reserve space for our orphan item and the space to
+ * delete our orphan item. Lord knows we don't want to have a dangling
+ * orphan item because we didn't reserve space to remove it.
+ *
+ * 2) We need to reserve space to update our inode.
+ *
+ * 3) We need to have something to cache all the space that is going to
+ * be free'd up by the truncate operation, but also have some slack
+ * space reserved in case it uses space during the truncate (thank you
+ * very much snapshotting).
+ *
+ * And we need these to all be seperate. The fact is we can use alot of
+ * space doing the truncate, and we have no earthly idea how much space
+ * we will use, so we need the truncate reservation to be seperate so it
+ * doesn't end up using space reserved for updating the inode or
+ * removing the orphan item. We also need to be able to stop the
+ * transaction and start a new one, which means we need to be able to
+ * update the inode several times, and we have no idea of knowing how
+ * many times that will be, so we can't just reserve 1 item for the
+ * entirety of the opration, so that has to be done seperately as well.
+ * Then there is the orphan item, which does indeed need to be held on
+ * to for the whole operation, and we need nobody to touch this reserved
+ * space except the orphan code.
+ *
+ * So that leaves us with
+ *
+ * 1) root->orphan_block_rsv - for the orphan deletion.
+ * 2) rsv - for the truncate reservation, which we will steal from the
+ * transaction reservation.
+ * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
+ * updating the inode.
+ */
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv)
+ return -ENOMEM;
+ rsv->size = min_size;
+ rsv->failfast = 1;
+
+ /*
+ * 1 for the truncate slack space
+ * 1 for updating the inode.
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out;
+ }
+
+ /* Migrate the slack space for the truncate to our reserve */
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ min_size);
+ BUG_ON(ret);
+
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ */
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ trans->block_rsv = rsv;
+
+ while (1) {
+ ret = btrfs_truncate_inode_items(trans, root, inode,
+ inode->i_size,
+ BTRFS_EXTENT_DATA_KEY);
+ if (ret != -ENOSPC && ret != -EAGAIN) {
+ err = ret;
+ break;
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = err = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ rsv, min_size);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
+ }
+
+ if (ret == 0 && inode->i_nlink > 0) {
+ trans->block_rsv = root->orphan_block_rsv;
+ ret = btrfs_orphan_del(trans, inode);
+ if (ret)
+ err = ret;
+ }
+
+ if (trans) {
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && !err)
+ err = ret;
+
+ ret = btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+ }
+
+out:
+ btrfs_free_block_rsv(root, rsv);
+
+ if (ret && !err)
+ err = ret;
+
+ return err;
+}
+
+/*
+ * create a new subvolume directory/inode (helper for the ioctl).
+ */
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *new_root,
+ struct btrfs_root *parent_root,
+ u64 new_dirid)
+{
+ struct inode *inode;
+ int err;
+ u64 index = 0;
+
+ inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
+ new_dirid, new_dirid,
+ S_IFDIR | (~current_umask() & S_IRWXUGO),
+ &index);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+
+ set_nlink(inode, 1);
+ btrfs_i_size_write(inode, 0);
+ unlock_new_inode(inode);
+
+ err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
+ if (err)
+ btrfs_err(new_root->fs_info,
+ "error inheriting subvolume %llu properties: %d",
+ new_root->root_key.objectid, err);
+
+ err = btrfs_update_inode(trans, new_root, inode);
+
+ iput(inode);
+ return err;
+}
+
+struct inode *btrfs_alloc_inode(struct super_block *sb)
+{
+ struct btrfs_inode *ei;
+ struct inode *inode;
+
+ ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+ if (!ei)
+ return NULL;
+
+ ei->root = NULL;
+ ei->generation = 0;
+ ei->last_trans = 0;
+ ei->last_sub_trans = 0;
+ ei->logged_trans = 0;
+ ei->delalloc_bytes = 0;
+ ei->defrag_bytes = 0;
+ ei->disk_i_size = 0;
+ ei->flags = 0;
+ ei->csum_bytes = 0;
+ ei->index_cnt = (u64)-1;
+ ei->dir_index = 0;
+ ei->last_unlink_trans = 0;
+ ei->last_log_commit = 0;
+
+ spin_lock_init(&ei->lock);
+ ei->outstanding_extents = 0;
+ ei->reserved_extents = 0;
+
+ ei->runtime_flags = 0;
+ ei->force_compress = BTRFS_COMPRESS_NONE;
+
+ ei->delayed_node = NULL;
+
+ ei->i_otime.tv_sec = 0;
+ ei->i_otime.tv_nsec = 0;
+
+ inode = &ei->vfs_inode;
+ extent_map_tree_init(&ei->extent_tree);
+ extent_io_tree_init(&ei->io_tree, &inode->i_data);
+ extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+ ei->io_tree.track_uptodate = 1;
+ ei->io_failure_tree.track_uptodate = 1;
+ atomic_set(&ei->sync_writers, 0);
+ mutex_init(&ei->log_mutex);
+ mutex_init(&ei->delalloc_mutex);
+ btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+ INIT_LIST_HEAD(&ei->delalloc_inodes);
+ RB_CLEAR_NODE(&ei->rb_node);
+
+ return inode;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode)
+{
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+#endif
+
+static void btrfs_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
+void btrfs_destroy_inode(struct inode *inode)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ WARN_ON(!hlist_empty(&inode->i_dentry));
+ WARN_ON(inode->i_data.nrpages);
+ WARN_ON(BTRFS_I(inode)->outstanding_extents);
+ WARN_ON(BTRFS_I(inode)->reserved_extents);
+ WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+ WARN_ON(BTRFS_I(inode)->csum_bytes);
+ WARN_ON(BTRFS_I(inode)->defrag_bytes);
+
+ /*
+ * This can happen where we create an inode, but somebody else also
+ * created the same inode and we need to destroy the one we already
+ * created.
+ */
+ if (!root)
+ goto free;
+
+ if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags)) {
+ btrfs_info(root->fs_info, "inode %llu still on the orphan list",
+ btrfs_ino(inode));
+ atomic_dec(&root->orphan_inodes);
+ }
+
+ while (1) {
+ ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+ if (!ordered)
+ break;
+ else {
+ btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
+ ordered->file_offset, ordered->len);
+ btrfs_remove_ordered_extent(inode, ordered);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ }
+ }
+ inode_tree_del(inode);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+free:
+ call_rcu(&inode->i_rcu, btrfs_i_callback);
+}
+
+int btrfs_drop_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (root == NULL)
+ return 1;
+
+ /* the snap/subvol tree is on deleting */
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 1;
+ else
+ return generic_drop_inode(inode);
+}
+
+static void init_once(void *foo)
+{
+ struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+
+ inode_init_once(&ei->vfs_inode);
+}
+
+void btrfs_destroy_cachep(void)
+{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ if (btrfs_inode_cachep)
+ kmem_cache_destroy(btrfs_inode_cachep);
+ if (btrfs_trans_handle_cachep)
+ kmem_cache_destroy(btrfs_trans_handle_cachep);
+ if (btrfs_transaction_cachep)
+ kmem_cache_destroy(btrfs_transaction_cachep);
+ if (btrfs_path_cachep)
+ kmem_cache_destroy(btrfs_path_cachep);
+ if (btrfs_free_space_cachep)
+ kmem_cache_destroy(btrfs_free_space_cachep);
+ if (btrfs_delalloc_work_cachep)
+ kmem_cache_destroy(btrfs_delalloc_work_cachep);
+}
+
+int btrfs_init_cachep(void)
+{
+ btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
+ sizeof(struct btrfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+ if (!btrfs_inode_cachep)
+ goto fail;
+
+ btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
+ sizeof(struct btrfs_trans_handle), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_trans_handle_cachep)
+ goto fail;
+
+ btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
+ sizeof(struct btrfs_transaction), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_transaction_cachep)
+ goto fail;
+
+ btrfs_path_cachep = kmem_cache_create("btrfs_path",
+ sizeof(struct btrfs_path), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_path_cachep)
+ goto fail;
+
+ btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
+ sizeof(struct btrfs_free_space), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_free_space_cachep)
+ goto fail;
+
+ btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+ sizeof(struct btrfs_delalloc_work), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_delalloc_work_cachep)
+ goto fail;
+
+ return 0;
+fail:
+ btrfs_destroy_cachep();
+ return -ENOMEM;
+}
+
+static int btrfs_getattr(struct vfsmount *mnt,
+ struct dentry *dentry, struct kstat *stat)
+{
+ u64 delalloc_bytes;
+ struct inode *inode = d_inode(dentry);
+ u32 blocksize = inode->i_sb->s_blocksize;
+
+ generic_fillattr(inode, stat);
+ stat->dev = BTRFS_I(inode)->root->anon_dev;
+ stat->blksize = PAGE_CACHE_SIZE;
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
+ ALIGN(delalloc_bytes, blocksize)) >> 9;
+ return 0;
+}
+
+static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+ struct inode *new_inode = d_inode(new_dentry);
+ struct inode *old_inode = d_inode(old_dentry);
+ struct timespec ctime = CURRENT_TIME;
+ u64 index = 0;
+ u64 root_objectid;
+ int ret;
+ u64 old_ino = btrfs_ino(old_inode);
+
+ if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return -EPERM;
+
+ /* we only allow rename subvolume link between subvolumes */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ return -EXDEV;
+
+ if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+ (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
+ return -ENOTEMPTY;
+
+ if (S_ISDIR(old_inode->i_mode) && new_inode &&
+ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ return -ENOTEMPTY;
+
+
+ /* check for collisions, even if the name isn't there */
+ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+
+ if (ret) {
+ if (ret == -EEXIST) {
+ /* we shouldn't get
+ * eexist without a new_inode */
+ if (WARN_ON(!new_inode)) {
+ return ret;
+ }
+ } else {
+ /* maybe -EOVERFLOW */
+ return ret;
+ }
+ }
+ ret = 0;
+
+ /*
+ * we're using rename to replace one file with another. Start IO on it
+ * now so we don't add too much work to the end of the transaction
+ */
+ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
+ filemap_flush(old_inode->i_mapping);
+
+ /* close the racy window with snapshot create/destroy ioctl */
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&root->fs_info->subvol_sem);
+ /*
+ * We want to reserve the absolute worst case amount of items. So if
+ * both inodes are subvols and we need to unlink them then that would
+ * require 4 item modifications, but if they are both normal inodes it
+ * would require 5 item modifications, so we'll assume their normal
+ * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+ * should cover the worst case number of items we'll modify.
+ */
+ trans = btrfs_start_transaction(root, 11);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
+
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+
+ ret = btrfs_set_inode_index(new_dir, &index);
+ if (ret)
+ goto out_fail;
+
+ BTRFS_I(old_inode)->dir_index = 0ULL;
+ if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ /* force full log commit if subvolume involved. */
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ } else {
+ ret = btrfs_insert_inode_ref(trans, dest,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ old_ino,
+ btrfs_ino(new_dir), index);
+ if (ret)
+ goto out_fail;
+ /*
+ * this is an ugly little race, but the rename is required
+ * to make sure that if we crash, the inode is either at the
+ * old name or the new one. pinning the log transaction lets
+ * us make sure we don't allow a log commit to come in after
+ * we unlink the name but before we add the new name back in.
+ */
+ btrfs_pin_log_trans(root);
+ }
+
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
+ inode_inc_iversion(old_inode);
+ old_dir->i_ctime = old_dir->i_mtime = ctime;
+ new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_inode->i_ctime = ctime;
+
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
+ if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ } else {
+ ret = __btrfs_unlink_inode(trans, root, old_dir,
+ d_inode(old_dentry),
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, old_inode);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ if (new_inode) {
+ inode_inc_iversion(new_inode);
+ new_inode->i_ctime = CURRENT_TIME;
+ if (unlikely(btrfs_ino(new_inode) ==
+ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ root_objectid = BTRFS_I(new_inode)->location.objectid;
+ ret = btrfs_unlink_subvol(trans, dest, new_dir,
+ root_objectid,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ BUG_ON(new_inode->i_nlink == 0);
+ } else {
+ ret = btrfs_unlink_inode(trans, dest, new_dir,
+ d_inode(new_dentry),
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ }
+ if (!ret && new_inode->i_nlink == 0)
+ ret = btrfs_orphan_add(trans, d_inode(new_dentry));
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+ }
+
+ ret = btrfs_add_link(trans, new_dir, old_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len, 0, index);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_fail;
+ }
+
+ if (old_inode->i_nlink == 1)
+ BTRFS_I(old_inode)->dir_index = index;
+
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ struct dentry *parent = new_dentry->d_parent;
+ btrfs_log_new_name(trans, old_inode, old_dir, parent);
+ btrfs_end_log_trans(root);
+ }
+out_fail:
+ btrfs_end_transaction(trans, root);
+out_notrans:
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&root->fs_info->subvol_sem);
+
+ return ret;
+}
+
+static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
+ return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+ struct btrfs_delalloc_work *delalloc_work;
+ struct inode *inode;
+
+ delalloc_work = container_of(work, struct btrfs_delalloc_work,
+ work);
+ inode = delalloc_work->inode;
+ if (delalloc_work->wait) {
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ } else {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+
+ if (delalloc_work->delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+ complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+ int wait, int delay_iput)
+{
+ struct btrfs_delalloc_work *work;
+
+ work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+ if (!work)
+ return NULL;
+
+ init_completion(&work->completion);
+ INIT_LIST_HEAD(&work->list);
+ work->inode = inode;
+ work->wait = wait;
+ work->delay_iput = delay_iput;
+ WARN_ON_ONCE(!inode);
+ btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
+ btrfs_run_delalloc_work, NULL, NULL);
+
+ return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+ wait_for_completion(&work->completion);
+ kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
+
+/*
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ */
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
+ int nr)
+{
+ struct btrfs_inode *binode;
+ struct inode *inode;
+ struct btrfs_delalloc_work *work, *next;
+ struct list_head works;
+ struct list_head splice;
+ int ret = 0;
+
+ INIT_LIST_HEAD(&works);
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->delalloc_mutex);
+ spin_lock(&root->delalloc_lock);
+ list_splice_init(&root->delalloc_inodes, &splice);
+ while (!list_empty(&splice)) {
+ binode = list_entry(splice.next, struct btrfs_inode,
+ delalloc_inodes);
+
+ list_move_tail(&binode->delalloc_inodes,
+ &root->delalloc_inodes);
+ inode = igrab(&binode->vfs_inode);
+ if (!inode) {
+ cond_resched_lock(&root->delalloc_lock);
+ continue;
+ }
+ spin_unlock(&root->delalloc_lock);
+
+ work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ if (!work) {
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+ ret = -ENOMEM;
+ goto out;
+ }
+ list_add_tail(&work->list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ ret++;
+ if (nr != -1 && ret >= nr)
+ goto out;
+ cond_resched();
+ spin_lock(&root->delalloc_lock);
+ }
+ spin_unlock(&root->delalloc_lock);
+
+out:
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&root->delalloc_lock);
+ list_splice_tail(&splice, &root->delalloc_inodes);
+ spin_unlock(&root->delalloc_lock);
+ }
+ mutex_unlock(&root->delalloc_mutex);
+ return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+ int ret;
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ return -EROFS;
+
+ ret = __start_delalloc_inodes(root, delay_iput, -1);
+ if (ret > 0)
+ ret = 0;
+ /*
+ * the filemap_flush will queue IO into the worker threads, but
+ * we have to make sure the IO is actually started and that
+ * ordered extents get created before we return
+ */
+ atomic_inc(&root->fs_info->async_submit_draining);
+ while (atomic_read(&root->fs_info->nr_async_submits) ||
+ atomic_read(&root->fs_info->async_delalloc_pages)) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+ atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&root->fs_info->async_submit_draining);
+ return ret;
+}
+
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+ int ret;
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ return -EROFS;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&fs_info->delalloc_root_mutex);
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_init(&fs_info->delalloc_roots, &splice);
+ while (!list_empty(&splice) && nr) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ delalloc_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->delalloc_root,
+ &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ ret = __start_delalloc_inodes(root, delay_iput, nr);
+ btrfs_put_fs_root(root);
+ if (ret < 0)
+ goto out;
+
+ if (nr != -1) {
+ nr -= ret;
+ WARN_ON(nr < 0);
+ }
+ spin_lock(&fs_info->delalloc_root_lock);
+ }
+ spin_unlock(&fs_info->delalloc_root_lock);
+
+ ret = 0;
+ atomic_inc(&fs_info->async_submit_draining);
+ while (atomic_read(&fs_info->nr_async_submits) ||
+ atomic_read(&fs_info->async_delalloc_pages)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0 &&
+ atomic_read(&fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&fs_info->async_submit_draining);
+out:
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&fs_info->delalloc_root_lock);
+ list_splice_tail(&splice, &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
+ }
+ mutex_unlock(&fs_info->delalloc_root_mutex);
+ return ret;
+}
+
+static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct inode *inode = NULL;
+ int err;
+ int drop_inode = 0;
+ u64 objectid;
+ u64 index = 0;
+ int name_len;
+ int datasize;
+ unsigned long ptr;
+ struct btrfs_file_extent_item *ei;
+ struct extent_buffer *leaf;
+
+ name_len = strlen(symname);
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
+ return -ENAMETOOLONG;
+
+ /*
+ * 2 items for inode item and ref
+ * 2 items for dir items
+ * 1 item for xattr if selinux is on
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_unlock;
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len, btrfs_ino(dir), objectid,
+ S_IFLNK|S_IRWXUGO, &index);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_unlock;
+ }
+
+ /*
+ * If the active LSM wants to access the inode during
+ * d_instantiate it needs these. Smack checks to see
+ * if the filesystem supports xattrs by looking at the
+ * ops vector.
+ */
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_aops;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ if (err)
+ goto out_unlock_inode;
+
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+ if (err)
+ goto out_unlock_inode;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out_unlock_inode;
+ }
+ key.objectid = btrfs_ino(inode);
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ datasize = btrfs_file_extent_calc_inline_size(name_len);
+ err = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ if (err) {
+ btrfs_free_path(path);
+ goto out_unlock_inode;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+ btrfs_set_file_extent_type(leaf, ei,
+ BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
+ ptr = btrfs_file_extent_inline_start(ei);
+ write_extent_buffer(leaf, symname, ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode_set_bytes(inode, name_len);
+ btrfs_i_size_write(inode, name_len);
+ err = btrfs_update_inode(trans, root, inode);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock_inode;
+ }
+
+ unlock_new_inode(inode);
+ d_instantiate(dentry, inode);
+
+out_unlock:
+ btrfs_end_transaction(trans, root);
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root);
+ return err;
+
+out_unlock_inode:
+ drop_inode = 1;
+ unlock_new_inode(inode);
+ goto out_unlock;
+}
+
+static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint,
+ struct btrfs_trans_handle *trans)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key ins;
+ u64 cur_offset = start;
+ u64 i_size;
+ u64 cur_bytes;
+ int ret = 0;
+ bool own_trans = true;
+
+ if (trans)
+ own_trans = false;
+ while (num_bytes > 0) {
+ if (own_trans) {
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ }
+
+ cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+ cur_bytes = max(cur_bytes, min_size);
+ ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
+ *alloc_hint, &ins, 1, 0);
+ if (ret) {
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ break;
+ }
+
+ ret = insert_reserved_file_extent(trans, inode,
+ cur_offset, ins.objectid,
+ ins.offset, ins.offset,
+ ins.offset, 0, 0, 0,
+ BTRFS_FILE_EXTENT_PREALLOC);
+ if (ret) {
+ btrfs_free_reserved_extent(root, ins.objectid,
+ ins.offset, 0);
+ btrfs_abort_transaction(trans, root, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ break;
+ }
+
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + ins.offset -1, 0);
+
+ em = alloc_extent_map();
+ if (!em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ goto next;
+ }
+
+ em->start = cur_offset;
+ em->orig_start = cur_offset;
+ em->len = ins.offset;
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->orig_block_len = ins.offset;
+ em->ram_bytes = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->generation = trans->transid;
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + ins.offset - 1,
+ 0);
+ }
+ free_extent_map(em);
+next:
+ num_bytes -= ins.offset;
+ cur_offset += ins.offset;
+ *alloc_hint = ins.objectid + ins.offset;
+
+ inode_inc_iversion(inode);
+ inode->i_ctime = CURRENT_TIME;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (actual_len > inode->i_size) &&
+ (cur_offset > inode->i_size)) {
+ if (cur_offset > actual_len)
+ i_size = actual_len;
+ else
+ i_size = cur_offset;
+ i_size_write(inode, i_size);
+ btrfs_ordered_update_i_size(inode, i_size, NULL);
+ }
+
+ ret = btrfs_update_inode(trans, root, inode);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ break;
+ }
+
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
+ }
+ return ret;
+}
+
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint)
+{
+ return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+ min_size, actual_len, alloc_hint,
+ NULL);
+}
+
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+ struct btrfs_trans_handle *trans, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint)
+{
+ return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+ min_size, actual_len, alloc_hint, trans);
+}
+
+static int btrfs_set_page_dirty(struct page *page)
+{
+ return __set_page_dirty_nobuffers(page);
+}
+
+static int btrfs_permission(struct inode *inode, int mask)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ umode_t mode = inode->i_mode;
+
+ if (mask & MAY_WRITE &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
+ return -EACCES;
+ }
+ return generic_permission(inode, mask);
+}
+
+static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ u64 objectid;
+ u64 index;
+ int ret = 0;
+
+ /*
+ * 5 units required for adding orphan entry
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_find_free_ino(root, &objectid);
+ if (ret)
+ goto out;
+
+ inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ btrfs_ino(dir), objectid, mode, &index);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ goto out;
+ }
+
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+
+ inode->i_mapping->a_ops = &btrfs_aops;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+ ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ if (ret)
+ goto out_inode;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto out_inode;
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret)
+ goto out_inode;
+
+ /*
+ * We set number of links to 0 in btrfs_new_inode(), and here we set
+ * it to 1 because d_tmpfile() will issue a warning if the count is 0,
+ * through:
+ *
+ * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
+ */
+ set_nlink(inode, 1);
+ unlock_new_inode(inode);
+ d_tmpfile(dentry, inode);
+ mark_inode_dirty(inode);
+
+out:
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ iput(inode);
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
+ return ret;
+
+out_inode:
+ unlock_new_inode(inode);
+ goto out;
+
+}
+
+/* Inspired by filemap_check_errors() */
+int btrfs_inode_check_errors(struct inode *inode)
+{
+ int ret = 0;
+
+ if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
+ test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
+ ret = -ENOSPC;
+ if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
+ test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
+ ret = -EIO;
+
+ return ret;
+}
+
+static const struct inode_operations btrfs_dir_inode_operations = {
+ .getattr = btrfs_getattr,
+ .lookup = btrfs_lookup,
+ .create = btrfs_create,
+ .unlink = btrfs_unlink,
+ .link = btrfs_link,
+ .mkdir = btrfs_mkdir,
+ .rmdir = btrfs_rmdir,
+ .rename2 = btrfs_rename2,
+ .symlink = btrfs_symlink,
+ .setattr = btrfs_setattr,
+ .mknod = btrfs_mknod,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+ .permission = btrfs_permission,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
+ .tmpfile = btrfs_tmpfile,
+};
+static const struct inode_operations btrfs_dir_ro_inode_operations = {
+ .lookup = btrfs_lookup,
+ .permission = btrfs_permission,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
+};
+
+static const struct file_operations btrfs_dir_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate = btrfs_real_readdir,
+ .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = btrfs_ioctl,
+#endif
+ .release = btrfs_release_file,
+ .fsync = btrfs_sync_file,
+};
+
+static struct extent_io_ops btrfs_extent_io_ops = {
+ .fill_delalloc = run_delalloc_range,
+ .submit_bio_hook = btrfs_submit_bio_hook,
+ .merge_bio_hook = btrfs_merge_bio_hook,
+ .readpage_end_io_hook = btrfs_readpage_end_io_hook,
+ .writepage_end_io_hook = btrfs_writepage_end_io_hook,
+ .writepage_start_hook = btrfs_writepage_start_hook,
+ .set_bit_hook = btrfs_set_bit_hook,
+ .clear_bit_hook = btrfs_clear_bit_hook,
+ .merge_extent_hook = btrfs_merge_extent_hook,
+ .split_extent_hook = btrfs_split_extent_hook,
+};
+
+/*
+ * btrfs doesn't support the bmap operation because swapfiles
+ * use bmap to make a mapping of extents in the file. They assume
+ * these extents won't change over the life of the file and they
+ * use the bmap result to do IO directly to the drive.
+ *
+ * the btrfs bmap call would return logical addresses that aren't
+ * suitable for IO and they also will change frequently as COW
+ * operations happen. So, swapfile + btrfs == corruption.
+ *
+ * For now we're avoiding this by dropping bmap.
+ */
+static const struct address_space_operations btrfs_aops = {
+ .readpage = btrfs_readpage,
+ .writepage = btrfs_writepage,
+ .writepages = btrfs_writepages,
+ .readpages = btrfs_readpages,
+ .direct_IO = btrfs_direct_IO,
+ .invalidatepage = btrfs_invalidatepage,
+ .releasepage = btrfs_releasepage,
+ .set_page_dirty = btrfs_set_page_dirty,
+ .error_remove_page = generic_error_remove_page,
+};
+
+static const struct address_space_operations btrfs_symlink_aops = {
+ .readpage = btrfs_readpage,
+ .writepage = btrfs_writepage,
+ .invalidatepage = btrfs_invalidatepage,
+ .releasepage = btrfs_releasepage,
+};
+
+static const struct inode_operations btrfs_file_inode_operations = {
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+ .permission = btrfs_permission,
+ .fiemap = btrfs_fiemap,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
+};
+static const struct inode_operations btrfs_special_inode_operations = {
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .permission = btrfs_permission,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+ .get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
+ .update_time = btrfs_update_time,
+};
+static const struct inode_operations btrfs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .permission = btrfs_permission,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+ .update_time = btrfs_update_time,
+};
+
+const struct dentry_operations btrfs_dentry_operations = {
+ .d_delete = btrfs_dentry_delete,
+ .d_release = btrfs_dentry_release,
+};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000..37d456a9a
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,5373 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/security.h>
+#include <linux/xattr.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/uuid.h>
+#include <linux/btrfs.h>
+#include <linux/uaccess.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "locking.h"
+#include "inode-map.h"
+#include "backref.h"
+#include "rcu-string.h"
+#include "send.h"
+#include "dev-replace.h"
+#include "props.h"
+#include "sysfs.h"
+#include "qgroup.h"
+
+#ifdef CONFIG_64BIT
+/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
+ * structures are incorrect, as the timespec structure from userspace
+ * is 4 bytes too small. We define these alternatives here to teach
+ * the kernel about the 32-bit struct packing.
+ */
+struct btrfs_ioctl_timespec_32 {
+ __u64 sec;
+ __u32 nsec;
+} __attribute__ ((__packed__));
+
+struct btrfs_ioctl_received_subvol_args_32 {
+ char uuid[BTRFS_UUID_SIZE]; /* in */
+ __u64 stransid; /* in */
+ __u64 rtransid; /* out */
+ struct btrfs_ioctl_timespec_32 stime; /* in */
+ struct btrfs_ioctl_timespec_32 rtime; /* out */
+ __u64 flags; /* in */
+ __u64 reserved[16]; /* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+ struct btrfs_ioctl_received_subvol_args_32)
+#endif
+
+
+static int btrfs_clone(struct inode *src, struct inode *inode,
+ u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & ~FS_DIRSYNC_FL;
+ else
+ return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
+}
+
+/*
+ * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
+ */
+static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
+{
+ unsigned int iflags = 0;
+
+ if (flags & BTRFS_INODE_SYNC)
+ iflags |= FS_SYNC_FL;
+ if (flags & BTRFS_INODE_IMMUTABLE)
+ iflags |= FS_IMMUTABLE_FL;
+ if (flags & BTRFS_INODE_APPEND)
+ iflags |= FS_APPEND_FL;
+ if (flags & BTRFS_INODE_NODUMP)
+ iflags |= FS_NODUMP_FL;
+ if (flags & BTRFS_INODE_NOATIME)
+ iflags |= FS_NOATIME_FL;
+ if (flags & BTRFS_INODE_DIRSYNC)
+ iflags |= FS_DIRSYNC_FL;
+ if (flags & BTRFS_INODE_NODATACOW)
+ iflags |= FS_NOCOW_FL;
+
+ if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
+ iflags |= FS_COMPR_FL;
+ else if (flags & BTRFS_INODE_NOCOMPRESS)
+ iflags |= FS_NOCOMP_FL;
+
+ return iflags;
+}
+
+/*
+ * Update inode->i_flags based on the btrfs internal flags.
+ */
+void btrfs_update_iflags(struct inode *inode)
+{
+ struct btrfs_inode *ip = BTRFS_I(inode);
+ unsigned int new_fl = 0;
+
+ if (ip->flags & BTRFS_INODE_SYNC)
+ new_fl |= S_SYNC;
+ if (ip->flags & BTRFS_INODE_IMMUTABLE)
+ new_fl |= S_IMMUTABLE;
+ if (ip->flags & BTRFS_INODE_APPEND)
+ new_fl |= S_APPEND;
+ if (ip->flags & BTRFS_INODE_NOATIME)
+ new_fl |= S_NOATIME;
+ if (ip->flags & BTRFS_INODE_DIRSYNC)
+ new_fl |= S_DIRSYNC;
+
+ set_mask_bits(&inode->i_flags,
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
+ new_fl);
+}
+
+/*
+ * Inherit flags from the parent inode.
+ *
+ * Currently only the compression flags and the cow flags are inherited.
+ */
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
+{
+ unsigned int flags;
+
+ if (!dir)
+ return;
+
+ flags = BTRFS_I(dir)->flags;
+
+ if (flags & BTRFS_INODE_NOCOMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ } else if (flags & BTRFS_INODE_COMPRESS) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ }
+
+ if (flags & BTRFS_INODE_NODATACOW) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ }
+
+ btrfs_update_iflags(inode);
+}
+
+static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
+{
+ struct btrfs_inode *ip = BTRFS_I(file_inode(file));
+ unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
+
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ return -EFAULT;
+ return 0;
+}
+
+static int check_flags(unsigned int flags)
+{
+ if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+ FS_NOATIME_FL | FS_NODUMP_FL | \
+ FS_SYNC_FL | FS_DIRSYNC_FL | \
+ FS_NOCOMP_FL | FS_COMPR_FL |
+ FS_NOCOW_FL))
+ return -EOPNOTSUPP;
+
+ if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_inode *ip = BTRFS_I(inode);
+ struct btrfs_root *root = ip->root;
+ struct btrfs_trans_handle *trans;
+ unsigned int flags, oldflags;
+ int ret;
+ u64 ip_oldflags;
+ unsigned int i_oldflags;
+ umode_t mode;
+
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ if (copy_from_user(&flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ ret = check_flags(flags);
+ if (ret)
+ return ret;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+
+ ip_oldflags = ip->flags;
+ i_oldflags = inode->i_flags;
+ mode = inode->i_mode;
+
+ flags = btrfs_mask_flags(inode->i_mode, flags);
+ oldflags = btrfs_flags_to_ioctl(ip->flags);
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ }
+
+ if (flags & FS_SYNC_FL)
+ ip->flags |= BTRFS_INODE_SYNC;
+ else
+ ip->flags &= ~BTRFS_INODE_SYNC;
+ if (flags & FS_IMMUTABLE_FL)
+ ip->flags |= BTRFS_INODE_IMMUTABLE;
+ else
+ ip->flags &= ~BTRFS_INODE_IMMUTABLE;
+ if (flags & FS_APPEND_FL)
+ ip->flags |= BTRFS_INODE_APPEND;
+ else
+ ip->flags &= ~BTRFS_INODE_APPEND;
+ if (flags & FS_NODUMP_FL)
+ ip->flags |= BTRFS_INODE_NODUMP;
+ else
+ ip->flags &= ~BTRFS_INODE_NODUMP;
+ if (flags & FS_NOATIME_FL)
+ ip->flags |= BTRFS_INODE_NOATIME;
+ else
+ ip->flags &= ~BTRFS_INODE_NOATIME;
+ if (flags & FS_DIRSYNC_FL)
+ ip->flags |= BTRFS_INODE_DIRSYNC;
+ else
+ ip->flags &= ~BTRFS_INODE_DIRSYNC;
+ if (flags & FS_NOCOW_FL) {
+ if (S_ISREG(mode)) {
+ /*
+ * It's safe to turn csums off here, no extents exist.
+ * Otherwise we want the flag to reflect the real COW
+ * status of the file and will not set it.
+ */
+ if (inode->i_size == 0)
+ ip->flags |= BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM;
+ } else {
+ ip->flags |= BTRFS_INODE_NODATACOW;
+ }
+ } else {
+ /*
+ * Revert back under same assuptions as above
+ */
+ if (S_ISREG(mode)) {
+ if (inode->i_size == 0)
+ ip->flags &= ~(BTRFS_INODE_NODATACOW
+ | BTRFS_INODE_NODATASUM);
+ } else {
+ ip->flags &= ~BTRFS_INODE_NODATACOW;
+ }
+ }
+
+ /*
+ * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
+ * flag may be changed automatically if compression code won't make
+ * things smaller.
+ */
+ if (flags & FS_NOCOMP_FL) {
+ ip->flags &= ~BTRFS_INODE_COMPRESS;
+ ip->flags |= BTRFS_INODE_NOCOMPRESS;
+
+ ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+ if (ret && ret != -ENODATA)
+ goto out_drop;
+ } else if (flags & FS_COMPR_FL) {
+ const char *comp;
+
+ ip->flags |= BTRFS_INODE_COMPRESS;
+ ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+
+ if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ comp = "lzo";
+ else
+ comp = "zlib";
+ ret = btrfs_set_prop(inode, "btrfs.compression",
+ comp, strlen(comp), 0);
+ if (ret)
+ goto out_drop;
+
+ } else {
+ ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+ if (ret && ret != -ENODATA)
+ goto out_drop;
+ ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_drop;
+ }
+
+ btrfs_update_iflags(inode);
+ inode_inc_iversion(inode);
+ inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, inode);
+
+ btrfs_end_transaction(trans, root);
+ out_drop:
+ if (ret) {
+ ip->flags = ip_oldflags;
+ inode->i_flags = i_oldflags;
+ }
+
+ out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
+{
+ struct inode *inode = file_inode(file);
+
+ return put_user(inode->i_generation, arg);
+}
+
+static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
+ struct btrfs_device *device;
+ struct request_queue *q;
+ struct fstrim_range range;
+ u64 minlen = ULLONG_MAX;
+ u64 num_devices = 0;
+ u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
+ dev_list) {
+ if (!device->bdev)
+ continue;
+ q = bdev_get_queue(device->bdev);
+ if (blk_queue_discard(q)) {
+ num_devices++;
+ minlen = min((u64)q->limits.discard_granularity,
+ minlen);
+ }
+ }
+ rcu_read_unlock();
+
+ if (!num_devices)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&range, arg, sizeof(range)))
+ return -EFAULT;
+ if (range.start > total_bytes ||
+ range.len < fs_info->sb->s_blocksize)
+ return -EINVAL;
+
+ range.len = min(range.len, total_bytes - range.start);
+ range.minlen = max(range.minlen, minlen);
+ ret = btrfs_trim_fs(fs_info->tree_root, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(arg, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int btrfs_is_empty_uuid(u8 *uuid)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_UUID_SIZE; i++) {
+ if (uuid[i])
+ return 0;
+ }
+ return 1;
+}
+
+static noinline int create_subvol(struct inode *dir,
+ struct dentry *dentry,
+ char *name, int namelen,
+ u64 *async_transid,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key key;
+ struct btrfs_root_item root_item;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *new_root;
+ struct btrfs_block_rsv block_rsv;
+ struct timespec cur_time = CURRENT_TIME;
+ struct inode *inode;
+ int ret;
+ int err;
+ u64 objectid;
+ u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+ u64 index = 0;
+ u64 qgroup_reserved;
+ uuid_le new_uuid;
+
+ ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
+ if (ret)
+ return ret;
+
+ /*
+ * Don't create subvolume whose level is not zero. Or qgroup will be
+ * screwed up since it assume subvolme qgroup's level to be 0.
+ */
+ if (btrfs_qgroup_level(objectid))
+ return -ENOSPC;
+
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * The same as the snapshot creation, please see the comment
+ * of create_snapshot().
+ */
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ 8, &qgroup_reserved, false);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_subvolume_release_metadata(root, &block_rsv,
+ qgroup_reserved);
+ return ret;
+ }
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
+
+ ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
+ if (ret)
+ goto fail;
+
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ if (IS_ERR(leaf)) {
+ ret = PTR_ERR(leaf);
+ goto fail;
+ }
+
+ memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(leaf, leaf->start);
+ btrfs_set_header_generation(leaf, trans->transid);
+ btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(leaf, objectid);
+
+ write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
+ BTRFS_FSID_SIZE);
+ write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+ btrfs_header_chunk_tree_uuid(leaf),
+ BTRFS_UUID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ memset(&root_item, 0, sizeof(root_item));
+
+ inode_item = &root_item.inode;
+ btrfs_set_stack_inode_generation(inode_item, 1);
+ btrfs_set_stack_inode_size(inode_item, 3);
+ btrfs_set_stack_inode_nlink(inode_item, 1);
+ btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
+ btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
+
+ btrfs_set_root_flags(&root_item, 0);
+ btrfs_set_root_limit(&root_item, 0);
+ btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
+
+ btrfs_set_root_bytenr(&root_item, leaf->start);
+ btrfs_set_root_generation(&root_item, trans->transid);
+ btrfs_set_root_level(&root_item, 0);
+ btrfs_set_root_refs(&root_item, 1);
+ btrfs_set_root_used(&root_item, leaf->len);
+ btrfs_set_root_last_snapshot(&root_item, 0);
+
+ btrfs_set_root_generation_v2(&root_item,
+ btrfs_root_generation(&root_item));
+ uuid_le_gen(&new_uuid);
+ memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
+ btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
+ root_item.ctime = root_item.otime;
+ btrfs_set_root_ctransid(&root_item, trans->transid);
+ btrfs_set_root_otransid(&root_item, trans->transid);
+
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(leaf);
+ leaf = NULL;
+
+ btrfs_set_root_dirid(&root_item, new_dirid);
+
+ key.objectid = objectid;
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+ &root_item);
+ if (ret)
+ goto fail;
+
+ key.offset = (u64)-1;
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ if (IS_ERR(new_root)) {
+ btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
+ ret = PTR_ERR(new_root);
+ goto fail;
+ }
+
+ btrfs_record_root_in_trans(trans, new_root);
+
+ ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
+ if (ret) {
+ /* We potentially lose an unused inode item here */
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /*
+ * insert the directory item
+ */
+ ret = btrfs_set_inode_index(dir, &index);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_insert_dir_item(trans, root,
+ name, namelen, dir, &key,
+ BTRFS_FT_DIR, index);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ btrfs_i_size_write(dir, dir->i_size + namelen * 2);
+ ret = btrfs_update_inode(trans, root, dir);
+ BUG_ON(ret);
+
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ objectid, root->root_key.objectid,
+ btrfs_ino(dir), index, name, namelen);
+ BUG_ON(ret);
+
+ ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+ root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+ objectid);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+
+fail:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
+ btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
+
+ if (async_transid) {
+ *async_transid = trans->transid;
+ err = btrfs_commit_transaction_async(trans, root, 1);
+ if (err)
+ err = btrfs_commit_transaction(trans, root);
+ } else {
+ err = btrfs_commit_transaction(trans, root);
+ }
+ if (err && !ret)
+ ret = err;
+
+ if (!ret) {
+ inode = btrfs_lookup_dentry(dir, dentry);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ d_instantiate(dentry, inode);
+ }
+ return ret;
+}
+
+static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(&root->subv_writers->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ writers = percpu_counter_sum(&root->subv_writers->counter);
+ if (writers)
+ schedule();
+
+ finish_wait(&root->subv_writers->wait, &wait);
+ } while (writers);
+}
+
+static int create_snapshot(struct btrfs_root *root, struct inode *dir,
+ struct dentry *dentry, char *name, int namelen,
+ u64 *async_transid, bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ struct inode *inode;
+ struct btrfs_pending_snapshot *pending_snapshot;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ return -EINVAL;
+
+ atomic_inc(&root->will_be_snapshoted);
+ smp_mb__after_atomic();
+ btrfs_wait_for_no_snapshoting_writes(root);
+
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret)
+ goto out;
+
+ btrfs_wait_ordered_extents(root, -1);
+
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+ if (!pending_snapshot) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+ BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * 1 - parent dir inode
+ * 2 - dir entries
+ * 1 - root item
+ * 2 - root ref/backref
+ * 1 - root of snapshot
+ * 1 - UUID item
+ */
+ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
+ &pending_snapshot->block_rsv, 8,
+ &pending_snapshot->qgroup_reserved,
+ false);
+ if (ret)
+ goto free;
+
+ pending_snapshot->dentry = dentry;
+ pending_snapshot->root = root;
+ pending_snapshot->readonly = readonly;
+ pending_snapshot->dir = dir;
+ pending_snapshot->inherit = inherit;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto fail;
+ }
+
+ spin_lock(&root->fs_info->trans_lock);
+ list_add(&pending_snapshot->list,
+ &trans->transaction->pending_snapshots);
+ spin_unlock(&root->fs_info->trans_lock);
+ if (async_transid) {
+ *async_transid = trans->transid;
+ ret = btrfs_commit_transaction_async(trans,
+ root->fs_info->extent_root, 1);
+ if (ret)
+ ret = btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_commit_transaction(trans,
+ root->fs_info->extent_root);
+ }
+ if (ret)
+ goto fail;
+
+ ret = pending_snapshot->error;
+ if (ret)
+ goto fail;
+
+ ret = btrfs_orphan_cleanup(pending_snapshot->snap);
+ if (ret)
+ goto fail;
+
+ inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto fail;
+ }
+
+ d_instantiate(dentry, inode);
+ ret = 0;
+fail:
+ btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
+ &pending_snapshot->block_rsv,
+ pending_snapshot->qgroup_reserved);
+free:
+ kfree(pending_snapshot);
+out:
+ if (atomic_dec_and_test(&root->will_be_snapshoted))
+ wake_up_atomic_t(&root->will_be_snapshoted);
+ return ret;
+}
+
+/* copy of may_delete in fs/namei.c()
+ * Check whether we can remove a link victim from directory dir, check
+ * whether the type of victim is right.
+ * 1. We can't do it if dir is read-only (done in permission())
+ * 2. We should have write and exec permissions on dir
+ * 3. We can't remove anything from append-only dir
+ * 4. We can't do anything with immutable dir (done in permission())
+ * 5. If the sticky bit on dir is set we should either
+ * a. be owner of dir, or
+ * b. be owner of victim, or
+ * c. have CAP_FOWNER capability
+ * 6. If the victim is append-only or immutable we can't do antyhing with
+ * links pointing to it.
+ * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 9. We can't remove a root or mountpoint.
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ * nfs_async_unlink().
+ */
+
+static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
+{
+ int error;
+
+ if (d_really_is_negative(victim))
+ return -ENOENT;
+
+ BUG_ON(d_inode(victim->d_parent) != dir);
+ audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
+
+ error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ if (error)
+ return error;
+ if (IS_APPEND(dir))
+ return -EPERM;
+ if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
+ IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
+ return -EPERM;
+ if (isdir) {
+ if (!d_is_dir(victim))
+ return -ENOTDIR;
+ if (IS_ROOT(victim))
+ return -EBUSY;
+ } else if (d_is_dir(victim))
+ return -EISDIR;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+ if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+ return -EBUSY;
+ return 0;
+}
+
+/* copy of may_create in fs/namei.c() */
+static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+{
+ if (d_really_is_positive(child))
+ return -EEXIST;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+ return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * Create a new subvolume below @parent. This is largely modeled after
+ * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
+ * inside this filesystem so it's quite a bit simpler.
+ */
+static noinline int btrfs_mksubvol(struct path *parent,
+ char *name, int namelen,
+ struct btrfs_root *snap_src,
+ u64 *async_transid, bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ struct inode *dir = d_inode(parent->dentry);
+ struct dentry *dentry;
+ int error;
+
+ error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ if (error == -EINTR)
+ return error;
+
+ dentry = lookup_one_len(name, parent->dentry, namelen);
+ error = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ goto out_unlock;
+
+ error = -EEXIST;
+ if (d_really_is_positive(dentry))
+ goto out_dput;
+
+ error = btrfs_may_create(dir, dentry);
+ if (error)
+ goto out_dput;
+
+ /*
+ * even if this name doesn't exist, we may get hash collisions.
+ * check for them now when we can safely fail
+ */
+ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+ dir->i_ino, name,
+ namelen);
+ if (error)
+ goto out_dput;
+
+ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+
+ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+ goto out_up_read;
+
+ if (snap_src) {
+ error = create_snapshot(snap_src, dir, dentry, name, namelen,
+ async_transid, readonly, inherit);
+ } else {
+ error = create_subvol(dir, dentry, name, namelen,
+ async_transid, inherit);
+ }
+ if (!error)
+ fsnotify_mkdir(dir, dentry);
+out_up_read:
+ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+out_dput:
+ dput(dentry);
+out_unlock:
+ mutex_unlock(&dir->i_mutex);
+ return error;
+}
+
+/*
+ * When we're defragging a range, we don't want to kick it off again
+ * if it is really just waiting for delalloc to send it down.
+ * If we find a nice big extent or delalloc range for the bytes in the
+ * file you want to defrag, we return 0 to let you know to skip this
+ * part of the file
+ */
+static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
+{
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 end;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+ read_unlock(&em_tree->lock);
+
+ if (em) {
+ end = extent_map_end(em);
+ free_extent_map(em);
+ if (end - offset > thresh)
+ return 0;
+ }
+ /* if we already have a nice delalloc here, just stop */
+ thresh /= 2;
+ end = count_range_bits(io_tree, &offset, offset + thresh,
+ thresh, EXTENT_DELALLOC, 1);
+ if (end >= thresh)
+ return 0;
+ return 1;
+}
+
+/*
+ * helper function to walk through a file and find extents
+ * newer than a specific transid, and smaller than thresh.
+ *
+ * This is used by the defragging code to find new and small
+ * extents
+ */
+static int find_new_extents(struct btrfs_root *root,
+ struct inode *inode, u64 newer_than,
+ u64 *off, u32 thresh)
+{
+ struct btrfs_path *path;
+ struct btrfs_key min_key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *extent;
+ int type;
+ int ret;
+ u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_EXTENT_DATA_KEY;
+ min_key.offset = *off;
+
+ while (1) {
+ ret = btrfs_search_forward(root, &min_key, path, newer_than);
+ if (ret != 0)
+ goto none;
+process_slot:
+ if (min_key.objectid != ino)
+ goto none;
+ if (min_key.type != BTRFS_EXTENT_DATA_KEY)
+ goto none;
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ type = btrfs_file_extent_type(leaf, extent);
+ if (type == BTRFS_FILE_EXTENT_REG &&
+ btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
+ check_defrag_in_cache(inode, min_key.offset, thresh)) {
+ *off = min_key.offset;
+ btrfs_free_path(path);
+ return 0;
+ }
+
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(leaf)) {
+ btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
+ goto process_slot;
+ }
+
+ if (min_key.offset == (u64)-1)
+ goto none;
+
+ min_key.offset++;
+ btrfs_release_path(path);
+ }
+none:
+ btrfs_free_path(path);
+ return -ENOENT;
+}
+
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ u64 len = PAGE_CACHE_SIZE;
+
+ /*
+ * hopefully we have this extent in the tree already, try without
+ * the full extent lock
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + len - 1;
+
+ /* get the big lock and read metadata off disk */
+ lock_extent_bits(io_tree, start, end, 0, &cached);
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
+
+ if (IS_ERR(em))
+ return NULL;
+ }
+
+ return em;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+{
+ struct extent_map *next;
+ bool ret = true;
+
+ /* this is the last extent */
+ if (em->start + em->len >= i_size_read(inode))
+ return false;
+
+ next = defrag_lookup_extent(inode, em->start + em->len);
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ ret = false;
+ else if ((em->block_start + em->block_len == next->block_start) &&
+ (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
+ ret = false;
+
+ free_extent_map(next);
+ return ret;
+}
+
+static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
+ u64 *last_len, u64 *skip, u64 *defrag_end,
+ int compress)
+{
+ struct extent_map *em;
+ int ret = 1;
+ bool next_mergeable = true;
+
+ /*
+ * make sure that once we start defragging an extent, we keep on
+ * defragging it
+ */
+ if (start < *defrag_end)
+ return 1;
+
+ *skip = 0;
+
+ em = defrag_lookup_extent(inode, start);
+ if (!em)
+ return 0;
+
+ /* this will cover holes, and inline extents */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ ret = 0;
+ goto out;
+ }
+
+ next_mergeable = defrag_check_next_extent(inode, em);
+ /*
+ * we hit a real extent, if it is big or the next extent is not a
+ * real extent, don't bother defragging it
+ */
+ if (!compress && (*last_len == 0 || *last_len >= thresh) &&
+ (em->len >= thresh || !next_mergeable))
+ ret = 0;
+out:
+ /*
+ * last_len ends up being a counter of how many bytes we've defragged.
+ * every time we choose not to defrag an extent, we reset *last_len
+ * so that the next tiny extent will force a defrag.
+ *
+ * The end result of this is that tiny extents before a single big
+ * extent will force at least part of that big extent to be defragged.
+ */
+ if (ret) {
+ *defrag_end = extent_map_end(em);
+ } else {
+ *last_len = 0;
+ *skip = extent_map_end(em);
+ *defrag_end = 0;
+ }
+
+ free_extent_map(em);
+ return ret;
+}
+
+/*
+ * it doesn't do much good to defrag one or two pages
+ * at a time. This pulls in a nice chunk of pages
+ * to COW and defrag.
+ *
+ * It also makes sure the delalloc code has enough
+ * dirty data to avoid making new small extents as part
+ * of the defrag
+ *
+ * It's a good idea to start RA on this range
+ * before calling this.
+ */
+static int cluster_pages_for_defrag(struct inode *inode,
+ struct page **pages,
+ unsigned long start_index,
+ unsigned long num_pages)
+{
+ unsigned long file_end;
+ u64 isize = i_size_read(inode);
+ u64 page_start;
+ u64 page_end;
+ u64 page_cnt;
+ int ret;
+ int i;
+ int i_done;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ struct extent_io_tree *tree;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+
+ file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (!isize || start_index > file_end)
+ return 0;
+
+ page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+
+ ret = btrfs_delalloc_reserve_space(inode,
+ page_cnt << PAGE_CACHE_SHIFT);
+ if (ret)
+ return ret;
+ i_done = 0;
+ tree = &BTRFS_I(inode)->io_tree;
+
+ /* step one, lock all the pages */
+ for (i = 0; i < page_cnt; i++) {
+ struct page *page;
+again:
+ page = find_or_create_page(inode->i_mapping,
+ start_index + i, mask);
+ if (!page)
+ break;
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+ while (1) {
+ lock_extent_bits(tree, page_start, page_end,
+ 0, &cached_state);
+ ordered = btrfs_lookup_ordered_extent(inode,
+ page_start);
+ unlock_extent_cached(tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
+ if (!ordered)
+ break;
+
+ unlock_page(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * we unlocked the page above, so we need check if
+ * it was released or not.
+ */
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ }
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ break;
+ }
+ }
+
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+
+ pages[i] = page;
+ i_done++;
+ }
+ if (!i_done || ret)
+ goto out;
+
+ if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ goto out;
+
+ /*
+ * so now we have a nice long stream of locked
+ * and up to date pages, lets wait on them
+ */
+ for (i = 0; i < i_done; i++)
+ wait_on_page_writeback(pages[i]);
+
+ page_start = page_offset(pages[0]);
+ page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, 0, &cached_state);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
+ page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+ &cached_state, GFP_NOFS);
+
+ if (i_done != page_cnt) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_delalloc_release_space(inode,
+ (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+ }
+
+
+ set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
+ &cached_state, GFP_NOFS);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, &cached_state,
+ GFP_NOFS);
+
+ for (i = 0; i < i_done; i++) {
+ clear_page_dirty_for_io(pages[i]);
+ ClearPageChecked(pages[i]);
+ set_page_extent_mapped(pages[i]);
+ set_page_dirty(pages[i]);
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ return i_done;
+out:
+ for (i = 0; i < i_done; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+ return ret;
+
+}
+
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct file_ra_state *ra = NULL;
+ unsigned long last_index;
+ u64 isize = i_size_read(inode);
+ u64 last_len = 0;
+ u64 skip = 0;
+ u64 defrag_end = 0;
+ u64 newer_off = range->start;
+ unsigned long i;
+ unsigned long ra_index = 0;
+ int ret;
+ int defrag_count = 0;
+ int compress_type = BTRFS_COMPRESS_ZLIB;
+ u32 extent_thresh = range->extent_thresh;
+ unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long cluster = max_cluster;
+ u64 new_align = ~((u64)128 * 1024 - 1);
+ struct page **pages = NULL;
+
+ if (isize == 0)
+ return 0;
+
+ if (range->start >= isize)
+ return -EINVAL;
+
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+ if (range->compress_type > BTRFS_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
+
+ if (extent_thresh == 0)
+ extent_thresh = 256 * 1024;
+
+ /*
+ * if we were not given a file, allocate a readahead
+ * context
+ */
+ if (!file) {
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+ file_ra_state_init(ra, inode->i_mapping);
+ } else {
+ ra = &file->f_ra;
+ }
+
+ pages = kmalloc_array(max_cluster, sizeof(struct page *),
+ GFP_NOFS);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out_ra;
+ }
+
+ /* find the last page to defrag */
+ if (range->start + range->len > range->start) {
+ last_index = min_t(u64, isize - 1,
+ range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
+ } else {
+ last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ }
+
+ if (newer_than) {
+ ret = find_new_extents(root, inode, newer_than,
+ &newer_off, 64 * 1024);
+ if (!ret) {
+ range->start = newer_off;
+ /*
+ * we always align our defrag to help keep
+ * the extents in the file evenly spaced
+ */
+ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ } else
+ goto out_ra;
+ } else {
+ i = range->start >> PAGE_CACHE_SHIFT;
+ }
+ if (!max_to_defrag)
+ max_to_defrag = last_index + 1;
+
+ /*
+ * make writeback starts from i, so the defrag range can be
+ * written sequentially.
+ */
+ if (i < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = i;
+
+ while (i <= last_index && defrag_count < max_to_defrag &&
+ (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
+ /*
+ * make sure we stop running if someone unmounts
+ * the FS
+ */
+ if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ break;
+
+ if (btrfs_defrag_cancelled(root->fs_info)) {
+ printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+ ret = -EAGAIN;
+ break;
+ }
+
+ if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+ extent_thresh, &last_len, &skip,
+ &defrag_end, range->flags &
+ BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ unsigned long next;
+ /*
+ * the should_defrag function tells us how much to skip
+ * bump our counter by the suggested amount
+ */
+ next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
+ i = max(i + 1, next);
+ continue;
+ }
+
+ if (!newer_than) {
+ cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
+ PAGE_CACHE_SHIFT) - i;
+ cluster = min(cluster, max_cluster);
+ } else {
+ cluster = max_cluster;
+ }
+
+ if (i + cluster > ra_index) {
+ ra_index = max(i, ra_index);
+ btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
+ cluster);
+ ra_index += max_cluster;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+ BTRFS_I(inode)->force_compress = compress_type;
+ ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ if (ret < 0) {
+ mutex_unlock(&inode->i_mutex);
+ goto out_ra;
+ }
+
+ defrag_count += ret;
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ mutex_unlock(&inode->i_mutex);
+
+ if (newer_than) {
+ if (newer_off == (u64)-1)
+ break;
+
+ if (ret > 0)
+ i += ret;
+
+ newer_off = max(newer_off + 1,
+ (u64)i << PAGE_CACHE_SHIFT);
+
+ ret = find_new_extents(root, inode,
+ newer_than, &newer_off,
+ 64 * 1024);
+ if (!ret) {
+ range->start = newer_off;
+ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ } else {
+ break;
+ }
+ } else {
+ if (ret > 0) {
+ i += ret;
+ last_len += ret << PAGE_CACHE_SHIFT;
+ } else {
+ i++;
+ last_len = 0;
+ }
+ }
+ }
+
+ if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+
+ if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ /* the filemap_flush will queue IO into the worker threads, but
+ * we have to make sure the IO is actually started and that
+ * ordered extents get created before we return
+ */
+ atomic_inc(&root->fs_info->async_submit_draining);
+ while (atomic_read(&root->fs_info->nr_async_submits) ||
+ atomic_read(&root->fs_info->async_delalloc_pages)) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+ atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&root->fs_info->async_submit_draining);
+ }
+
+ if (range->compress_type == BTRFS_COMPRESS_LZO) {
+ btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
+ }
+
+ ret = defrag_count;
+
+out_ra:
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+ mutex_lock(&inode->i_mutex);
+ BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+ mutex_unlock(&inode->i_mutex);
+ }
+ if (!file)
+ kfree(ra);
+ kfree(pages);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_resize(struct file *file,
+ void __user *arg)
+{
+ u64 new_size;
+ u64 old_size;
+ u64 devid = 1;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_device *device = NULL;
+ char *sizestr;
+ char *retptr;
+ char *devstr = NULL;
+ int ret = 0;
+ int mod = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ mnt_drop_write_file(file);
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+ sizestr = vol_args->name;
+ devstr = strchr(sizestr, ':');
+ if (devstr) {
+ sizestr = devstr + 1;
+ *devstr = '\0';
+ devstr = vol_args->name;
+ ret = kstrtoull(devstr, 10, &devid);
+ if (ret)
+ goto out_free;
+ if (!devid) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+ btrfs_info(root->fs_info, "resizing devid %llu", devid);
+ }
+
+ device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
+ if (!device) {
+ btrfs_info(root->fs_info, "resizer unable to find device %llu",
+ devid);
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ if (!device->writeable) {
+ btrfs_info(root->fs_info,
+ "resizer unable to apply on readonly device %llu",
+ devid);
+ ret = -EPERM;
+ goto out_free;
+ }
+
+ if (!strcmp(sizestr, "max"))
+ new_size = device->bdev->bd_inode->i_size;
+ else {
+ if (sizestr[0] == '-') {
+ mod = -1;
+ sizestr++;
+ } else if (sizestr[0] == '+') {
+ mod = 1;
+ sizestr++;
+ }
+ new_size = memparse(sizestr, &retptr);
+ if (*retptr != '\0' || new_size == 0) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+ }
+
+ if (device->is_tgtdev_for_dev_replace) {
+ ret = -EPERM;
+ goto out_free;
+ }
+
+ old_size = btrfs_device_get_total_bytes(device);
+
+ if (mod < 0) {
+ if (new_size > old_size) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+ new_size = old_size - new_size;
+ } else if (mod > 0) {
+ if (new_size > ULLONG_MAX - old_size) {
+ ret = -ERANGE;
+ goto out_free;
+ }
+ new_size = old_size + new_size;
+ }
+
+ if (new_size < 256 * 1024 * 1024) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+ if (new_size > device->bdev->bd_inode->i_size) {
+ ret = -EFBIG;
+ goto out_free;
+ }
+
+ new_size = div_u64(new_size, root->sectorsize);
+ new_size *= root->sectorsize;
+
+ printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+ rcu_str_deref(device->name), new_size);
+
+ if (new_size > old_size) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_free;
+ }
+ ret = btrfs_grow_device(trans, device, new_size);
+ btrfs_commit_transaction(trans, root);
+ } else if (new_size < old_size) {
+ ret = btrfs_shrink_device(device, new_size);
+ } /* equal, nothing need to do */
+
+out_free:
+ kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
+ char *name, unsigned long fd, int subvol,
+ u64 *transid, bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ int namelen;
+ int ret = 0;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+
+ namelen = strlen(name);
+ if (strchr(name, '/')) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
+
+ if (name[0] == '.' &&
+ (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+ ret = -EEXIST;
+ goto out_drop_write;
+ }
+
+ if (subvol) {
+ ret = btrfs_mksubvol(&file->f_path, name, namelen,
+ NULL, transid, readonly, inherit);
+ } else {
+ struct fd src = fdget(fd);
+ struct inode *src_inode;
+ if (!src.file) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
+
+ src_inode = file_inode(src.file);
+ if (src_inode->i_sb != file_inode(file)->i_sb) {
+ btrfs_info(BTRFS_I(src_inode)->root->fs_info,
+ "Snapshot src from another FS");
+ ret = -EXDEV;
+ } else if (!inode_owner_or_capable(src_inode)) {
+ /*
+ * Subvolume creation is not restricted, but snapshots
+ * are limited to own subvolumes only
+ */
+ ret = -EPERM;
+ } else {
+ ret = btrfs_mksubvol(&file->f_path, name, namelen,
+ BTRFS_I(src_inode)->root,
+ transid, readonly, inherit);
+ }
+ fdput(src);
+ }
+out_drop_write:
+ mnt_drop_write_file(file);
+out:
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+ void __user *arg, int subvol)
+{
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+ ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+ vol_args->fd, subvol,
+ NULL, false, NULL);
+
+ kfree(vol_args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
+ void __user *arg, int subvol)
+{
+ struct btrfs_ioctl_vol_args_v2 *vol_args;
+ int ret;
+ u64 transid = 0;
+ u64 *ptr = NULL;
+ bool readonly = false;
+ struct btrfs_qgroup_inherit *inherit = NULL;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+
+ if (vol_args->flags &
+ ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
+ BTRFS_SUBVOL_QGROUP_INHERIT)) {
+ ret = -EOPNOTSUPP;
+ goto free_args;
+ }
+
+ if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+ ptr = &transid;
+ if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
+ readonly = true;
+ if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
+ if (vol_args->size > PAGE_CACHE_SIZE) {
+ ret = -EINVAL;
+ goto free_args;
+ }
+ inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
+ if (IS_ERR(inherit)) {
+ ret = PTR_ERR(inherit);
+ goto free_args;
+ }
+ }
+
+ ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+ vol_args->fd, subvol, ptr,
+ readonly, inherit);
+ if (ret)
+ goto free_inherit;
+
+ if (ptr && copy_to_user(arg +
+ offsetof(struct btrfs_ioctl_vol_args_v2,
+ transid),
+ ptr, sizeof(*ptr)))
+ ret = -EFAULT;
+
+free_inherit:
+ kfree(inherit);
+free_args:
+ kfree(vol_args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+ void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+ u64 flags = 0;
+
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
+ return -EINVAL;
+
+ down_read(&root->fs_info->subvol_sem);
+ if (btrfs_root_readonly(root))
+ flags |= BTRFS_SUBVOL_RDONLY;
+ up_read(&root->fs_info->subvol_sem);
+
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
+ void __user *arg)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ u64 root_flags;
+ u64 flags;
+ int ret = 0;
+
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
+
+ if (copy_from_user(&flags, arg, sizeof(flags))) {
+ ret = -EFAULT;
+ goto out_drop_write;
+ }
+
+ if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
+
+ if (flags & ~BTRFS_SUBVOL_RDONLY) {
+ ret = -EOPNOTSUPP;
+ goto out_drop_write;
+ }
+
+ down_write(&root->fs_info->subvol_sem);
+
+ /* nothing to do */
+ if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
+ goto out_drop_sem;
+
+ root_flags = btrfs_root_flags(&root->root_item);
+ if (flags & BTRFS_SUBVOL_RDONLY) {
+ btrfs_set_root_flags(&root->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
+ } else {
+ /*
+ * Block RO -> RW transition if this subvolume is involved in
+ * send
+ */
+ spin_lock(&root->root_item_lock);
+ if (root->send_in_progress == 0) {
+ btrfs_set_root_flags(&root->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
+ spin_unlock(&root->root_item_lock);
+ } else {
+ spin_unlock(&root->root_item_lock);
+ btrfs_warn(root->fs_info,
+ "Attempt to set subvolume %llu read-write during send",
+ root->root_key.objectid);
+ ret = -EPERM;
+ goto out_drop_sem;
+ }
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_reset;
+ }
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+
+ btrfs_commit_transaction(trans, root);
+out_reset:
+ if (ret)
+ btrfs_set_root_flags(&root->root_item, root_flags);
+out_drop_sem:
+ up_write(&root->fs_info->subvol_sem);
+out_drop_write:
+ mnt_drop_write_file(file);
+out:
+ return ret;
+}
+
+/*
+ * helper to check if the subvolume references other subvolumes
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 dir_id;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Make sure this root isn't set as the default subvol */
+ dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+ di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
+ dir_id, "default", 7, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.objectid == root->root_key.objectid) {
+ ret = -EPERM;
+ btrfs_err(root->fs_info, "deleting default subvolume "
+ "%llu is not allowed", key.objectid);
+ goto out;
+ }
+ btrfs_release_path(path);
+ }
+
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = 0;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == root->root_key.objectid &&
+ key.type == BTRFS_ROOT_REF_KEY)
+ ret = -ENOTEMPTY;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int key_in_sk(struct btrfs_key *key,
+ struct btrfs_ioctl_search_key *sk)
+{
+ struct btrfs_key test;
+ int ret;
+
+ test.objectid = sk->min_objectid;
+ test.type = sk->min_type;
+ test.offset = sk->min_offset;
+
+ ret = btrfs_comp_cpu_keys(key, &test);
+ if (ret < 0)
+ return 0;
+
+ test.objectid = sk->max_objectid;
+ test.type = sk->max_type;
+ test.offset = sk->max_offset;
+
+ ret = btrfs_comp_cpu_keys(key, &test);
+ if (ret > 0)
+ return 0;
+ return 1;
+}
+
+static noinline int copy_to_sk(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct btrfs_ioctl_search_key *sk,
+ size_t *buf_size,
+ char __user *ubuf,
+ unsigned long *sk_offset,
+ int *num_found)
+{
+ u64 found_transid;
+ struct extent_buffer *leaf;
+ struct btrfs_ioctl_search_header sh;
+ unsigned long item_off;
+ unsigned long item_len;
+ int nritems;
+ int i;
+ int slot;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ if (btrfs_header_generation(leaf) > sk->max_transid) {
+ i = nritems;
+ goto advance_key;
+ }
+ found_transid = btrfs_header_generation(leaf);
+
+ for (i = slot; i < nritems; i++) {
+ item_off = btrfs_item_ptr_offset(leaf, i);
+ item_len = btrfs_item_size_nr(leaf, i);
+
+ btrfs_item_key_to_cpu(leaf, key, i);
+ if (!key_in_sk(key, sk))
+ continue;
+
+ if (sizeof(sh) + item_len > *buf_size) {
+ if (*num_found) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * return one empty item back for v1, which does not
+ * handle -EOVERFLOW
+ */
+
+ *buf_size = sizeof(sh) + item_len;
+ item_len = 0;
+ ret = -EOVERFLOW;
+ }
+
+ if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
+ ret = 1;
+ goto out;
+ }
+
+ sh.objectid = key->objectid;
+ sh.offset = key->offset;
+ sh.type = key->type;
+ sh.len = item_len;
+ sh.transid = found_transid;
+
+ /* copy search result header */
+ if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ *sk_offset += sizeof(sh);
+
+ if (item_len) {
+ char __user *up = ubuf + *sk_offset;
+ /* copy the item */
+ if (read_extent_buffer_to_user(leaf, up,
+ item_off, item_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ *sk_offset += item_len;
+ }
+ (*num_found)++;
+
+ if (ret) /* -EOVERFLOW from above */
+ goto out;
+
+ if (*num_found >= sk->nr_items) {
+ ret = 1;
+ goto out;
+ }
+ }
+advance_key:
+ ret = 0;
+ if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+ key->offset++;
+ else if (key->type < (u8)-1 && key->type < sk->max_type) {
+ key->offset = 0;
+ key->type++;
+ } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+ key->offset = 0;
+ key->type = 0;
+ key->objectid++;
+ } else
+ ret = 1;
+out:
+ /*
+ * 0: all items from this leaf copied, continue with next
+ * 1: * more items can be copied, but unused buffer is too small
+ * * all items were found
+ * Either way, it will stops the loop which iterates to the next
+ * leaf
+ * -EOVERFLOW: item was to large for buffer
+ * -EFAULT: could not copy extent buffer back to userspace
+ */
+ return ret;
+}
+
+static noinline int search_ioctl(struct inode *inode,
+ struct btrfs_ioctl_search_key *sk,
+ size_t *buf_size,
+ char __user *ubuf)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
+ int ret;
+ int num_found = 0;
+ unsigned long sk_offset = 0;
+
+ if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
+ *buf_size = sizeof(struct btrfs_ioctl_search_header);
+ return -EOVERFLOW;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (sk->tree_id == 0) {
+ /* search the root of the inode that was passed */
+ root = BTRFS_I(inode)->root;
+ } else {
+ key.objectid = sk->tree_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(info, &key);
+ if (IS_ERR(root)) {
+ printk(KERN_ERR "BTRFS: could not find root %llu\n",
+ sk->tree_id);
+ btrfs_free_path(path);
+ return -ENOENT;
+ }
+ }
+
+ key.objectid = sk->min_objectid;
+ key.type = sk->min_type;
+ key.offset = sk->min_offset;
+
+ while (1) {
+ ret = btrfs_search_forward(root, &key, path, sk->min_transid);
+ if (ret != 0) {
+ if (ret > 0)
+ ret = 0;
+ goto err;
+ }
+ ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
+ &sk_offset, &num_found);
+ btrfs_release_path(path);
+ if (ret)
+ break;
+
+ }
+ if (ret > 0)
+ ret = 0;
+err:
+ sk->nr_items = num_found;
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search(struct file *file,
+ void __user *argp)
+{
+ struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_key sk;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ uargs = (struct btrfs_ioctl_search_args __user *)argp;
+
+ if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
+ return -EFAULT;
+
+ buf_size = sizeof(uargs->buf);
+
+ inode = file_inode(file);
+ ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+
+ /*
+ * In the origin implementation an overflow is handled by returning a
+ * search header with a len of zero, so reset ret.
+ */
+ if (ret == -EOVERFLOW)
+ ret = 0;
+
+ if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
+ ret = -EFAULT;
+ return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+ void __user *argp)
+{
+ struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 args;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
+ const size_t buf_limit = 16 * 1024 * 1024;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* copy search header and buffer size */
+ uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
+ if (copy_from_user(&args, uarg, sizeof(args)))
+ return -EFAULT;
+
+ buf_size = args.buf_size;
+
+ if (buf_size < sizeof(struct btrfs_ioctl_search_header))
+ return -EOVERFLOW;
+
+ /* limit result size to 16MB */
+ if (buf_size > buf_limit)
+ buf_size = buf_limit;
+
+ inode = file_inode(file);
+ ret = search_ioctl(inode, &args.key, &buf_size,
+ (char *)(&uarg->buf[0]));
+ if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
+ ret = -EFAULT;
+ else if (ret == -EOVERFLOW &&
+ copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+/*
+ * Search INODE_REFs to identify path name of 'dirid' directory
+ * in a 'tree_id' tree. and sets path name to 'name'.
+ */
+static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
+ u64 tree_id, u64 dirid, char *name)
+{
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ char *ptr;
+ int ret = -1;
+ int slot;
+ int len;
+ int total_len = 0;
+ struct btrfs_inode_ref *iref;
+ struct extent_buffer *l;
+ struct btrfs_path *path;
+
+ if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
+ name[0]='\0';
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
+
+ key.objectid = tree_id;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(info, &key);
+ if (IS_ERR(root)) {
+ printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) {
+ ret = btrfs_previous_item(root, path, dirid,
+ BTRFS_INODE_REF_KEY);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(l, iref);
+ ptr -= len + 1;
+ total_len += len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+
+ *(ptr + len) = '/';
+ read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
+
+ if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
+ break;
+
+ btrfs_release_path(path);
+ key.objectid = key.offset;
+ key.offset = (u64)-1;
+ dirid = key.objectid;
+ }
+ memmove(name, ptr, total_len);
+ name[total_len] = '\0';
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+ void __user *argp)
+{
+ struct btrfs_ioctl_ino_lookup_args *args;
+ struct inode *inode;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ args = memdup_user(argp, sizeof(*args));
+ if (IS_ERR(args))
+ return PTR_ERR(args);
+
+ inode = file_inode(file);
+
+ if (args->treeid == 0)
+ args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+
+ ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+ args->treeid, args->objectid,
+ args->name);
+
+ if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = -EFAULT;
+
+ kfree(args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_destroy(struct file *file,
+ void __user *arg)
+{
+ struct dentry *parent = file->f_path.dentry;
+ struct dentry *dentry;
+ struct inode *dir = d_inode(parent);
+ struct inode *inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *dest = NULL;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_rsv block_rsv;
+ u64 root_flags;
+ u64 qgroup_reserved;
+ int namelen;
+ int ret;
+ int err = 0;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ namelen = strlen(vol_args->name);
+ if (strchr(vol_args->name, '/') ||
+ strncmp(vol_args->name, "..", namelen) == 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+
+
+ err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ if (err == -EINTR)
+ goto out_drop_write;
+ dentry = lookup_one_len(vol_args->name, parent, namelen);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_unlock_dir;
+ }
+
+ if (d_really_is_negative(dentry)) {
+ err = -ENOENT;
+ goto out_dput;
+ }
+
+ inode = d_inode(dentry);
+ dest = BTRFS_I(inode)->root;
+ if (!capable(CAP_SYS_ADMIN)) {
+ /*
+ * Regular user. Only allow this with a special mount
+ * option, when the user has write+exec access to the
+ * subvol root, and when rmdir(2) would have been
+ * allowed.
+ *
+ * Note that this is _not_ check that the subvol is
+ * empty or doesn't contain data that we wouldn't
+ * otherwise be able to delete.
+ *
+ * Users who want to delete empty subvols should try
+ * rmdir(2).
+ */
+ err = -EPERM;
+ if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+ goto out_dput;
+
+ /*
+ * Do not allow deletion if the parent dir is the same
+ * as the dir to be deleted. That means the ioctl
+ * must be called on the dentry referencing the root
+ * of the subvol, not a random directory contained
+ * within it.
+ */
+ err = -EINVAL;
+ if (root == dest)
+ goto out_dput;
+
+ err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+ if (err)
+ goto out_dput;
+ }
+
+ /* check if subvolume may be deleted by a user */
+ err = btrfs_may_delete(dir, dentry, 1);
+ if (err)
+ goto out_dput;
+
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out_dput;
+ }
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * Don't allow to delete a subvolume with send in progress. This is
+ * inside the i_mutex so the error handling that has to drop the bit
+ * again is not run concurrently.
+ */
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ if (dest->send_in_progress == 0) {
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ } else {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(root->fs_info,
+ "Attempt to delete subvolume %llu during send",
+ dest->root_key.objectid);
+ err = -EPERM;
+ goto out_unlock_inode;
+ }
+
+ down_write(&root->fs_info->subvol_sem);
+
+ err = may_destroy_subvol(dest);
+ if (err)
+ goto out_up_write;
+
+ btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+ /*
+ * One for dir inode, two for dir entries, two for root
+ * ref/backref.
+ */
+ err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ 5, &qgroup_reserved, true);
+ if (err)
+ goto out_up_write;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_release;
+ }
+ trans->block_rsv = &block_rsv;
+ trans->bytes_reserved = block_rsv.size;
+
+ ret = btrfs_unlink_subvol(trans, root, dir,
+ dest->root_key.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ if (ret) {
+ err = ret;
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_end_trans;
+ }
+
+ btrfs_record_root_in_trans(trans, dest);
+
+ memset(&dest->root_item.drop_progress, 0,
+ sizeof(dest->root_item.drop_progress));
+ dest->root_item.drop_level = 0;
+ btrfs_set_root_refs(&dest->root_item, 0);
+
+ if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
+ ret = btrfs_insert_orphan_item(trans,
+ root->fs_info->tree_root,
+ dest->root_key.objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ }
+
+ ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+ dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+ dest->root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ dest->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, root, ret);
+ err = ret;
+ goto out_end_trans;
+ }
+ }
+
+out_end_trans:
+ trans->block_rsv = NULL;
+ trans->bytes_reserved = 0;
+ ret = btrfs_end_transaction(trans, root);
+ if (ret && !err)
+ err = ret;
+ inode->i_flags |= S_DEAD;
+out_release:
+ btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
+out_up_write:
+ up_write(&root->fs_info->subvol_sem);
+ if (err) {
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ }
+out_unlock_inode:
+ mutex_unlock(&inode->i_mutex);
+ if (!err) {
+ d_invalidate(dentry);
+ btrfs_invalidate_inodes(dest);
+ d_delete(dentry);
+ ASSERT(dest->send_in_progress == 0);
+
+ /* the last ref */
+ if (dest->ino_cache_inode) {
+ iput(dest->ino_cache_inode);
+ dest->ino_cache_inode = NULL;
+ }
+ }
+out_dput:
+ dput(dentry);
+out_unlock_dir:
+ mutex_unlock(&dir->i_mutex);
+out_drop_write:
+ mnt_drop_write_file(file);
+out:
+ kfree(vol_args);
+ return err;
+}
+
+static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ioctl_defrag_range_args *range;
+ int ret;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
+ }
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFDIR:
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
+ }
+ ret = btrfs_defrag_root(root);
+ if (ret)
+ goto out;
+ ret = btrfs_defrag_root(root->fs_info->extent_root);
+ break;
+ case S_IFREG:
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ range = kzalloc(sizeof(*range), GFP_KERNEL);
+ if (!range) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (argp) {
+ if (copy_from_user(range, argp,
+ sizeof(*range))) {
+ ret = -EFAULT;
+ kfree(range);
+ goto out;
+ }
+ /* compression requires us to start the IO */
+ if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
+ range->extent_thresh = (u32)-1;
+ }
+ } else {
+ /* the rest are all set to zero by kzalloc */
+ range->len = (u64)-1;
+ }
+ ret = btrfs_defrag_file(file_inode(file), file,
+ range, 0, 0);
+ if (ret > 0)
+ ret = 0;
+ kfree(range);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+out:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_init_new_device(root, vol_args->name);
+
+ if (!ret)
+ btrfs_info(root->fs_info, "disk added %s",vol_args->name);
+
+ kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ return ret;
+}
+
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto err_drop;
+ }
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+ if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out;
+ }
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ ret = btrfs_rm_device(root, vol_args->name);
+ mutex_unlock(&root->fs_info->volume_mutex);
+ atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+ if (!ret)
+ btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+
+out:
+ kfree(vol_args);
+err_drop:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_fs_info_args *fi_args;
+ struct btrfs_device *device;
+ struct btrfs_device *next;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ int ret = 0;
+
+ fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
+ if (!fi_args)
+ return -ENOMEM;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ fi_args->num_devices = fs_devices->num_devices;
+ memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
+
+ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+ if (device->devid > fi_args->max_id)
+ fi_args->max_id = device->devid;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ fi_args->nodesize = root->fs_info->super_copy->nodesize;
+ fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
+ fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
+
+ if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
+ ret = -EFAULT;
+
+ kfree(fi_args);
+ return ret;
+}
+
+static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_dev_info_args *di_args;
+ struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ int ret = 0;
+ char *s_uuid = NULL;
+
+ di_args = memdup_user(arg, sizeof(*di_args));
+ if (IS_ERR(di_args))
+ return PTR_ERR(di_args);
+
+ if (!btrfs_is_empty_uuid(di_args->uuid))
+ s_uuid = di_args->uuid;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
+
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ di_args->devid = dev->devid;
+ di_args->bytes_used = btrfs_device_get_bytes_used(dev);
+ di_args->total_bytes = btrfs_device_get_total_bytes(dev);
+ memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
+ if (dev->name) {
+ struct rcu_string *name;
+
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ strncpy(di_args->path, name->str, sizeof(di_args->path));
+ rcu_read_unlock();
+ di_args->path[sizeof(di_args->path) - 1] = 0;
+ } else {
+ di_args->path[0] = '\0';
+ }
+
+out:
+ mutex_unlock(&fs_devices->device_list_mutex);
+ if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
+ ret = -EFAULT;
+
+ kfree(di_args);
+ return ret;
+}
+
+static struct page *extent_same_get_page(struct inode *inode, u64 off)
+{
+ struct page *page;
+ pgoff_t index;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+
+ index = off >> PAGE_CACHE_SHIFT;
+
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page)
+ return NULL;
+
+ if (!PageUptodate(page)) {
+ if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
+ 0))
+ return NULL;
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ return NULL;
+ }
+ }
+ unlock_page(page);
+
+ return page;
+}
+
+static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+{
+ /* do any pending delalloc/csum calc on src, one way or
+ another, and lock file content */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ off + len - 1);
+ if ((!ordered ||
+ ordered->file_offset + ordered->len <= off ||
+ ordered->file_offset >= off + len) &&
+ !test_range_bit(&BTRFS_I(inode)->io_tree, off,
+ off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ btrfs_wait_ordered_range(inode, off, len);
+ }
+}
+
+static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+
+ mutex_unlock(&inode1->i_mutex);
+ mutex_unlock(&inode2->i_mutex);
+}
+
+static void btrfs_double_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ if (inode1 < inode2) {
+ swap(inode1, inode2);
+ swap(loff1, loff2);
+ }
+
+ mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+ lock_extent_range(inode1, loff1, len);
+ if (inode1 != inode2) {
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ lock_extent_range(inode2, loff2, len);
+ }
+}
+
+static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
+ u64 dst_loff, u64 len)
+{
+ int ret = 0;
+ struct page *src_page, *dst_page;
+ unsigned int cmp_len = PAGE_CACHE_SIZE;
+ void *addr, *dst_addr;
+
+ while (len) {
+ if (len < PAGE_CACHE_SIZE)
+ cmp_len = len;
+
+ src_page = extent_same_get_page(src, loff);
+ if (!src_page)
+ return -EINVAL;
+ dst_page = extent_same_get_page(dst, dst_loff);
+ if (!dst_page) {
+ page_cache_release(src_page);
+ return -EINVAL;
+ }
+ addr = kmap_atomic(src_page);
+ dst_addr = kmap_atomic(dst_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dst_page);
+
+ if (memcmp(addr, dst_addr, cmp_len))
+ ret = BTRFS_SAME_DATA_DIFFERS;
+
+ kunmap_atomic(addr);
+ kunmap_atomic(dst_addr);
+ page_cache_release(src_page);
+ page_cache_release(dst_page);
+
+ if (ret)
+ break;
+
+ loff += cmp_len;
+ dst_loff += cmp_len;
+ len -= cmp_len;
+ }
+
+ return ret;
+}
+
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+{
+ u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
+
+ if (off + len > inode->i_size || off + len < off)
+ return -EINVAL;
+ /* Check that we are block aligned - btrfs_clone() requires this */
+ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+ struct inode *dst, u64 dst_loff)
+{
+ int ret;
+
+ /*
+ * btrfs_clone() can't handle extents in the same file
+ * yet. Once that works, we can drop this check and replace it
+ * with a check for the same inode, but overlapping extents.
+ */
+ if (src == dst)
+ return -EINVAL;
+
+ if (len == 0)
+ return 0;
+
+ btrfs_double_lock(src, loff, dst, dst_loff, len);
+
+ ret = extent_same_check_offsets(src, loff, len);
+ if (ret)
+ goto out_unlock;
+
+ ret = extent_same_check_offsets(dst, dst_loff, len);
+ if (ret)
+ goto out_unlock;
+
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+ if (ret == 0)
+ ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+
+out_unlock:
+ btrfs_double_unlock(src, loff, dst, dst_loff, len);
+
+ return ret;
+}
+
+#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+
+static long btrfs_ioctl_file_extent_same(struct file *file,
+ struct btrfs_ioctl_same_args __user *argp)
+{
+ struct btrfs_ioctl_same_args *same = NULL;
+ struct btrfs_ioctl_same_extent_info *info;
+ struct inode *src = file_inode(file);
+ u64 off;
+ u64 len;
+ int i;
+ int ret;
+ unsigned long size;
+ u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+ bool is_admin = capable(CAP_SYS_ADMIN);
+ u16 count;
+
+ if (!(file->f_mode & FMODE_READ))
+ return -EINVAL;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (get_user(count, &argp->dest_count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
+
+ same = memdup_user(argp, size);
+
+ if (IS_ERR(same)) {
+ ret = PTR_ERR(same);
+ same = NULL;
+ goto out;
+ }
+
+ off = same->logical_offset;
+ len = same->length;
+
+ /*
+ * Limit the total length we will dedupe for each operation.
+ * This is intended to bound the total time spent in this
+ * ioctl to something sane.
+ */
+ if (len > BTRFS_MAX_DEDUPE_LEN)
+ len = BTRFS_MAX_DEDUPE_LEN;
+
+ if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
+ /*
+ * Btrfs does not support blocksize < page_size. As a
+ * result, btrfs_cmp_data() won't correctly handle
+ * this situation without an update.
+ */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = -EISDIR;
+ if (S_ISDIR(src->i_mode))
+ goto out;
+
+ ret = -EACCES;
+ if (!S_ISREG(src->i_mode))
+ goto out;
+
+ /* pre-format output fields to sane values */
+ for (i = 0; i < count; i++) {
+ same->info[i].bytes_deduped = 0ULL;
+ same->info[i].status = 0;
+ }
+
+ for (i = 0, info = same->info; i < count; i++, info++) {
+ struct inode *dst;
+ struct fd dst_file = fdget(info->fd);
+ if (!dst_file.file) {
+ info->status = -EBADF;
+ continue;
+ }
+ dst = file_inode(dst_file.file);
+
+ if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
+ info->status = -EINVAL;
+ } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
+ info->status = -EXDEV;
+ } else if (S_ISDIR(dst->i_mode)) {
+ info->status = -EISDIR;
+ } else if (!S_ISREG(dst->i_mode)) {
+ info->status = -EACCES;
+ } else {
+ info->status = btrfs_extent_same(src, off, len, dst,
+ info->logical_offset);
+ if (info->status == 0)
+ info->bytes_deduped += len;
+ }
+ fdput(dst_file);
+ }
+
+ ret = copy_to_user(argp, same, size);
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ mnt_drop_write_file(file);
+ kfree(same);
+ return ret;
+}
+
+/* Helper to check and see if this root currently has a ref on the given disk
+ * bytenr. If it does then we need to update the quota for this root. This
+ * doesn't do anything if quotas aren't enabled.
+ */
+static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 disko)
+{
+ struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+ struct ulist *roots;
+ struct ulist_iterator uiter;
+ struct ulist_node *root_node = NULL;
+ int ret;
+
+ if (!root->fs_info->quota_enabled)
+ return 1;
+
+ btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+ ret = btrfs_find_all_roots(trans, root->fs_info, disko,
+ tree_mod_seq_elem.seq, &roots);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ ULIST_ITER_INIT(&uiter);
+ while ((root_node = ulist_next(roots, &uiter))) {
+ if (root_node->val == root->objectid) {
+ ret = 1;
+ break;
+ }
+ }
+ ulist_free(roots);
+out:
+ btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+ return ret;
+}
+
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ u64 endoff,
+ const u64 destoff,
+ const u64 olen)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ /*
+ * We round up to the block size at eof when determining which
+ * extents to clone above, but shouldn't round up the file size.
+ */
+ if (endoff > destoff + olen)
+ endoff = destoff + olen;
+ if (endoff > inode->i_size)
+ btrfs_i_size_write(inode, endoff);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans, root);
+out:
+ return ret;
+}
+
+static void clone_update_extent_map(struct inode *inode,
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_path *path,
+ const u64 hole_offset,
+ const u64 hole_len)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ return;
+ }
+
+ if (path) {
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
+ em->generation = -1;
+ if (btrfs_file_extent_type(path->nodes[0], fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ } else {
+ em->start = hole_offset;
+ em->len = hole_len;
+ em->ram_bytes = em->len;
+ em->orig_start = hole_offset;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->block_len = 0;
+ em->orig_block_len = 0;
+ em->compress_type = BTRFS_COMPRESS_NONE;
+ em->generation = trans->transid;
+ }
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ }
+
+ if (ret)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+}
+
+/**
+ * btrfs_clone() - clone a range from inode file to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen, extent_same uses
+ * identical values here
+ * @destoff: Offset within @inode to start clone
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+ const u64 off, const u64 olen, const u64 olen_aligned,
+ const u64 destoff)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *leaf;
+ struct btrfs_trans_handle *trans;
+ char *buf = NULL;
+ struct btrfs_key key;
+ u32 nritems;
+ int slot;
+ int ret;
+ int no_quota;
+ const u64 len = olen_aligned;
+ u64 last_disko = 0;
+ u64 last_dest_end = destoff;
+
+ ret = -ENOMEM;
+ buf = vmalloc(root->nodesize);
+ if (!buf)
+ return ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ vfree(buf);
+ return ret;
+ }
+
+ path->reada = 2;
+ /* clone data */
+ key.objectid = btrfs_ino(src);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = off;
+
+ while (1) {
+ u64 next_key_min_offset = key.offset + 1;
+
+ /*
+ * note the key will change type as we walk through the
+ * tree.
+ */
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ goto out;
+ /*
+ * First search, if no extent item that starts at offset off was
+ * found but the previous item is an extent item, it's possible
+ * it might overlap our target range, therefore process it.
+ */
+ if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0] - 1);
+ if (key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+ no_quota = 1;
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ }
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.type > BTRFS_EXTENT_DATA_KEY ||
+ key.objectid != btrfs_ino(src))
+ break;
+
+ if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ struct btrfs_file_extent_item *extent;
+ int type;
+ u32 size;
+ struct btrfs_key new_key;
+ u64 disko = 0, diskl = 0;
+ u64 datao = 0, datal = 0;
+ u8 comp;
+ u64 drop_start;
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ comp = btrfs_file_extent_compression(leaf, extent);
+ type = btrfs_file_extent_type(leaf, extent);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disko = btrfs_file_extent_disk_bytenr(leaf,
+ extent);
+ diskl = btrfs_file_extent_disk_num_bytes(leaf,
+ extent);
+ datao = btrfs_file_extent_offset(leaf, extent);
+ datal = btrfs_file_extent_num_bytes(leaf,
+ extent);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /* take upper bound, may be compressed */
+ datal = btrfs_file_extent_ram_bytes(leaf,
+ extent);
+ }
+
+ /*
+ * The first search might have left us at an extent
+ * item that ends before our target range's start, can
+ * happen if we have holes and NO_HOLES feature enabled.
+ */
+ if (key.offset + datal <= off) {
+ path->slots[0]++;
+ goto process_slot;
+ } else if (key.offset >= off + len) {
+ break;
+ }
+ next_key_min_offset = key.offset + datal;
+ size = btrfs_item_size_nr(leaf, slot);
+ read_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ btrfs_release_path(path);
+ path->leave_spinning = 0;
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.objectid = btrfs_ino(inode);
+ if (off <= key.offset)
+ new_key.offset = key.offset + destoff - off;
+ else
+ new_key.offset = destoff;
+
+ /*
+ * Deal with a hole that doesn't have an extent item
+ * that represents it (NO_HOLES feature enabled).
+ * This hole is either in the middle of the cloning
+ * range or at the beginning (fully overlaps it or
+ * partially overlaps it).
+ */
+ if (new_key.offset != last_dest_end)
+ drop_start = last_dest_end;
+ else
+ drop_start = new_key.offset;
+
+ /*
+ * 1 - adjusting old extent (we may have to split it)
+ * 1 - add new extent
+ * 1 - inode update
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ /*
+ * a | --- range to clone ---| b
+ * | ------------- extent ------------- |
+ */
+
+ /* subtract range b */
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ /* subtract range a */
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+
+ ret = btrfs_drop_extents(trans, root, inode,
+ drop_start,
+ new_key.offset + datal,
+ 1);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans,
+ root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path,
+ &new_key, size);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+
+ /* disko == 0 means it's a hole */
+ if (!disko)
+ datao = 0;
+
+ btrfs_set_file_extent_offset(leaf, extent,
+ datao);
+ btrfs_set_file_extent_num_bytes(leaf, extent,
+ datal);
+
+ /*
+ * We need to look up the roots that point at
+ * this bytenr and see if the new root does. If
+ * it does not we need to make sure we update
+ * quotas appropriately.
+ */
+ if (disko && root != BTRFS_I(src)->root &&
+ disko != last_disko) {
+ no_quota = check_ref(trans, root,
+ disko);
+ if (no_quota < 0) {
+ btrfs_abort_transaction(trans,
+ root,
+ ret);
+ btrfs_end_transaction(trans,
+ root);
+ ret = no_quota;
+ goto out;
+ }
+ }
+
+ if (disko) {
+ inode_add_bytes(inode, datal);
+ ret = btrfs_inc_extent_ref(trans, root,
+ disko, diskl, 0,
+ root->root_key.objectid,
+ btrfs_ino(inode),
+ new_key.offset - datao,
+ no_quota);
+ if (ret) {
+ btrfs_abort_transaction(trans,
+ root,
+ ret);
+ btrfs_end_transaction(trans,
+ root);
+ goto out;
+
+ }
+ }
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 skip = 0;
+ u64 trim = 0;
+ u64 aligned_end = 0;
+
+ /*
+ * Don't copy an inline extent into an offset
+ * greater than zero. Having an inline extent
+ * at such an offset results in chaos as btrfs
+ * isn't prepared for such cases. Just skip
+ * this case for the same reasons as commented
+ * at btrfs_ioctl_clone().
+ */
+ if (last_dest_end > 0) {
+ ret = -EOPNOTSUPP;
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+
+ if (off > key.offset) {
+ skip = off - key.offset;
+ new_key.offset += skip;
+ }
+
+ if (key.offset + datal > off + len)
+ trim = key.offset + datal - (off + len);
+
+ if (comp && (skip || trim)) {
+ ret = -EINVAL;
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ size -= skip + trim;
+ datal -= skip + trim;
+
+ aligned_end = ALIGN(new_key.offset + datal,
+ root->sectorsize);
+ ret = btrfs_drop_extents(trans, root, inode,
+ drop_start,
+ aligned_end,
+ 1);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans,
+ root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+
+ ret = btrfs_insert_empty_item(trans, root, path,
+ &new_key, size);
+ if (ret) {
+ btrfs_abort_transaction(trans, root,
+ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+
+ if (skip) {
+ u32 start =
+ btrfs_file_extent_calc_inline_size(0);
+ memmove(buf+start, buf+start+skip,
+ datal);
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+ inode_add_bytes(inode, datal);
+ }
+
+ /* If we have an implicit hole (NO_HOLES feature). */
+ if (drop_start < new_key.offset)
+ clone_update_extent_map(inode, trans,
+ NULL, drop_start,
+ new_key.offset - drop_start);
+
+ clone_update_extent_map(inode, trans, path, 0, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ last_dest_end = ALIGN(new_key.offset + datal,
+ root->sectorsize);
+ ret = clone_finish_inode_update(trans, inode,
+ last_dest_end,
+ destoff, olen);
+ if (ret)
+ goto out;
+ if (new_key.offset + datal >= destoff + len)
+ break;
+ }
+ btrfs_release_path(path);
+ key.offset = next_key_min_offset;
+ }
+ ret = 0;
+
+ if (last_dest_end < destoff + len) {
+ /*
+ * We have an implicit hole (NO_HOLES feature is enabled) that
+ * fully or partially overlaps our cloning range at its end.
+ */
+ btrfs_release_path(path);
+
+ /*
+ * 1 - remove extent(s)
+ * 1 - inode update
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_drop_extents(trans, root, inode,
+ last_dest_end, destoff + len, 1);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ clone_update_extent_map(inode, trans, NULL, last_dest_end,
+ destoff + len - last_dest_end);
+ ret = clone_finish_inode_update(trans, inode, destoff + len,
+ destoff, olen);
+ }
+
+out:
+ btrfs_free_path(path);
+ vfree(buf);
+ return ret;
+}
+
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct fd src_file;
+ struct inode *src;
+ int ret;
+ u64 len = olen;
+ u64 bs = root->fs_info->sb->s_blocksize;
+ int same_inode = 0;
+
+ /*
+ * TODO:
+ * - split compressed inline extents. annoying: we need to
+ * decompress into destination's address_space (the file offset
+ * may change, so source mapping won't do), then recompress (or
+ * otherwise reinsert) a subrange.
+ *
+ * - split destination inode's inline extents. The inline extents can
+ * be either compressed or non-compressed.
+ */
+
+ /* the destination must be opened for writing */
+ if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
+ return -EINVAL;
+
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
+ ret = -EBADF;
+ goto out_drop_write;
+ }
+
+ ret = -EXDEV;
+ if (src_file.file->f_path.mnt != file->f_path.mnt)
+ goto out_fput;
+
+ src = file_inode(src_file.file);
+
+ ret = -EINVAL;
+ if (src == inode)
+ same_inode = 1;
+
+ /* the src must be open for reading */
+ if (!(src_file.file->f_mode & FMODE_READ))
+ goto out_fput;
+
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+ goto out_fput;
+
+ ret = -EISDIR;
+ if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
+ goto out_fput;
+
+ ret = -EXDEV;
+ if (src->i_sb != inode->i_sb)
+ goto out_fput;
+
+ if (!same_inode) {
+ if (inode < src) {
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ }
+ } else {
+ mutex_lock(&src->i_mutex);
+ }
+
+ /* determine range to clone */
+ ret = -EINVAL;
+ if (off + len > src->i_size || off + len < off)
+ goto out_unlock;
+ if (len == 0)
+ olen = len = src->i_size - off;
+ /* if we extend to eof, continue to block boundary */
+ if (off + len == src->i_size)
+ len = ALIGN(src->i_size, bs) - off;
+
+ if (len == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /* verify the end result is block aligned */
+ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
+ !IS_ALIGNED(destoff, bs))
+ goto out_unlock;
+
+ /* verify if ranges are overlapped within the same file */
+ if (same_inode) {
+ if (destoff + len > off && destoff < off + len)
+ goto out_unlock;
+ }
+
+ if (destoff > inode->i_size) {
+ ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+ if (ret)
+ goto out_unlock;
+ }
+
+ /*
+ * Lock the target range too. Right after we replace the file extent
+ * items in the fs tree (which now point to the cloned data), we might
+ * have a worker replace them with extent items relative to a write
+ * operation that was issued before this clone operation (i.e. confront
+ * with inode.c:btrfs_finish_ordered_io).
+ */
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
+
+ lock_extent_range(src, lock_start, lock_len);
+ } else {
+ lock_extent_range(src, off, len);
+ lock_extent_range(inode, destoff, len);
+ }
+
+ ret = btrfs_clone(src, inode, off, olen, len, destoff);
+
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_end = max_t(u64, off, destoff) + len - 1;
+
+ unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
+ } else {
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
+ destoff + len - 1);
+ }
+ /*
+ * Truncate page cache pages so that future reads will see the cloned
+ * data immediately and not the previous data.
+ */
+ truncate_inode_pages_range(&inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len) - 1);
+out_unlock:
+ if (!same_inode) {
+ if (inode < src) {
+ mutex_unlock(&src->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+ } else {
+ mutex_unlock(&inode->i_mutex);
+ mutex_unlock(&src->i_mutex);
+ }
+ } else {
+ mutex_unlock(&src->i_mutex);
+ }
+out_fput:
+ fdput(src_file);
+out_drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_clone_range_args args;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
+ args.src_length, args.dest_offset);
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks. They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+static long btrfs_ioctl_trans_start(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ret = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ ret = -EINPROGRESS;
+ if (file->private_data)
+ goto out;
+
+ ret = -EROFS;
+ if (btrfs_root_readonly(root))
+ goto out;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+
+ atomic_inc(&root->fs_info->open_ioctl_trans);
+
+ ret = -ENOMEM;
+ trans = btrfs_start_ioctl_transaction(root);
+ if (IS_ERR(trans))
+ goto out_drop;
+
+ file->private_data = trans;
+ return 0;
+
+out_drop:
+ atomic_dec(&root->fs_info->open_ioctl_trans);
+ mnt_drop_write_file(file);
+out:
+ return ret;
+}
+
+static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *new_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_key location;
+ struct btrfs_disk_key disk_key;
+ u64 objectid = 0;
+ u64 dir_id;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!objectid)
+ objectid = BTRFS_FS_TREE_OBJECTID;
+
+ location.objectid = objectid;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = (u64)-1;
+
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ if (IS_ERR(new_root)) {
+ ret = PTR_ERR(new_root);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ path->leave_spinning = 1;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+ di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
+ dir_id, "default", 7, 1);
+ if (IS_ERR_OR_NULL(di)) {
+ btrfs_free_path(path);
+ btrfs_end_transaction(trans, root);
+ btrfs_err(new_root->fs_info, "Umm, you don't have the default dir"
+ "item, this isn't going to work");
+ ret = -ENOENT;
+ goto out;
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
+ btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_free_path(path);
+
+ btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
+ btrfs_end_transaction(trans, root);
+out:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+void btrfs_get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space)
+{
+ struct btrfs_block_group_cache *block_group;
+
+ space->total_bytes = 0;
+ space->used_bytes = 0;
+ space->flags = 0;
+ list_for_each_entry(block_group, groups_list, list) {
+ space->flags = block_group->flags;
+ space->total_bytes += block_group->key.offset;
+ space->used_bytes +=
+ btrfs_block_group_used(&block_group->item);
+ }
+}
+
+static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_space_args space_args;
+ struct btrfs_ioctl_space_info space;
+ struct btrfs_ioctl_space_info *dest;
+ struct btrfs_ioctl_space_info *dest_orig;
+ struct btrfs_ioctl_space_info __user *user_dest;
+ struct btrfs_space_info *info;
+ u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+ int num_types = 4;
+ int alloc_size;
+ int ret = 0;
+ u64 slot_count = 0;
+ int i, c;
+
+ if (copy_from_user(&space_args,
+ (struct btrfs_ioctl_space_args __user *)arg,
+ sizeof(space_args)))
+ return -EFAULT;
+
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ info = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+ list) {
+ if (tmp->flags == types[i]) {
+ info = tmp;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!info)
+ continue;
+
+ down_read(&info->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&info->block_groups[c]))
+ slot_count++;
+ }
+ up_read(&info->groups_sem);
+ }
+
+ /*
+ * Global block reserve, exported as a space_info
+ */
+ slot_count++;
+
+ /* space_slots == 0 means they are asking for a count */
+ if (space_args.space_slots == 0) {
+ space_args.total_spaces = slot_count;
+ goto out;
+ }
+
+ slot_count = min_t(u64, space_args.space_slots, slot_count);
+
+ alloc_size = sizeof(*dest) * slot_count;
+
+ /* we generally have at most 6 or so space infos, one for each raid
+ * level. So, a whole page should be more than enough for everyone
+ */
+ if (alloc_size > PAGE_CACHE_SIZE)
+ return -ENOMEM;
+
+ space_args.total_spaces = 0;
+ dest = kmalloc(alloc_size, GFP_NOFS);
+ if (!dest)
+ return -ENOMEM;
+ dest_orig = dest;
+
+ /* now we have a buffer to copy into */
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ if (!slot_count)
+ break;
+
+ info = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+ list) {
+ if (tmp->flags == types[i]) {
+ info = tmp;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!info)
+ continue;
+ down_read(&info->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&info->block_groups[c])) {
+ btrfs_get_block_group_info(
+ &info->block_groups[c], &space);
+ memcpy(dest, &space, sizeof(space));
+ dest++;
+ space_args.total_spaces++;
+ slot_count--;
+ }
+ if (!slot_count)
+ break;
+ }
+ up_read(&info->groups_sem);
+ }
+
+ /*
+ * Add global block reserve
+ */
+ if (slot_count) {
+ struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
+
+ spin_lock(&block_rsv->lock);
+ space.total_bytes = block_rsv->size;
+ space.used_bytes = block_rsv->size - block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+ space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
+ memcpy(dest, &space, sizeof(space));
+ space_args.total_spaces++;
+ }
+
+ user_dest = (struct btrfs_ioctl_space_info __user *)
+ (arg + sizeof(struct btrfs_ioctl_space_args));
+
+ if (copy_to_user(user_dest, dest_orig, alloc_size))
+ ret = -EFAULT;
+
+ kfree(dest_orig);
+out:
+ if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks. They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_end(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+
+ trans = file->private_data;
+ if (!trans)
+ return -EINVAL;
+ file->private_data = NULL;
+
+ btrfs_end_transaction(trans, root);
+
+ atomic_dec(&root->fs_info->open_ioctl_trans);
+
+ mnt_drop_write_file(file);
+ return 0;
+}
+
+static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+ void __user *argp)
+{
+ struct btrfs_trans_handle *trans;
+ u64 transid;
+ int ret;
+
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ return PTR_ERR(trans);
+
+ /* No running transaction, don't bother */
+ transid = root->fs_info->last_trans_committed;
+ goto out;
+ }
+ transid = trans->transid;
+ ret = btrfs_commit_transaction_async(trans, root, 0);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+out:
+ if (argp)
+ if (copy_to_user(argp, &transid, sizeof(transid)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+ void __user *argp)
+{
+ u64 transid;
+
+ if (argp) {
+ if (copy_from_user(&transid, argp, sizeof(transid)))
+ return -EFAULT;
+ } else {
+ transid = 0; /* current trans */
+ }
+ return btrfs_wait_for_commit(root, transid);
+}
+
+static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_scrub_args *sa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+ &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+ 0);
+
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ if (!(sa->flags & BTRFS_SCRUB_READONLY))
+ mnt_drop_write_file(file);
+out:
+ kfree(sa);
+ return ret;
+}
+
+static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btrfs_scrub_cancel(root->fs_info);
+}
+
+static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_ioctl_scrub_args *sa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ ret = btrfs_scrub_progress(root, sa->devid, &sa->progress);
+
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ kfree(sa);
+ return ret;
+}
+
+static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_ioctl_get_dev_stats *sa;
+ int ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
+ kfree(sa);
+ return -EPERM;
+ }
+
+ ret = btrfs_get_dev_stats(root, sa);
+
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ kfree(sa);
+ return ret;
+}
+
+static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_dev_replace_args *p;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ p = memdup_user(arg, sizeof(*p));
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ switch (p->cmd) {
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+ if (root->fs_info->sb->s_flags & MS_RDONLY) {
+ ret = -EROFS;
+ goto out;
+ }
+ if (atomic_xchg(
+ &root->fs_info->mutually_exclusive_operation_running,
+ 1)) {
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ } else {
+ ret = btrfs_dev_replace_start(root, p);
+ atomic_set(
+ &root->fs_info->mutually_exclusive_operation_running,
+ 0);
+ }
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+ btrfs_dev_replace_status(root->fs_info, p);
+ ret = 0;
+ break;
+ case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+ ret = btrfs_dev_replace_cancel(root->fs_info, p);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_to_user(arg, p, sizeof(*p)))
+ ret = -EFAULT;
+out:
+ kfree(p);
+ return ret;
+}
+
+static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
+{
+ int ret = 0;
+ int i;
+ u64 rel_ptr;
+ int size;
+ struct btrfs_ioctl_ino_path_args *ipa = NULL;
+ struct inode_fs_paths *ipath = NULL;
+ struct btrfs_path *path;
+
+ if (!capable(CAP_DAC_READ_SEARCH))
+ return -EPERM;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ipa = memdup_user(arg, sizeof(*ipa));
+ if (IS_ERR(ipa)) {
+ ret = PTR_ERR(ipa);
+ ipa = NULL;
+ goto out;
+ }
+
+ size = min_t(u32, ipa->size, 4096);
+ ipath = init_ipath(size, root, path);
+ if (IS_ERR(ipath)) {
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ goto out;
+ }
+
+ ret = paths_from_inode(ipa->inum, ipath);
+ if (ret < 0)
+ goto out;
+
+ for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
+ rel_ptr = ipath->fspath->val[i] -
+ (u64)(unsigned long)ipath->fspath->val;
+ ipath->fspath->val[i] = rel_ptr;
+ }
+
+ ret = copy_to_user((void *)(unsigned long)ipa->fspath,
+ (void *)(unsigned long)ipath->fspath, size);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+out:
+ btrfs_free_path(path);
+ free_ipath(ipath);
+ kfree(ipa);
+
+ return ret;
+}
+
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
+static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
+ void __user *arg)
+{
+ int ret = 0;
+ int size;
+ struct btrfs_ioctl_logical_ino_args *loi;
+ struct btrfs_data_container *inodes = NULL;
+ struct btrfs_path *path = NULL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ loi = memdup_user(arg, sizeof(*loi));
+ if (IS_ERR(loi)) {
+ ret = PTR_ERR(loi);
+ loi = NULL;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ size = min_t(u32, loi->size, 64 * 1024);
+ inodes = init_data_container(size);
+ if (IS_ERR(inodes)) {
+ ret = PTR_ERR(inodes);
+ inodes = NULL;
+ goto out;
+ }
+
+ ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+ build_ino_list, inodes);
+ if (ret == -EINVAL)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
+
+ ret = copy_to_user((void *)(unsigned long)loi->inodes,
+ (void *)(unsigned long)inodes, size);
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ btrfs_free_path(path);
+ vfree(inodes);
+ kfree(loi);
+
+ return ret;
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ bargs->flags = bctl->flags;
+
+ if (atomic_read(&fs_info->balance_running))
+ bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+ if (atomic_read(&fs_info->balance_pause_req))
+ bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+ if (atomic_read(&fs_info->balance_cancel_req))
+ bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+ memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+ memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+ memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+ if (lock) {
+ spin_lock(&fs_info->balance_lock);
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ spin_unlock(&fs_info->balance_lock);
+ } else {
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ }
+}
+
+static long btrfs_ioctl_balance(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_balance_control *bctl;
+ bool need_unlock; /* for mut. excl. ops lock */
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+again:
+ if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+ need_unlock = true;
+ goto locked;
+ }
+
+ /*
+ * mut. excl. ops lock is locked. Three possibilites:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
+ mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* this is either (2) or (3) */
+ if (!atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ if (!mutex_trylock(&fs_info->volume_mutex))
+ goto again;
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !atomic_read(&fs_info->balance_running)) {
+ /* this is (3) */
+ need_unlock = false;
+ goto locked;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ goto again;
+ } else {
+ /* this is (2) */
+ mutex_unlock(&fs_info->balance_mutex);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ } else {
+ /* this is (1) */
+ mutex_unlock(&fs_info->balance_mutex);
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out;
+ }
+
+locked:
+ BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
+
+ if (arg) {
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ goto out_unlock;
+ }
+
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out_bargs;
+ }
+
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+
+ goto do_balance;
+ }
+ } else {
+ bargs = NULL;
+ }
+
+ if (fs_info->balance_ctl) {
+ ret = -EINPROGRESS;
+ goto out_bargs;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out_bargs;
+ }
+
+ bctl->fs_info = fs_info;
+ if (arg) {
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+ bctl->flags = bargs->flags;
+ } else {
+ /* balance everything - no filters */
+ bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ }
+
+do_balance:
+ /*
+ * Ownership of bctl and mutually_exclusive_operation_running
+ * goes to to btrfs_balance. bctl is freed in __cancel_balance,
+ * or, if restriper was paused all the way until unmount, in
+ * free_fs_info. mutually_exclusive_operation_running is
+ * cleared in __cancel_balance.
+ */
+ need_unlock = false;
+
+ ret = btrfs_balance(bctl, bargs);
+
+ if (arg) {
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+ }
+
+out_bargs:
+ kfree(bargs);
+out_unlock:
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ if (need_unlock)
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+out:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case BTRFS_BALANCE_CTL_PAUSE:
+ return btrfs_pause_balance(root->fs_info);
+ case BTRFS_BALANCE_CTL_CANCEL:
+ return btrfs_cancel_balance(root->fs_info);
+ }
+
+ return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ if (!bargs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ update_ioctl_balance_args(fs_info, 1, bargs);
+
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_quota_ctl_args *sa;
+ struct btrfs_trans_handle *trans = NULL;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ down_write(&root->fs_info->subvol_sem);
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ switch (sa->cmd) {
+ case BTRFS_QUOTA_CTL_ENABLE:
+ ret = btrfs_quota_enable(trans, root->fs_info);
+ break;
+ case BTRFS_QUOTA_CTL_DISABLE:
+ ret = btrfs_quota_disable(trans, root->fs_info);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
+ if (err && !ret)
+ ret = err;
+out:
+ kfree(sa);
+ up_write(&root->fs_info->subvol_sem);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_qgroup_assign_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /* FIXME: check if the IDs really exist */
+ if (sa->assign) {
+ ret = btrfs_add_qgroup_relation(trans, root->fs_info,
+ sa->src, sa->dst);
+ } else {
+ ret = btrfs_del_qgroup_relation(trans, root->fs_info,
+ sa->src, sa->dst);
+ }
+
+ /* update qgroup status and info */
+ err = btrfs_run_qgroups(trans, root->fs_info);
+ if (err < 0)
+ btrfs_error(root->fs_info, ret,
+ "failed to update qgroup status and info\n");
+ err = btrfs_end_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_qgroup_create_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ if (!sa->qgroupid) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /* FIXME: check if the IDs really exist */
+ if (sa->create) {
+ ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
+ } else {
+ ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
+ }
+
+ err = btrfs_end_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_qgroup_limit_args *sa;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err;
+ u64 qgroupid;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ goto drop_write;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ qgroupid = sa->qgroupid;
+ if (!qgroupid) {
+ /* take the current subvol as qgroup */
+ qgroupid = root->root_key.objectid;
+ }
+
+ /* FIXME: check if the IDs really exist */
+ ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
+
+ err = btrfs_end_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+
+out:
+ kfree(sa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_quota_rescan_args *qsa;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ qsa = memdup_user(arg, sizeof(*qsa));
+ if (IS_ERR(qsa)) {
+ ret = PTR_ERR(qsa);
+ goto drop_write;
+ }
+
+ if (qsa->flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_qgroup_rescan(root->fs_info);
+
+out:
+ kfree(qsa);
+drop_write:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_ioctl_quota_rescan_args *qsa;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+ if (!qsa)
+ return -ENOMEM;
+
+ if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ qsa->flags = 1;
+ qsa->progress = root->fs_info->qgroup_rescan_progress.objectid;
+ }
+
+ if (copy_to_user(arg, qsa, sizeof(*qsa)))
+ ret = -EFAULT;
+
+ kfree(qsa);
+ return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
+static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct btrfs_ioctl_received_subvol_args *sa)
+{
+ struct inode *inode = file_inode(file);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root_item *root_item = &root->root_item;
+ struct btrfs_trans_handle *trans;
+ struct timespec ct = CURRENT_TIME;
+ int ret = 0;
+ int received_uuid_changed;
+
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
+ ret = mnt_want_write_file(file);
+ if (ret < 0)
+ return ret;
+
+ down_write(&root->fs_info->subvol_sem);
+
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * 1 - root item
+ * 2 - uuid items (received uuid + subvol uuid)
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
+ sa->rtransid = trans->transid;
+ sa->rtime.sec = ct.tv_sec;
+ sa->rtime.nsec = ct.tv_nsec;
+
+ received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
+ BTRFS_UUID_SIZE);
+ if (received_uuid_changed &&
+ !btrfs_is_empty_uuid(root_item->received_uuid))
+ btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+ root_item->received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ root->root_key.objectid);
+ memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
+ btrfs_set_root_stransid(root_item, sa->stransid);
+ btrfs_set_root_rtransid(root_item, sa->rtransid);
+ btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
+ btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
+ btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
+ btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ if (ret < 0) {
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
+ ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+ sa->uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ root->root_key.objectid);
+ if (ret < 0 && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+out:
+ up_write(&root->fs_info->subvol_sem);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+#ifdef CONFIG_64BIT
+static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+ struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+ int ret = 0;
+
+ args32 = memdup_user(arg, sizeof(*args32));
+ if (IS_ERR(args32)) {
+ ret = PTR_ERR(args32);
+ args32 = NULL;
+ goto out;
+ }
+
+ args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ if (!args64) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+ args64->stransid = args32->stransid;
+ args64->rtransid = args32->rtransid;
+ args64->stime.sec = args32->stime.sec;
+ args64->stime.nsec = args32->stime.nsec;
+ args64->rtime.sec = args32->rtime.sec;
+ args64->rtime.nsec = args32->rtime.nsec;
+ args64->flags = args32->flags;
+
+ ret = _btrfs_ioctl_set_received_subvol(file, args64);
+ if (ret)
+ goto out;
+
+ memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+ args32->stransid = args64->stransid;
+ args32->rtransid = args64->rtransid;
+ args32->stime.sec = args64->stime.sec;
+ args32->stime.nsec = args64->stime.nsec;
+ args32->rtime.sec = args64->rtime.sec;
+ args32->rtime.nsec = args64->rtime.nsec;
+ args32->flags = args64->flags;
+
+ ret = copy_to_user(arg, args32, sizeof(*args32));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(args32);
+ kfree(args64);
+ return ret;
+}
+#endif
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args *sa = NULL;
+ int ret = 0;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ sa = NULL;
+ goto out;
+ }
+
+ ret = _btrfs_ioctl_set_received_subvol(file, sa);
+
+ if (ret)
+ goto out;
+
+ ret = copy_to_user(arg, sa, sizeof(*sa));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(sa);
+ return ret;
+}
+
+static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ size_t len;
+ int ret;
+ char label[BTRFS_LABEL_SIZE];
+
+ spin_lock(&root->fs_info->super_lock);
+ memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE);
+ spin_unlock(&root->fs_info->super_lock);
+
+ len = strnlen(label, BTRFS_LABEL_SIZE);
+
+ if (len == BTRFS_LABEL_SIZE) {
+ btrfs_warn(root->fs_info,
+ "label is too long, return the first %zu bytes", --len);
+ }
+
+ ret = copy_to_user(arg, label, len);
+
+ return ret ? -EFAULT : 0;
+}
+
+static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct btrfs_trans_handle *trans;
+ char label[BTRFS_LABEL_SIZE];
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, arg, sizeof(label)))
+ return -EFAULT;
+
+ if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
+ btrfs_err(root->fs_info, "unable to set label with more than %d bytes",
+ BTRFS_LABEL_SIZE - 1);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ spin_lock(&root->fs_info->super_lock);
+ strcpy(super_block->label, label);
+ spin_unlock(&root->fs_info->super_lock);
+ ret = btrfs_commit_transaction(trans, root);
+
+out_unlock:
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+#define INIT_FEATURE_FLAGS(suffix) \
+ { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
+ .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
+ .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
+
+static int btrfs_ioctl_get_supported_features(struct file *file,
+ void __user *arg)
+{
+ static struct btrfs_ioctl_feature_flags features[3] = {
+ INIT_FEATURE_FLAGS(SUPP),
+ INIT_FEATURE_FLAGS(SAFE_SET),
+ INIT_FEATURE_FLAGS(SAFE_CLEAR)
+ };
+
+ if (copy_to_user(arg, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct btrfs_ioctl_feature_flags features;
+
+ features.compat_flags = btrfs_super_compat_flags(super_block);
+ features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
+ features.incompat_flags = btrfs_super_incompat_flags(super_block);
+
+ if (copy_to_user(arg, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int check_feature_bits(struct btrfs_root *root,
+ enum btrfs_feature_set set,
+ u64 change_mask, u64 flags, u64 supported_flags,
+ u64 safe_set, u64 safe_clear)
+{
+ const char *type = btrfs_feature_set_names[set];
+ char *names;
+ u64 disallowed, unsupported;
+ u64 set_mask = flags & change_mask;
+ u64 clear_mask = ~flags & change_mask;
+
+ unsupported = set_mask & ~supported_flags;
+ if (unsupported) {
+ names = btrfs_printable_features(set, unsupported);
+ if (names) {
+ btrfs_warn(root->fs_info,
+ "this kernel does not support the %s feature bit%s",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(root->fs_info,
+ "this kernel does not support %s bits 0x%llx",
+ type, unsupported);
+ return -EOPNOTSUPP;
+ }
+
+ disallowed = set_mask & ~safe_set;
+ if (disallowed) {
+ names = btrfs_printable_features(set, disallowed);
+ if (names) {
+ btrfs_warn(root->fs_info,
+ "can't set the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(root->fs_info,
+ "can't set %s bits 0x%llx while mounted",
+ type, disallowed);
+ return -EPERM;
+ }
+
+ disallowed = clear_mask & ~safe_clear;
+ if (disallowed) {
+ names = btrfs_printable_features(set, disallowed);
+ if (names) {
+ btrfs_warn(root->fs_info,
+ "can't clear the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(root->fs_info,
+ "can't clear %s bits 0x%llx while mounted",
+ type, disallowed);
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+#define check_feature(root, change_mask, flags, mask_base) \
+check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \
+ BTRFS_FEATURE_ ## mask_base ## _SUPP, \
+ BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
+ BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
+
+static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct btrfs_ioctl_feature_flags flags[2];
+ struct btrfs_trans_handle *trans;
+ u64 newflags;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ /* Nothing to do */
+ if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
+ !flags[0].incompat_flags)
+ return 0;
+
+ ret = check_feature(root, flags[0].compat_flags,
+ flags[1].compat_flags, COMPAT);
+ if (ret)
+ return ret;
+
+ ret = check_feature(root, flags[0].compat_ro_flags,
+ flags[1].compat_ro_flags, COMPAT_RO);
+ if (ret)
+ return ret;
+
+ ret = check_feature(root, flags[0].incompat_flags,
+ flags[1].incompat_flags, INCOMPAT);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ spin_lock(&root->fs_info->super_lock);
+ newflags = btrfs_super_compat_flags(super_block);
+ newflags |= flags[0].compat_flags & flags[1].compat_flags;
+ newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
+ btrfs_set_super_compat_flags(super_block, newflags);
+
+ newflags = btrfs_super_compat_ro_flags(super_block);
+ newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
+ newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
+ btrfs_set_super_compat_ro_flags(super_block, newflags);
+
+ newflags = btrfs_super_incompat_flags(super_block);
+ newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
+ newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
+ btrfs_set_super_incompat_flags(super_block, newflags);
+ spin_unlock(&root->fs_info->super_lock);
+
+ return btrfs_commit_transaction(trans, root);
+}
+
+long btrfs_ioctl(struct file *file, unsigned int
+ cmd, unsigned long arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return btrfs_ioctl_getflags(file, argp);
+ case FS_IOC_SETFLAGS:
+ return btrfs_ioctl_setflags(file, argp);
+ case FS_IOC_GETVERSION:
+ return btrfs_ioctl_getversion(file, argp);
+ case FITRIM:
+ return btrfs_ioctl_fitrim(file, argp);
+ case BTRFS_IOC_SNAP_CREATE:
+ return btrfs_ioctl_snap_create(file, argp, 0);
+ case BTRFS_IOC_SNAP_CREATE_V2:
+ return btrfs_ioctl_snap_create_v2(file, argp, 0);
+ case BTRFS_IOC_SUBVOL_CREATE:
+ return btrfs_ioctl_snap_create(file, argp, 1);
+ case BTRFS_IOC_SUBVOL_CREATE_V2:
+ return btrfs_ioctl_snap_create_v2(file, argp, 1);
+ case BTRFS_IOC_SNAP_DESTROY:
+ return btrfs_ioctl_snap_destroy(file, argp);
+ case BTRFS_IOC_SUBVOL_GETFLAGS:
+ return btrfs_ioctl_subvol_getflags(file, argp);
+ case BTRFS_IOC_SUBVOL_SETFLAGS:
+ return btrfs_ioctl_subvol_setflags(file, argp);
+ case BTRFS_IOC_DEFAULT_SUBVOL:
+ return btrfs_ioctl_default_subvol(file, argp);
+ case BTRFS_IOC_DEFRAG:
+ return btrfs_ioctl_defrag(file, NULL);
+ case BTRFS_IOC_DEFRAG_RANGE:
+ return btrfs_ioctl_defrag(file, argp);
+ case BTRFS_IOC_RESIZE:
+ return btrfs_ioctl_resize(file, argp);
+ case BTRFS_IOC_ADD_DEV:
+ return btrfs_ioctl_add_dev(root, argp);
+ case BTRFS_IOC_RM_DEV:
+ return btrfs_ioctl_rm_dev(file, argp);
+ case BTRFS_IOC_FS_INFO:
+ return btrfs_ioctl_fs_info(root, argp);
+ case BTRFS_IOC_DEV_INFO:
+ return btrfs_ioctl_dev_info(root, argp);
+ case BTRFS_IOC_BALANCE:
+ return btrfs_ioctl_balance(file, NULL);
+ case BTRFS_IOC_CLONE:
+ return btrfs_ioctl_clone(file, arg, 0, 0, 0);
+ case BTRFS_IOC_CLONE_RANGE:
+ return btrfs_ioctl_clone_range(file, argp);
+ case BTRFS_IOC_TRANS_START:
+ return btrfs_ioctl_trans_start(file);
+ case BTRFS_IOC_TRANS_END:
+ return btrfs_ioctl_trans_end(file);
+ case BTRFS_IOC_TREE_SEARCH:
+ return btrfs_ioctl_tree_search(file, argp);
+ case BTRFS_IOC_TREE_SEARCH_V2:
+ return btrfs_ioctl_tree_search_v2(file, argp);
+ case BTRFS_IOC_INO_LOOKUP:
+ return btrfs_ioctl_ino_lookup(file, argp);
+ case BTRFS_IOC_INO_PATHS:
+ return btrfs_ioctl_ino_to_path(root, argp);
+ case BTRFS_IOC_LOGICAL_INO:
+ return btrfs_ioctl_logical_to_ino(root, argp);
+ case BTRFS_IOC_SPACE_INFO:
+ return btrfs_ioctl_space_info(root, argp);
+ case BTRFS_IOC_SYNC: {
+ int ret;
+
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
+ if (ret)
+ return ret;
+ ret = btrfs_sync_fs(file_inode(file)->i_sb, 1);
+ /*
+ * The transaction thread may want to do more work,
+ * namely it pokes the cleaner ktread that will start
+ * processing uncleaned subvols.
+ */
+ wake_up_process(root->fs_info->transaction_kthread);
+ return ret;
+ }
+ case BTRFS_IOC_START_SYNC:
+ return btrfs_ioctl_start_sync(root, argp);
+ case BTRFS_IOC_WAIT_SYNC:
+ return btrfs_ioctl_wait_sync(root, argp);
+ case BTRFS_IOC_SCRUB:
+ return btrfs_ioctl_scrub(file, argp);
+ case BTRFS_IOC_SCRUB_CANCEL:
+ return btrfs_ioctl_scrub_cancel(root, argp);
+ case BTRFS_IOC_SCRUB_PROGRESS:
+ return btrfs_ioctl_scrub_progress(root, argp);
+ case BTRFS_IOC_BALANCE_V2:
+ return btrfs_ioctl_balance(file, argp);
+ case BTRFS_IOC_BALANCE_CTL:
+ return btrfs_ioctl_balance_ctl(root, arg);
+ case BTRFS_IOC_BALANCE_PROGRESS:
+ return btrfs_ioctl_balance_progress(root, argp);
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL:
+ return btrfs_ioctl_set_received_subvol(file, argp);
+#ifdef CONFIG_64BIT
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+ return btrfs_ioctl_set_received_subvol_32(file, argp);
+#endif
+ case BTRFS_IOC_SEND:
+ return btrfs_ioctl_send(file, argp);
+ case BTRFS_IOC_GET_DEV_STATS:
+ return btrfs_ioctl_get_dev_stats(root, argp);
+ case BTRFS_IOC_QUOTA_CTL:
+ return btrfs_ioctl_quota_ctl(file, argp);
+ case BTRFS_IOC_QGROUP_ASSIGN:
+ return btrfs_ioctl_qgroup_assign(file, argp);
+ case BTRFS_IOC_QGROUP_CREATE:
+ return btrfs_ioctl_qgroup_create(file, argp);
+ case BTRFS_IOC_QGROUP_LIMIT:
+ return btrfs_ioctl_qgroup_limit(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN:
+ return btrfs_ioctl_quota_rescan(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_STATUS:
+ return btrfs_ioctl_quota_rescan_status(file, argp);
+ case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+ return btrfs_ioctl_quota_rescan_wait(file, argp);
+ case BTRFS_IOC_DEV_REPLACE:
+ return btrfs_ioctl_dev_replace(root, argp);
+ case BTRFS_IOC_GET_FSLABEL:
+ return btrfs_ioctl_get_fslabel(file, argp);
+ case BTRFS_IOC_SET_FSLABEL:
+ return btrfs_ioctl_set_fslabel(file, argp);
+ case BTRFS_IOC_FILE_EXTENT_SAME:
+ return btrfs_ioctl_file_extent_same(file, argp);
+ case BTRFS_IOC_GET_SUPPORTED_FEATURES:
+ return btrfs_ioctl_get_supported_features(file, argp);
+ case BTRFS_IOC_GET_FEATURES:
+ return btrfs_ioctl_get_features(file, argp);
+ case BTRFS_IOC_SET_FEATURES:
+ return btrfs_ioctl_set_features(file, argp);
+ }
+
+ return -ENOTTY;
+}
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000..f8229ef1b
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <asm/bug.h>
+#include "ctree.h"
+#include "extent_io.h"
+#include "locking.h"
+
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
+
+/*
+ * if we currently have a spinning reader or writer lock
+ * (indicated by the rw flag) this will bump the count
+ * of blocking holders and drop the spinlock.
+ */
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
+{
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ if (rw == BTRFS_WRITE_LOCK) {
+ if (atomic_read(&eb->blocking_writers) == 0) {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ btrfs_assert_tree_locked(eb);
+ atomic_inc(&eb->blocking_writers);
+ write_unlock(&eb->lock);
+ }
+ } else if (rw == BTRFS_READ_LOCK) {
+ btrfs_assert_tree_read_locked(eb);
+ atomic_inc(&eb->blocking_readers);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ read_unlock(&eb->lock);
+ }
+ return;
+}
+
+/*
+ * if we currently have a blocking lock, take the spinlock
+ * and drop our blocking count
+ */
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
+{
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+
+ if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
+ BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+ write_lock(&eb->lock);
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ if (atomic_dec_and_test(&eb->blocking_writers) &&
+ waitqueue_active(&eb->write_lock_wq))
+ wake_up(&eb->write_lock_wq);
+ } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
+ BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+ read_lock(&eb->lock);
+ atomic_inc(&eb->spinning_readers);
+ if (atomic_dec_and_test(&eb->blocking_readers) &&
+ waitqueue_active(&eb->read_lock_wq))
+ wake_up(&eb->read_lock_wq);
+ }
+ return;
+}
+
+/*
+ * take a spinning read lock. This will wait for any blocking
+ * writers
+ */
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+again:
+ BUG_ON(!atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner);
+
+ read_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner) {
+ /*
+ * This extent is already write-locked by our thread. We allow
+ * an additional read lock to be added because it's for the same
+ * thread. btrfs_find_all_roots() depends on this as it may be
+ * called on a partly (write-)locked tree.
+ */
+ BUG_ON(eb->lock_nested);
+ eb->lock_nested = 1;
+ read_unlock(&eb->lock);
+ return;
+ }
+ if (atomic_read(&eb->blocking_writers)) {
+ read_unlock(&eb->lock);
+ wait_event(eb->write_lock_wq,
+ atomic_read(&eb->blocking_writers) == 0);
+ goto again;
+ }
+ atomic_inc(&eb->read_locks);
+ atomic_inc(&eb->spinning_readers);
+}
+
+/*
+ * take a spinning read lock.
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
+ */
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
+{
+ if (atomic_read(&eb->blocking_writers))
+ return 0;
+
+ read_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers)) {
+ read_unlock(&eb->lock);
+ return 0;
+ }
+ atomic_inc(&eb->read_locks);
+ atomic_inc(&eb->spinning_readers);
+ return 1;
+}
+
+/*
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
+ */
+int btrfs_try_tree_read_lock(struct extent_buffer *eb)
+{
+ if (atomic_read(&eb->blocking_writers))
+ return 0;
+
+ if (!read_trylock(&eb->lock))
+ return 0;
+
+ if (atomic_read(&eb->blocking_writers)) {
+ read_unlock(&eb->lock);
+ return 0;
+ }
+ atomic_inc(&eb->read_locks);
+ atomic_inc(&eb->spinning_readers);
+ return 1;
+}
+
+/*
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers or readers
+ */
+int btrfs_try_tree_write_lock(struct extent_buffer *eb)
+{
+ if (atomic_read(&eb->blocking_writers) ||
+ atomic_read(&eb->blocking_readers))
+ return 0;
+
+ write_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers) ||
+ atomic_read(&eb->blocking_readers)) {
+ write_unlock(&eb->lock);
+ return 0;
+ }
+ atomic_inc(&eb->write_locks);
+ atomic_inc(&eb->spinning_writers);
+ eb->lock_owner = current->pid;
+ return 1;
+}
+
+/*
+ * drop a spinning read lock
+ */
+void btrfs_tree_read_unlock(struct extent_buffer *eb)
+{
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
+ }
+ btrfs_assert_tree_read_locked(eb);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ atomic_dec(&eb->read_locks);
+ read_unlock(&eb->lock);
+}
+
+/*
+ * drop a blocking read lock
+ */
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
+{
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
+ }
+ btrfs_assert_tree_read_locked(eb);
+ WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+ if (atomic_dec_and_test(&eb->blocking_readers) &&
+ waitqueue_active(&eb->read_lock_wq))
+ wake_up(&eb->read_lock_wq);
+ atomic_dec(&eb->read_locks);
+}
+
+/*
+ * take a spinning write lock. This will wait for both
+ * blocking readers or writers
+ */
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+again:
+ wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
+ wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
+ write_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_readers)) {
+ write_unlock(&eb->lock);
+ wait_event(eb->read_lock_wq,
+ atomic_read(&eb->blocking_readers) == 0);
+ goto again;
+ }
+ if (atomic_read(&eb->blocking_writers)) {
+ write_unlock(&eb->lock);
+ wait_event(eb->write_lock_wq,
+ atomic_read(&eb->blocking_writers) == 0);
+ goto again;
+ }
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ atomic_inc(&eb->write_locks);
+ eb->lock_owner = current->pid;
+}
+
+/*
+ * drop a spinning or a blocking write lock.
+ */
+void btrfs_tree_unlock(struct extent_buffer *eb)
+{
+ int blockers = atomic_read(&eb->blocking_writers);
+
+ BUG_ON(blockers > 1);
+
+ btrfs_assert_tree_locked(eb);
+ eb->lock_owner = 0;
+ atomic_dec(&eb->write_locks);
+
+ if (blockers) {
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_dec(&eb->blocking_writers);
+ smp_mb();
+ if (waitqueue_active(&eb->write_lock_wq))
+ wake_up(&eb->write_lock_wq);
+ } else {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ write_unlock(&eb->lock);
+ }
+}
+
+void btrfs_assert_tree_locked(struct extent_buffer *eb)
+{
+ BUG_ON(!atomic_read(&eb->write_locks));
+}
+
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+ BUG_ON(!atomic_read(&eb->read_locks));
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000..c44a9d5f5
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_LOCKING_
+#define __BTRFS_LOCKING_
+
+#define BTRFS_WRITE_LOCK 1
+#define BTRFS_READ_LOCK 2
+#define BTRFS_WRITE_LOCK_BLOCKING 3
+#define BTRFS_READ_LOCK_BLOCKING 4
+
+void btrfs_tree_lock(struct extent_buffer *eb);
+void btrfs_tree_unlock(struct extent_buffer *eb);
+
+void btrfs_tree_read_lock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock(struct extent_buffer *eb);
+void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_assert_tree_locked(struct extent_buffer *eb);
+int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+
+
+static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
+{
+ if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
+ btrfs_tree_unlock(eb);
+ else if (rw == BTRFS_READ_LOCK_BLOCKING)
+ btrfs_tree_read_unlock_blocking(eb);
+ else if (rw == BTRFS_READ_LOCK)
+ btrfs_tree_read_unlock(eb);
+ else
+ BUG();
+}
+
+static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
+{
+ btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
+}
+
+static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+ btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
+}
+#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000..a2f051347
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/lzo.h>
+#include "compression.h"
+
+#define LZO_LEN 4
+
+struct workspace {
+ void *mem;
+ void *buf; /* where decompressed data goes */
+ void *cbuf; /* where compressed data goes */
+ struct list_head list;
+};
+
+static void lzo_free_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ vfree(workspace->buf);
+ vfree(workspace->cbuf);
+ vfree(workspace->mem);
+ kfree(workspace);
+}
+
+static struct list_head *lzo_alloc_workspace(void)
+{
+ struct workspace *workspace;
+
+ workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ if (!workspace)
+ return ERR_PTR(-ENOMEM);
+
+ workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
+ workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+ workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+ if (!workspace->mem || !workspace->buf || !workspace->cbuf)
+ goto fail;
+
+ INIT_LIST_HEAD(&workspace->list);
+
+ return &workspace->list;
+fail:
+ lzo_free_workspace(&workspace->list);
+ return ERR_PTR(-ENOMEM);
+}
+
+static inline void write_compress_length(char *buf, size_t len)
+{
+ __le32 dlen;
+
+ dlen = cpu_to_le32(len);
+ memcpy(buf, &dlen, LZO_LEN);
+}
+
+static inline size_t read_compress_length(char *buf)
+{
+ __le32 dlen;
+
+ memcpy(&dlen, buf, LZO_LEN);
+ return le32_to_cpu(dlen);
+}
+
+static int lzo_compress_pages(struct list_head *ws,
+ struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret = 0;
+ char *data_in;
+ char *cpage_out;
+ int nr_pages = 0;
+ struct page *in_page = NULL;
+ struct page *out_page = NULL;
+ unsigned long bytes_left;
+
+ size_t in_len;
+ size_t out_len;
+ char *buf;
+ unsigned long tot_in = 0;
+ unsigned long tot_out = 0;
+ unsigned long pg_bytes_left;
+ unsigned long out_offset;
+ unsigned long bytes;
+
+ *out_pages = 0;
+ *total_out = 0;
+ *total_in = 0;
+
+ in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+
+ /*
+ * store the size of all chunks of compressed data in
+ * the first 4 bytes
+ */
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = kmap(out_page);
+ out_offset = LZO_LEN;
+ tot_out = LZO_LEN;
+ pages[0] = out_page;
+ nr_pages = 1;
+ pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+ /* compress at most one page of data each time */
+ in_len = min(len, PAGE_CACHE_SIZE);
+ while (tot_in < len) {
+ ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
+ &out_len, workspace->mem);
+ if (ret != LZO_E_OK) {
+ printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
+ ret);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* store the size of this chunk of compressed data */
+ write_compress_length(cpage_out + out_offset, out_len);
+ tot_out += LZO_LEN;
+ out_offset += LZO_LEN;
+ pg_bytes_left -= LZO_LEN;
+
+ tot_in += in_len;
+ tot_out += out_len;
+
+ /* copy bytes from the working buffer into the pages */
+ buf = workspace->cbuf;
+ while (out_len) {
+ bytes = min_t(unsigned long, pg_bytes_left, out_len);
+
+ memcpy(cpage_out + out_offset, buf, bytes);
+
+ out_len -= bytes;
+ pg_bytes_left -= bytes;
+ buf += bytes;
+ out_offset += bytes;
+
+ /*
+ * we need another page for writing out.
+ *
+ * Note if there's less than 4 bytes left, we just
+ * skip to a new page.
+ */
+ if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
+ pg_bytes_left == 0) {
+ if (pg_bytes_left) {
+ memset(cpage_out + out_offset, 0,
+ pg_bytes_left);
+ tot_out += pg_bytes_left;
+ }
+
+ /* we're done, don't allocate new page */
+ if (out_len == 0 && tot_in >= len)
+ break;
+
+ kunmap(out_page);
+ if (nr_pages == nr_dest_pages) {
+ out_page = NULL;
+ ret = -E2BIG;
+ goto out;
+ }
+
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = kmap(out_page);
+ pages[nr_pages++] = out_page;
+
+ pg_bytes_left = PAGE_CACHE_SIZE;
+ out_offset = 0;
+ }
+ }
+
+ /* we're making it bigger, give up */
+ if (tot_in > 8192 && tot_in < tot_out) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ /* we're all done */
+ if (tot_in >= len)
+ break;
+
+ if (tot_out > max_out)
+ break;
+
+ bytes_left = len - tot_in;
+ kunmap(in_page);
+ page_cache_release(in_page);
+
+ start += PAGE_CACHE_SIZE;
+ in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+ in_len = min(bytes_left, PAGE_CACHE_SIZE);
+ }
+
+ if (tot_out > tot_in)
+ goto out;
+
+ /* store the size of all chunks of compressed data */
+ cpage_out = kmap(pages[0]);
+ write_compress_length(cpage_out, tot_out);
+
+ kunmap(pages[0]);
+
+ ret = 0;
+ *total_out = tot_out;
+ *total_in = tot_in;
+out:
+ *out_pages = nr_pages;
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
+ page_cache_release(in_page);
+ }
+
+ return ret;
+}
+
+static int lzo_decompress_biovec(struct list_head *ws,
+ struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret = 0, ret2;
+ char *data_in;
+ unsigned long page_in_index = 0;
+ unsigned long page_out_index = 0;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+ unsigned long buf_start;
+ unsigned long buf_offset = 0;
+ unsigned long bytes;
+ unsigned long working_bytes;
+ unsigned long pg_offset;
+
+ size_t in_len;
+ size_t out_len;
+ unsigned long in_offset;
+ unsigned long in_page_bytes_left;
+ unsigned long tot_in;
+ unsigned long tot_out;
+ unsigned long tot_len;
+ char *buf;
+ bool may_late_unmap, need_unmap;
+
+ data_in = kmap(pages_in[0]);
+ tot_len = read_compress_length(data_in);
+
+ tot_in = LZO_LEN;
+ in_offset = LZO_LEN;
+ tot_len = min_t(size_t, srclen, tot_len);
+ in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+ tot_out = 0;
+ pg_offset = 0;
+
+ while (tot_in < tot_len) {
+ in_len = read_compress_length(data_in + in_offset);
+ in_page_bytes_left -= LZO_LEN;
+ in_offset += LZO_LEN;
+ tot_in += LZO_LEN;
+
+ tot_in += in_len;
+ working_bytes = in_len;
+ may_late_unmap = need_unmap = false;
+
+ /* fast path: avoid using the working buffer */
+ if (in_page_bytes_left >= in_len) {
+ buf = data_in + in_offset;
+ bytes = in_len;
+ may_late_unmap = true;
+ goto cont;
+ }
+
+ /* copy bytes from the pages into the working buffer */
+ buf = workspace->cbuf;
+ buf_offset = 0;
+ while (working_bytes) {
+ bytes = min(working_bytes, in_page_bytes_left);
+
+ memcpy(buf + buf_offset, data_in + in_offset, bytes);
+ buf_offset += bytes;
+cont:
+ working_bytes -= bytes;
+ in_page_bytes_left -= bytes;
+ in_offset += bytes;
+
+ /* check if we need to pick another page */
+ if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
+ || in_page_bytes_left == 0) {
+ tot_in += in_page_bytes_left;
+
+ if (working_bytes == 0 && tot_in >= tot_len)
+ break;
+
+ if (page_in_index + 1 >= total_pages_in) {
+ ret = -EIO;
+ goto done;
+ }
+
+ if (may_late_unmap)
+ need_unmap = true;
+ else
+ kunmap(pages_in[page_in_index]);
+
+ data_in = kmap(pages_in[++page_in_index]);
+
+ in_page_bytes_left = PAGE_CACHE_SIZE;
+ in_offset = 0;
+ }
+ }
+
+ out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
+ ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
+ &out_len);
+ if (need_unmap)
+ kunmap(pages_in[page_in_index - 1]);
+ if (ret != LZO_E_OK) {
+ printk(KERN_WARNING "BTRFS: decompress failed\n");
+ ret = -EIO;
+ break;
+ }
+
+ buf_start = tot_out;
+ tot_out += out_len;
+
+ ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+ tot_out, disk_start,
+ bvec, vcnt,
+ &page_out_index, &pg_offset);
+ if (ret2 == 0)
+ break;
+ }
+done:
+ kunmap(pages_in[page_in_index]);
+ if (!ret)
+ btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset);
+ return ret;
+}
+
+static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ size_t in_len;
+ size_t out_len;
+ size_t tot_len;
+ int ret = 0;
+ char *kaddr;
+ unsigned long bytes;
+
+ BUG_ON(srclen < LZO_LEN);
+
+ tot_len = read_compress_length(data_in);
+ data_in += LZO_LEN;
+
+ in_len = read_compress_length(data_in);
+ data_in += LZO_LEN;
+
+ out_len = PAGE_CACHE_SIZE;
+ ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
+ if (ret != LZO_E_OK) {
+ printk(KERN_WARNING "BTRFS: decompress failed!\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ if (out_len < start_byte) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * the caller is already checking against PAGE_SIZE, but lets
+ * move this check closer to the memcpy/memset
+ */
+ destlen = min_t(unsigned long, destlen, PAGE_SIZE);
+ bytes = min_t(unsigned long, destlen, out_len - start_byte);
+
+ kaddr = kmap_atomic(dest_page);
+ memcpy(kaddr, workspace->buf + start_byte, bytes);
+
+ /*
+ * btrfs_getblock is doing a zero on the tail of the page too,
+ * but this will cover anything missing from the decompressed
+ * data.
+ */
+ if (bytes < destlen)
+ memset(kaddr+bytes, 0, destlen-bytes);
+ kunmap_atomic(kaddr);
+out:
+ return ret;
+}
+
+const struct btrfs_compress_op btrfs_lzo_compress = {
+ .alloc_workspace = lzo_alloc_workspace,
+ .free_workspace = lzo_free_workspace,
+ .compress_pages = lzo_compress_pages,
+ .decompress_biovec = lzo_decompress_biovec,
+ .decompress = lzo_decompress,
+};
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000..1b10a3cd1
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,42 @@
+
+/*
+ * Copyright (C) 2012 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_MATH_H
+#define __BTRFS_MATH_H
+
+#include <asm/div64.h>
+
+static inline u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ return div_u64(num, 10);
+}
+
+static inline u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor == 100)
+ return num;
+ num *= factor;
+ return div_u64(num, 100);
+}
+
+#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000..760c4a5e0
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,1051 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "extent_io.h"
+#include "disk-io.h"
+
+static struct kmem_cache *btrfs_ordered_extent_cache;
+
+static u64 entry_end(struct btrfs_ordered_extent *entry)
+{
+ if (entry->file_offset + entry->len < entry->file_offset)
+ return (u64)-1;
+ return entry->file_offset + entry->len;
+}
+
+/* returns NULL if the insertion worked, or it returns the node it did find
+ * in the tree
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_ordered_extent *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
+
+ if (file_offset < entry->file_offset)
+ p = &(*p)->rb_left;
+ else if (file_offset >= entry_end(entry))
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static void ordered_data_tree_panic(struct inode *inode, int errno,
+ u64 offset)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
+ "%llu", offset);
+}
+
+/*
+ * look for a given offset in the tree, and if it can't be found return the
+ * first lesser offset
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
+ struct rb_node **prev_ret)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *test;
+ struct btrfs_ordered_extent *entry;
+ struct btrfs_ordered_extent *prev_entry = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ if (file_offset < entry->file_offset)
+ n = n->rb_left;
+ else if (file_offset >= entry_end(entry))
+ n = n->rb_right;
+ else
+ return n;
+ }
+ if (!prev_ret)
+ return NULL;
+
+ while (prev && file_offset >= entry_end(prev_entry)) {
+ test = rb_next(prev);
+ if (!test)
+ break;
+ prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+ rb_node);
+ if (file_offset < entry_end(prev_entry))
+ break;
+
+ prev = test;
+ }
+ if (prev)
+ prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
+ rb_node);
+ while (prev && file_offset < entry_end(prev_entry)) {
+ test = rb_prev(prev);
+ if (!test)
+ break;
+ prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+ rb_node);
+ prev = test;
+ }
+ *prev_ret = prev;
+ return NULL;
+}
+
+/*
+ * helper to check if a given offset is inside a given entry
+ */
+static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
+{
+ if (file_offset < entry->file_offset ||
+ entry->file_offset + entry->len <= file_offset)
+ return 0;
+ return 1;
+}
+
+static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+ u64 len)
+{
+ if (file_offset + len <= entry->file_offset ||
+ entry->file_offset + entry->len <= file_offset)
+ return 0;
+ return 1;
+}
+
+/*
+ * look find the first ordered struct that has this offset, otherwise
+ * the first one less than this offset
+ */
+static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+ u64 file_offset)
+{
+ struct rb_root *root = &tree->tree;
+ struct rb_node *prev = NULL;
+ struct rb_node *ret;
+ struct btrfs_ordered_extent *entry;
+
+ if (tree->last) {
+ entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+ rb_node);
+ if (offset_in_entry(entry, file_offset))
+ return tree->last;
+ }
+ ret = __tree_search(root, file_offset, &prev);
+ if (!ret)
+ ret = prev;
+ if (ret)
+ tree->last = ret;
+ return ret;
+}
+
+/* allocate and add a new ordered_extent into the per-inode tree.
+ * file_offset is the logical offset in the file
+ *
+ * start is the disk block number of an extent already reserved in the
+ * extent allocation tree
+ *
+ * len is the length of the extent
+ *
+ * The tree is given a single reference on the ordered extent that was
+ * inserted.
+ */
+static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len,
+ int type, int dio, int compress_type)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->file_offset = file_offset;
+ entry->start = start;
+ entry->len = len;
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
+ !(type == BTRFS_ORDERED_NOCOW))
+ entry->csum_bytes_left = disk_len;
+ entry->disk_len = disk_len;
+ entry->bytes_left = len;
+ entry->inode = igrab(inode);
+ entry->compress_type = compress_type;
+ entry->truncated_len = (u64)-1;
+ if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
+ set_bit(type, &entry->flags);
+
+ if (dio)
+ set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+
+ /* one ref for the tree */
+ atomic_set(&entry->refs, 1);
+ init_waitqueue_head(&entry->wait);
+ INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->root_extent_list);
+ INIT_LIST_HEAD(&entry->work_list);
+ init_completion(&entry->completion);
+ INIT_LIST_HEAD(&entry->log_list);
+ INIT_LIST_HEAD(&entry->trans_list);
+
+ trace_btrfs_ordered_extent_add(inode, entry);
+
+ spin_lock_irq(&tree->lock);
+ node = tree_insert(&tree->tree, file_offset,
+ &entry->rb_node);
+ if (node)
+ ordered_data_tree_panic(inode, -EEXIST, file_offset);
+ spin_unlock_irq(&tree->lock);
+
+ spin_lock(&root->ordered_extent_lock);
+ list_add_tail(&entry->root_extent_list,
+ &root->ordered_extents);
+ root->nr_ordered_extents++;
+ if (root->nr_ordered_extents == 1) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(!list_empty(&root->ordered_root));
+ list_add_tail(&root->ordered_root,
+ &root->fs_info->ordered_roots);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
+
+ return 0;
+}
+
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0,
+ BTRFS_COMPRESS_NONE);
+}
+
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 1,
+ BTRFS_COMPRESS_NONE);
+}
+
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len,
+ int type, int compress_type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0,
+ compress_type);
+}
+
+/*
+ * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
+ * when an ordered extent is finished. If the list covers more than one
+ * ordered extent, it is split across multiples.
+ */
+void btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum)
+{
+ struct btrfs_ordered_inode_tree *tree;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ list_add_tail(&sum->list, &entry->list);
+ WARN_ON(entry->csum_bytes_left < sum->len);
+ entry->csum_bytes_left -= sum->len;
+ if (entry->csum_bytes_left == 0)
+ wake_up(&entry->wait);
+ spin_unlock_irq(&tree->lock);
+}
+
+/*
+ * this is used to account for finished IO across a given range
+ * of the file. The IO may span ordered extents. If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ *
+ * file_offset is updated to one byte past the range that is recorded as
+ * complete. This allows you to walk forward in the file.
+ */
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 *file_offset, u64 io_size, int uptodate)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+ int ret;
+ unsigned long flags;
+ u64 dec_end;
+ u64 dec_start;
+ u64 to_dec;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irqsave(&tree->lock, flags);
+ node = tree_search(tree, *file_offset);
+ if (!node) {
+ ret = 1;
+ goto out;
+ }
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (!offset_in_entry(entry, *file_offset)) {
+ ret = 1;
+ goto out;
+ }
+
+ dec_start = max(*file_offset, entry->file_offset);
+ dec_end = min(*file_offset + io_size, entry->file_offset +
+ entry->len);
+ *file_offset = dec_end;
+ if (dec_start > dec_end) {
+ btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ "bad ordering dec_start %llu end %llu", dec_start, dec_end);
+ }
+ to_dec = dec_end - dec_start;
+ if (to_dec > entry->bytes_left) {
+ btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ "bad ordered accounting left %llu size %llu",
+ entry->bytes_left, to_dec);
+ }
+ entry->bytes_left -= to_dec;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+ if (entry->bytes_left == 0) {
+ ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
+ ret = 1;
+ }
+out:
+ if (!ret && cached && entry) {
+ *cached = entry;
+ atomic_inc(&entry->refs);
+ }
+ spin_unlock_irqrestore(&tree->lock, flags);
+ return ret == 0;
+}
+
+/*
+ * this is used to account for finished IO across a given range
+ * of the file. The IO should not span ordered extents. If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ */
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size, int uptodate)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
+ int ret;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irqsave(&tree->lock, flags);
+ if (cached && *cached) {
+ entry = *cached;
+ goto have_entry;
+ }
+
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ ret = 1;
+ goto out;
+ }
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
+ if (!offset_in_entry(entry, file_offset)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (io_size > entry->bytes_left) {
+ btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ "bad ordered accounting left %llu size %llu",
+ entry->bytes_left, io_size);
+ }
+ entry->bytes_left -= io_size;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+ if (entry->bytes_left == 0) {
+ ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
+ ret = 1;
+ }
+out:
+ if (!ret && cached && entry) {
+ *cached = entry;
+ atomic_inc(&entry->refs);
+ }
+ spin_unlock_irqrestore(&tree->lock, flags);
+ return ret == 0;
+}
+
+/* Needs to either be called under a log transaction or the log_mutex */
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list,
+ const loff_t start,
+ const loff_t end)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_extent *ordered;
+ struct rb_node *n;
+ struct rb_node *prev;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ n = __tree_search(&tree->tree, end, &prev);
+ if (!n)
+ n = prev;
+ for (; n; n = rb_prev(n)) {
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ if (ordered->file_offset > end)
+ continue;
+ if (entry_end(ordered) <= start)
+ break;
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+ list_add(&ordered->log_list, logged_list);
+ atomic_inc(&ordered->refs);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+void btrfs_put_logged_extents(struct list_head *logged_list)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ while (!list_empty(logged_list)) {
+ ordered = list_first_entry(logged_list,
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log)
+{
+ int index = log->log_transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ list_splice_tail(logged_list, &log->logged_list[index]);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, u64 transid)
+{
+ struct btrfs_ordered_extent *ordered;
+ int index = transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ while (!list_empty(&log->logged_list[index])) {
+ ordered = list_first_entry(&log->logged_list[index],
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+
+ if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ struct inode *inode = ordered->inode;
+ u64 start = ordered->file_offset;
+ u64 end = ordered->file_offset + ordered->len - 1;
+
+ WARN_ON(!inode);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ }
+ wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
+ &ordered->flags));
+
+ list_add_tail(&ordered->trans_list, &trans->ordered);
+ spin_lock_irq(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
+{
+ struct btrfs_ordered_extent *ordered;
+ int index = transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ while (!list_empty(&log->logged_list[index])) {
+ ordered = list_first_entry(&log->logged_list[index],
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+ btrfs_put_ordered_extent(ordered);
+ spin_lock_irq(&log->log_extents_lock[index]);
+ }
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+/*
+ * used to drop a reference on an ordered extent. This will free
+ * the extent if the last reference is dropped
+ */
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+ struct list_head *cur;
+ struct btrfs_ordered_sum *sum;
+
+ trace_btrfs_ordered_extent_put(entry->inode, entry);
+
+ if (atomic_dec_and_test(&entry->refs)) {
+ if (entry->inode)
+ btrfs_add_delayed_iput(entry->inode);
+ while (!list_empty(&entry->list)) {
+ cur = entry->list.next;
+ sum = list_entry(cur, struct btrfs_ordered_sum, list);
+ list_del(&sum->list);
+ kfree(sum);
+ }
+ kmem_cache_free(btrfs_ordered_extent_cache, entry);
+ }
+}
+
+/*
+ * remove an ordered extent from the tree. No references are dropped
+ * and waiters are woken up.
+ */
+void btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct rb_node *node;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ node = &entry->rb_node;
+ rb_erase(node, &tree->tree);
+ if (tree->last == node)
+ tree->last = NULL;
+ set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_unlock_irq(&tree->lock);
+
+ spin_lock(&root->ordered_extent_lock);
+ list_del_init(&entry->root_extent_list);
+ root->nr_ordered_extents--;
+
+ trace_btrfs_ordered_extent_remove(inode, entry);
+
+ if (!root->nr_ordered_extents) {
+ spin_lock(&root->fs_info->ordered_root_lock);
+ BUG_ON(list_empty(&root->ordered_root));
+ list_del_init(&root->ordered_root);
+ spin_unlock(&root->fs_info->ordered_root_lock);
+ }
+ spin_unlock(&root->ordered_extent_lock);
+ wake_up(&entry->wait);
+}
+
+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
+ btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ complete(&ordered->completion);
+}
+
+/*
+ * wait for all the ordered extents in a root. This is done when balancing
+ * space between drives.
+ */
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
+{
+ struct list_head splice, works;
+ struct btrfs_ordered_extent *ordered, *next;
+ int count = 0;
+
+ INIT_LIST_HEAD(&splice);
+ INIT_LIST_HEAD(&works);
+
+ mutex_lock(&root->ordered_extent_mutex);
+ spin_lock(&root->ordered_extent_lock);
+ list_splice_init(&root->ordered_extents, &splice);
+ while (!list_empty(&splice) && nr) {
+ ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+ root_extent_list);
+ list_move_tail(&ordered->root_extent_list,
+ &root->ordered_extents);
+ atomic_inc(&ordered->refs);
+ spin_unlock(&root->ordered_extent_lock);
+
+ btrfs_init_work(&ordered->flush_work,
+ btrfs_flush_delalloc_helper,
+ btrfs_run_ordered_extent_work, NULL, NULL);
+ list_add_tail(&ordered->work_list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &ordered->flush_work);
+
+ cond_resched();
+ spin_lock(&root->ordered_extent_lock);
+ if (nr != -1)
+ nr--;
+ count++;
+ }
+ list_splice_tail(&splice, &root->ordered_extents);
+ spin_unlock(&root->ordered_extent_lock);
+
+ list_for_each_entry_safe(ordered, next, &works, work_list) {
+ list_del_init(&ordered->work_list);
+ wait_for_completion(&ordered->completion);
+ btrfs_put_ordered_extent(ordered);
+ cond_resched();
+ }
+ mutex_unlock(&root->ordered_extent_mutex);
+
+ return count;
+}
+
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
+{
+ struct btrfs_root *root;
+ struct list_head splice;
+ int done;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&fs_info->ordered_operations_mutex);
+ spin_lock(&fs_info->ordered_root_lock);
+ list_splice_init(&fs_info->ordered_roots, &splice);
+ while (!list_empty(&splice) && nr) {
+ root = list_first_entry(&splice, struct btrfs_root,
+ ordered_root);
+ root = btrfs_grab_fs_root(root);
+ BUG_ON(!root);
+ list_move_tail(&root->ordered_root,
+ &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
+
+ done = btrfs_wait_ordered_extents(root, nr);
+ btrfs_put_fs_root(root);
+
+ spin_lock(&fs_info->ordered_root_lock);
+ if (nr != -1) {
+ nr -= done;
+ WARN_ON(nr < 0);
+ }
+ }
+ list_splice_tail(&splice, &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
+ mutex_unlock(&fs_info->ordered_operations_mutex);
+}
+
+/*
+ * Used to start IO or wait for a given ordered extent to finish.
+ *
+ * If wait is one, this effectively waits on page writeback for all the pages
+ * in the extent, and it waits on the io completion code to insert
+ * metadata into the btree corresponding to the extent
+ */
+void btrfs_start_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ int wait)
+{
+ u64 start = entry->file_offset;
+ u64 end = start + entry->len - 1;
+
+ trace_btrfs_ordered_extent_start(inode, entry);
+
+ /*
+ * pages in the range can be dirty, clean or writeback. We
+ * start IO on any dirty ones so the wait doesn't stall waiting
+ * for the flusher thread to find them
+ */
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (wait) {
+ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+ &entry->flags));
+ }
+}
+
+/*
+ * Used to wait on ordered extents across a large range of bytes.
+ */
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+{
+ int ret = 0;
+ int ret_wb = 0;
+ u64 end;
+ u64 orig_end;
+ struct btrfs_ordered_extent *ordered;
+
+ if (start + len < start) {
+ orig_end = INT_LIMIT(loff_t);
+ } else {
+ orig_end = start + len - 1;
+ if (orig_end > INT_LIMIT(loff_t))
+ orig_end = INT_LIMIT(loff_t);
+ }
+
+ /* start IO across the range first to instantiate any delalloc
+ * extents
+ */
+ ret = btrfs_fdatawrite_range(inode, start, orig_end);
+ if (ret)
+ return ret;
+
+ /*
+ * If we have a writeback error don't return immediately. Wait first
+ * for any ordered extents that haven't completed yet. This is to make
+ * sure no one can dirty the same page ranges and call writepages()
+ * before the ordered extents complete - to avoid failures (-EEXIST)
+ * when adding the new ordered extents to the ordered tree.
+ */
+ ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+
+ end = orig_end;
+ while (1) {
+ ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ if (!ordered)
+ break;
+ if (ordered->file_offset > orig_end) {
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered->file_offset + ordered->len <= start) {
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ end = ordered->file_offset;
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ ret = -EIO;
+ btrfs_put_ordered_extent(ordered);
+ if (ret || end == 0 || end == start)
+ break;
+ end--;
+ }
+ return ret_wb ? ret_wb : ret;
+}
+
+/*
+ * find an ordered extent corresponding to file_offset. return NULL if
+ * nothing is found, otherwise take a reference on the extent and return it
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+ u64 file_offset)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ node = tree_search(tree, file_offset);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (!offset_in_entry(entry, file_offset))
+ entry = NULL;
+ if (entry)
+ atomic_inc(&entry->refs);
+out:
+ spin_unlock_irq(&tree->lock);
+ return entry;
+}
+
+/* Since the DIO code tries to lock a wide area we need to look for any ordered
+ * extents that exist in the range, rather than just the start of the range.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ node = tree_search(tree, file_offset + len);
+ if (!node)
+ goto out;
+ }
+
+ while (1) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ break;
+
+ if (entry->file_offset >= file_offset + len) {
+ entry = NULL;
+ break;
+ }
+ entry = NULL;
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ if (entry)
+ atomic_inc(&entry->refs);
+ spin_unlock_irq(&tree->lock);
+ return entry;
+}
+
+/*
+ * lookup and return any extent before 'file_offset'. NULL is returned
+ * if none is found
+ */
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ node = tree_search(tree, file_offset);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ atomic_inc(&entry->refs);
+out:
+ spin_unlock_irq(&tree->lock);
+ return entry;
+}
+
+/*
+ * After an extent is done, call this to conditionally update the on disk
+ * i_size. i_size is updated to cover any fully written part of the file.
+ */
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
+ struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ u64 disk_i_size;
+ u64 new_i_size;
+ u64 i_size = i_size_read(inode);
+ struct rb_node *node;
+ struct rb_node *prev = NULL;
+ struct btrfs_ordered_extent *test;
+ int ret = 1;
+
+ spin_lock_irq(&tree->lock);
+ if (ordered) {
+ offset = entry_end(ordered);
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
+ offset = min(offset,
+ ordered->file_offset +
+ ordered->truncated_len);
+ } else {
+ offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
+ }
+ disk_i_size = BTRFS_I(inode)->disk_i_size;
+
+ /* truncate file */
+ if (disk_i_size > i_size) {
+ BTRFS_I(inode)->disk_i_size = i_size;
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * if the disk i_size is already at the inode->i_size, or
+ * this ordered extent is inside the disk i_size, we're done
+ */
+ if (disk_i_size == i_size)
+ goto out;
+
+ /*
+ * We still need to update disk_i_size if outstanding_isize is greater
+ * than disk_i_size.
+ */
+ if (offset <= disk_i_size &&
+ (!ordered || ordered->outstanding_isize <= disk_i_size))
+ goto out;
+
+ /*
+ * walk backward from this ordered extent to disk_i_size.
+ * if we find an ordered extent then we can't update disk i_size
+ * yet
+ */
+ if (ordered) {
+ node = rb_prev(&ordered->rb_node);
+ } else {
+ prev = tree_search(tree, offset);
+ /*
+ * we insert file extents without involving ordered struct,
+ * so there should be no ordered struct cover this offset
+ */
+ if (prev) {
+ test = rb_entry(prev, struct btrfs_ordered_extent,
+ rb_node);
+ BUG_ON(offset_in_entry(test, offset));
+ }
+ node = prev;
+ }
+ for (; node; node = rb_prev(node)) {
+ test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+ /* We treat this entry as if it doesnt exist */
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
+ if (test->file_offset + test->len <= disk_i_size)
+ break;
+ if (test->file_offset >= i_size)
+ break;
+ if (entry_end(test) > disk_i_size) {
+ /*
+ * we don't update disk_i_size now, so record this
+ * undealt i_size. Or we will not know the real
+ * i_size.
+ */
+ if (test->outstanding_isize < offset)
+ test->outstanding_isize = offset;
+ if (ordered &&
+ ordered->outstanding_isize >
+ test->outstanding_isize)
+ test->outstanding_isize =
+ ordered->outstanding_isize;
+ goto out;
+ }
+ }
+ new_i_size = min_t(u64, offset, i_size);
+
+ /*
+ * Some ordered extents may completed before the current one, and
+ * we hold the real i_size in ->outstanding_isize.
+ */
+ if (ordered && ordered->outstanding_isize > new_i_size)
+ new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
+ BTRFS_I(inode)->disk_i_size = new_i_size;
+ ret = 0;
+out:
+ /*
+ * We need to do this because we can't remove ordered extents until
+ * after the i_disk_size has been updated and then the inode has been
+ * updated to reflect the change, so we need to tell anybody who finds
+ * this ordered extent that we've already done all the real work, we
+ * just haven't completed all the other work.
+ */
+ if (ordered)
+ set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+ spin_unlock_irq(&tree->lock);
+ return ret;
+}
+
+/*
+ * search the ordered extents for one corresponding to 'offset' and
+ * try to find a checksum. This is used because we allow pages to
+ * be reclaimed before their checksum is actually put into the btree
+ */
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+ u32 *sum, int len)
+{
+ struct btrfs_ordered_sum *ordered_sum;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ unsigned long num_sectors;
+ unsigned long i;
+ u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
+ int index = 0;
+
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ if (!ordered)
+ return 0;
+
+ spin_lock_irq(&tree->lock);
+ list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
+ if (disk_bytenr >= ordered_sum->bytenr &&
+ disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
+ i = (disk_bytenr - ordered_sum->bytenr) >>
+ inode->i_sb->s_blocksize_bits;
+ num_sectors = ordered_sum->len >>
+ inode->i_sb->s_blocksize_bits;
+ num_sectors = min_t(int, len - index, num_sectors - i);
+ memcpy(sum + index, ordered_sum->sums + i,
+ num_sectors);
+
+ index += (int)num_sectors;
+ if (index == len)
+ goto out;
+ disk_bytenr += num_sectors * sectorsize;
+ }
+ }
+out:
+ spin_unlock_irq(&tree->lock);
+ btrfs_put_ordered_extent(ordered);
+ return index;
+}
+
+int __init ordered_data_init(void)
+{
+ btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+ sizeof(struct btrfs_ordered_extent), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_ordered_extent_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ordered_data_exit(void)
+{
+ if (btrfs_ordered_extent_cache)
+ kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000..e96cd4ccd
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+
+/* one of these per inode */
+struct btrfs_ordered_inode_tree {
+ spinlock_t lock;
+ struct rb_root tree;
+ struct rb_node *last;
+};
+
+struct btrfs_ordered_sum {
+ /* bytenr is the start of this extent on disk */
+ u64 bytenr;
+
+ /*
+ * this is the length in bytes covered by the sums array below.
+ */
+ int len;
+ struct list_head list;
+ /* last field is a variable length array of csums */
+ u32 sums[];
+};
+
+/*
+ * bits for the flags field:
+ *
+ * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
+ * It is used to make sure metadata is inserted into the tree only once
+ * per extent.
+ *
+ * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
+ * rbtree, just before waking any waiters. It is used to indicate the
+ * IO is done and any metadata is inserted into the tree.
+ */
+#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+
+#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
+
+#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
+
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
+
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+
+#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+
+#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
+
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
+ * has done its due diligence in updating
+ * the isize. */
+#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered
+ ordered extent */
+#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
+
+#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
+ * in the logging code. */
+struct btrfs_ordered_extent {
+ /* logical offset in the file */
+ u64 file_offset;
+
+ /* disk byte number */
+ u64 start;
+
+ /* ram length of the extent in bytes */
+ u64 len;
+
+ /* extent length on disk */
+ u64 disk_len;
+
+ /* number of bytes that still need writing */
+ u64 bytes_left;
+
+ /* number of bytes that still need csumming */
+ u64 csum_bytes_left;
+
+ /*
+ * the end of the ordered extent which is behind it but
+ * didn't update disk_i_size. Please see the comment of
+ * btrfs_ordered_update_i_size();
+ */
+ u64 outstanding_isize;
+
+ /*
+ * If we get truncated we need to adjust the file extent we enter for
+ * this ordered extent so that we do not expose stale data.
+ */
+ u64 truncated_len;
+
+ /* flags (described above) */
+ unsigned long flags;
+
+ /* compression algorithm */
+ int compress_type;
+
+ /* reference count */
+ atomic_t refs;
+
+ /* the inode we belong to */
+ struct inode *inode;
+
+ /* list of checksums for insertion when the extent io is done */
+ struct list_head list;
+
+ /* If we need to wait on this to be done */
+ struct list_head log_list;
+
+ /* If the transaction needs to wait on this ordered extent */
+ struct list_head trans_list;
+
+ /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
+ wait_queue_head_t wait;
+
+ /* our friendly rbtree entry */
+ struct rb_node rb_node;
+
+ /* a per root list of all the pending ordered extents */
+ struct list_head root_extent_list;
+
+ struct btrfs_work work;
+
+ struct completion completion;
+ struct btrfs_work flush_work;
+ struct list_head work_list;
+};
+
+/*
+ * calculates the total size you need to allocate for an ordered sum
+ * structure spanning 'bytes' in the file
+ */
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
+ unsigned long bytes)
+{
+ int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
+}
+
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+ spin_lock_init(&t->lock);
+ t->tree = RB_ROOT;
+ t->last = NULL;
+}
+
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry);
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size, int uptodate);
+int btrfs_dec_test_first_ordered_pending(struct inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 *file_offset, u64 io_size,
+ int uptodate);
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len,
+ int type, int compress_type);
+void btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+ u64 file_offset);
+void btrfs_start_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry, int wait);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
+ struct btrfs_ordered_extent *ordered);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+ u32 *sum, int len);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list,
+ const loff_t start,
+ const loff_t end);
+void btrfs_put_logged_extents(struct list_head *logged_list);
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log);
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, u64 transid);
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
+int __init ordered_data_init(void);
+void ordered_data_exit(void);
+#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000..47767d5b8
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2008 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret) { /* JDM: Really? */
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000..647ab12fd
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
+{
+ int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
+ int i;
+ printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
+ "num_stripes %d\n",
+ btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk),
+ btrfs_chunk_type(eb, chunk), num_stripes);
+ for (i = 0 ; i < num_stripes ; i++) {
+ printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
+ btrfs_stripe_devid_nr(eb, chunk, i),
+ btrfs_stripe_offset_nr(eb, chunk, i));
+ }
+}
+static void print_dev_item(struct extent_buffer *eb,
+ struct btrfs_dev_item *dev_item)
+{
+ printk(KERN_INFO "\t\tdev item devid %llu "
+ "total_bytes %llu bytes used %llu\n",
+ btrfs_device_id(eb, dev_item),
+ btrfs_device_total_bytes(eb, dev_item),
+ btrfs_device_bytes_used(eb, dev_item));
+}
+static void print_extent_data_ref(struct extent_buffer *eb,
+ struct btrfs_extent_data_ref *ref)
+{
+ printk(KERN_INFO "\t\textent data backref root %llu "
+ "objectid %llu offset %llu count %u\n",
+ btrfs_extent_data_ref_root(eb, ref),
+ btrfs_extent_data_ref_objectid(eb, ref),
+ btrfs_extent_data_ref_offset(eb, ref),
+ btrfs_extent_data_ref_count(eb, ref));
+}
+
+static void print_extent_item(struct extent_buffer *eb, int slot, int type)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ struct btrfs_disk_key key;
+ unsigned long end;
+ unsigned long ptr;
+ u32 item_size = btrfs_item_size_nr(eb, slot);
+ u64 flags;
+ u64 offset;
+
+ if (item_size < sizeof(*ei)) {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ struct btrfs_extent_item_v0 *ei0;
+ BUG_ON(item_size != sizeof(*ei0));
+ ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
+ printk(KERN_INFO "\t\textent refs %u\n",
+ btrfs_extent_refs_v0(eb, ei0));
+ return;
+#else
+ BUG();
+#endif
+ }
+
+ ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(eb, ei);
+
+ printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
+ btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
+ flags);
+
+ if ((type == BTRFS_EXTENT_ITEM_KEY) &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ btrfs_tree_block_key(eb, info, &key);
+ printk(KERN_INFO "\t\ttree block key (%llu %u %llu) "
+ "level %d\n",
+ btrfs_disk_key_objectid(&key), key.type,
+ btrfs_disk_key_offset(&key),
+ btrfs_tree_block_level(eb, info));
+ iref = (struct btrfs_extent_inline_ref *)(info + 1);
+ } else {
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ }
+
+ ptr = (unsigned long)iref;
+ end = (unsigned long)ei + item_size;
+ while (ptr < end) {
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(eb, iref);
+ offset = btrfs_extent_inline_ref_offset(eb, iref);
+ switch (type) {
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ printk(KERN_INFO "\t\ttree block backref "
+ "root %llu\n", offset);
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ printk(KERN_INFO "\t\tshared block backref "
+ "parent %llu\n", offset);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ print_extent_data_ref(eb, dref);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = (struct btrfs_shared_data_ref *)(iref + 1);
+ printk(KERN_INFO "\t\tshared data backref "
+ "parent %llu count %u\n",
+ offset, btrfs_shared_data_ref_count(eb, sref));
+ break;
+ default:
+ BUG();
+ }
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+ WARN_ON(ptr > end);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
+{
+ struct btrfs_extent_ref_v0 *ref0;
+
+ ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
+ printk("\t\textent back ref root %llu gen %llu "
+ "owner %llu num_refs %lu\n",
+ btrfs_ref_root_v0(eb, ref0),
+ btrfs_ref_generation_v0(eb, ref0),
+ btrfs_ref_objectid_v0(eb, ref0),
+ (unsigned long)btrfs_ref_count_v0(eb, ref0));
+}
+#endif
+
+static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
+ u32 item_size)
+{
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ pr_warn("BTRFS: uuid item with illegal size %lu!\n",
+ (unsigned long)item_size);
+ return;
+ }
+ while (item_size) {
+ __le64 subvol_id;
+
+ read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id));
+ printk(KERN_INFO "\t\tsubvol_id %llu\n",
+ (unsigned long long)le64_to_cpu(subvol_id));
+ item_size -= sizeof(u64);
+ offset += sizeof(u64);
+ }
+}
+
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
+{
+ int i;
+ u32 type, nr;
+ struct btrfs_item *item;
+ struct btrfs_root_item *ri;
+ struct btrfs_dir_item *di;
+ struct btrfs_inode_item *ii;
+ struct btrfs_block_group_item *bi;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_shared_data_ref *sref;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+
+ if (!l)
+ return;
+
+ nr = btrfs_header_nritems(l);
+
+ btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
+ btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
+ for (i = 0 ; i < nr ; i++) {
+ item = btrfs_item_nr(i);
+ btrfs_item_key_to_cpu(l, &key, i);
+ type = key.type;
+ printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
+ "itemsize %d\n",
+ i, key.objectid, type, key.offset,
+ btrfs_item_offset(l, item), btrfs_item_size(l, item));
+ switch (type) {
+ case BTRFS_INODE_ITEM_KEY:
+ ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
+ printk(KERN_INFO "\t\tinode generation %llu size %llu "
+ "mode %o\n",
+ btrfs_inode_generation(l, ii),
+ btrfs_inode_size(l, ii),
+ btrfs_inode_mode(l, ii));
+ break;
+ case BTRFS_DIR_ITEM_KEY:
+ di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(l, di, &found_key);
+ printk(KERN_INFO "\t\tdir oid %llu type %u\n",
+ found_key.objectid,
+ btrfs_dir_type(l, di));
+ break;
+ case BTRFS_ROOT_ITEM_KEY:
+ ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
+ printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
+ btrfs_disk_root_bytenr(l, ri),
+ btrfs_disk_root_refs(l, ri));
+ break;
+ case BTRFS_EXTENT_ITEM_KEY:
+ case BTRFS_METADATA_ITEM_KEY:
+ print_extent_item(l, i, type);
+ break;
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ printk(KERN_INFO "\t\ttree block backref\n");
+ break;
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ printk(KERN_INFO "\t\tshared block backref\n");
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY:
+ dref = btrfs_item_ptr(l, i,
+ struct btrfs_extent_data_ref);
+ print_extent_data_ref(l, dref);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY:
+ sref = btrfs_item_ptr(l, i,
+ struct btrfs_shared_data_ref);
+ printk(KERN_INFO "\t\tshared data backref count %u\n",
+ btrfs_shared_data_ref_count(l, sref));
+ break;
+ case BTRFS_EXTENT_DATA_KEY:
+ fi = btrfs_item_ptr(l, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(l, fi) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ printk(KERN_INFO "\t\tinline extent data "
+ "size %u\n",
+ btrfs_file_extent_inline_len(l, i, fi));
+ break;
+ }
+ printk(KERN_INFO "\t\textent data disk bytenr %llu "
+ "nr %llu\n",
+ btrfs_file_extent_disk_bytenr(l, fi),
+ btrfs_file_extent_disk_num_bytes(l, fi));
+ printk(KERN_INFO "\t\textent data offset %llu "
+ "nr %llu ram %llu\n",
+ btrfs_file_extent_offset(l, fi),
+ btrfs_file_extent_num_bytes(l, fi),
+ btrfs_file_extent_ram_bytes(l, fi));
+ break;
+ case BTRFS_EXTENT_REF_V0_KEY:
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ print_extent_ref_v0(l, i);
+#else
+ BUG();
+#endif
+ break;
+ case BTRFS_BLOCK_GROUP_ITEM_KEY:
+ bi = btrfs_item_ptr(l, i,
+ struct btrfs_block_group_item);
+ printk(KERN_INFO "\t\tblock group used %llu\n",
+ btrfs_disk_block_group_used(l, bi));
+ break;
+ case BTRFS_CHUNK_ITEM_KEY:
+ print_chunk(l, btrfs_item_ptr(l, i,
+ struct btrfs_chunk));
+ break;
+ case BTRFS_DEV_ITEM_KEY:
+ print_dev_item(l, btrfs_item_ptr(l, i,
+ struct btrfs_dev_item));
+ break;
+ case BTRFS_DEV_EXTENT_KEY:
+ dev_extent = btrfs_item_ptr(l, i,
+ struct btrfs_dev_extent);
+ printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
+ "\t\tchunk objectid %llu chunk offset %llu "
+ "length %llu\n",
+ btrfs_dev_extent_chunk_tree(l, dev_extent),
+ btrfs_dev_extent_chunk_objectid(l, dev_extent),
+ btrfs_dev_extent_chunk_offset(l, dev_extent),
+ btrfs_dev_extent_length(l, dev_extent));
+ break;
+ case BTRFS_DEV_STATS_KEY:
+ printk(KERN_INFO "\t\tdevice stats\n");
+ break;
+ case BTRFS_DEV_REPLACE_KEY:
+ printk(KERN_INFO "\t\tdev replace\n");
+ break;
+ case BTRFS_UUID_KEY_SUBVOL:
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ print_uuid_item(l, btrfs_item_ptr_offset(l, i),
+ btrfs_item_size_nr(l, i));
+ break;
+ };
+ }
+}
+
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
+{
+ int i; u32 nr;
+ struct btrfs_key key;
+ int level;
+
+ if (!c)
+ return;
+ nr = btrfs_header_nritems(c);
+ level = btrfs_header_level(c);
+ if (level == 0) {
+ btrfs_print_leaf(root, c);
+ return;
+ }
+ btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u",
+ btrfs_header_bytenr(c), level, nr,
+ (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+ for (i = 0; i < nr; i++) {
+ btrfs_node_key_to_cpu(c, &key, i);
+ printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
+ i, key.objectid, key.type, key.offset,
+ btrfs_node_blockptr(c, i));
+ }
+ for (i = 0; i < nr; i++) {
+ struct extent_buffer *next = read_tree_block(root,
+ btrfs_node_blockptr(c, i),
+ btrfs_node_ptr_generation(c, i));
+ if (btrfs_is_leaf(next) &&
+ level != 1)
+ BUG();
+ if (btrfs_header_level(next) !=
+ level - 1)
+ BUG();
+ btrfs_print_tree(root, next);
+ free_extent_buffer(next);
+ }
+}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000..7faddfacc
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __PRINT_TREE_
+#define __PRINT_TREE_
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c);
+#endif
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
new file mode 100644
index 000000000..dca137b04
--- /dev/null
+++ b/fs/btrfs/props.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/hashtable.h>
+#include "props.h"
+#include "btrfs_inode.h"
+#include "hash.h"
+#include "transaction.h"
+#include "xattr.h"
+
+#define BTRFS_PROP_HANDLERS_HT_BITS 8
+static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
+
+struct prop_handler {
+ struct hlist_node node;
+ const char *xattr_name;
+ int (*validate)(const char *value, size_t len);
+ int (*apply)(struct inode *inode, const char *value, size_t len);
+ const char *(*extract)(struct inode *inode);
+ int inheritable;
+};
+
+static int prop_compression_validate(const char *value, size_t len);
+static int prop_compression_apply(struct inode *inode,
+ const char *value,
+ size_t len);
+static const char *prop_compression_extract(struct inode *inode);
+
+static struct prop_handler prop_handlers[] = {
+ {
+ .xattr_name = XATTR_BTRFS_PREFIX "compression",
+ .validate = prop_compression_validate,
+ .apply = prop_compression_apply,
+ .extract = prop_compression_extract,
+ .inheritable = 1
+ },
+ {
+ .xattr_name = NULL
+ }
+};
+
+void __init btrfs_props_init(void)
+{
+ struct prop_handler *p;
+
+ hash_init(prop_handlers_ht);
+
+ for (p = &prop_handlers[0]; p->xattr_name; p++) {
+ u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
+
+ hash_add(prop_handlers_ht, &p->node, h);
+ }
+}
+
+static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash)
+{
+ struct hlist_head *h;
+
+ h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)];
+ if (hlist_empty(h))
+ return NULL;
+
+ return h;
+}
+
+static const struct prop_handler *
+find_prop_handler(const char *name,
+ const struct hlist_head *handlers)
+{
+ struct prop_handler *h;
+
+ if (!handlers) {
+ u64 hash = btrfs_name_hash(name, strlen(name));
+
+ handlers = find_prop_handlers_by_hash(hash);
+ if (!handlers)
+ return NULL;
+ }
+
+ hlist_for_each_entry(h, handlers, node)
+ if (!strcmp(h->xattr_name, name))
+ return h;
+
+ return NULL;
+}
+
+static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ const char *name,
+ const char *value,
+ size_t value_len,
+ int flags)
+{
+ const struct prop_handler *handler;
+ int ret;
+
+ if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN)
+ return -EINVAL;
+
+ handler = find_prop_handler(name, NULL);
+ if (!handler)
+ return -EINVAL;
+
+ if (value_len == 0) {
+ ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ NULL, 0, flags);
+ if (ret)
+ return ret;
+
+ ret = handler->apply(inode, NULL, 0);
+ ASSERT(ret == 0);
+
+ return ret;
+ }
+
+ ret = handler->validate(value, value_len);
+ if (ret)
+ return ret;
+ ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ value, value_len, flags);
+ if (ret)
+ return ret;
+ ret = handler->apply(inode, value, value_len);
+ if (ret) {
+ __btrfs_setxattr(trans, inode, handler->xattr_name,
+ NULL, 0, flags);
+ return ret;
+ }
+
+ set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+
+ return 0;
+}
+
+int btrfs_set_prop(struct inode *inode,
+ const char *name,
+ const char *value,
+ size_t value_len,
+ int flags)
+{
+ return __btrfs_set_prop(NULL, inode, name, value, value_len, flags);
+}
+
+static int iterate_object_props(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid,
+ void (*iterator)(void *,
+ const struct prop_handler *,
+ const char *,
+ size_t),
+ void *ctx)
+{
+ int ret;
+ char *name_buf = NULL;
+ char *value_buf = NULL;
+ int name_buf_len = 0;
+ int value_buf_len = 0;
+
+ while (1) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct extent_buffer *leaf;
+ u32 total_len, cur, this_len;
+ int slot;
+ const struct hlist_head *handlers;
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid)
+ break;
+ if (key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ handlers = find_prop_handlers_by_hash(key.offset);
+ if (!handlers)
+ goto next_slot;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ cur = 0;
+ total_len = btrfs_item_size_nr(leaf, slot);
+
+ while (cur < total_len) {
+ u32 name_len = btrfs_dir_name_len(leaf, di);
+ u32 data_len = btrfs_dir_data_len(leaf, di);
+ unsigned long name_ptr, data_ptr;
+ const struct prop_handler *handler;
+
+ this_len = sizeof(*di) + name_len + data_len;
+ name_ptr = (unsigned long)(di + 1);
+ data_ptr = name_ptr + name_len;
+
+ if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
+ memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
+ name_ptr,
+ XATTR_BTRFS_PREFIX_LEN))
+ goto next_dir_item;
+
+ if (name_len >= name_buf_len) {
+ kfree(name_buf);
+ name_buf_len = name_len + 1;
+ name_buf = kmalloc(name_buf_len, GFP_NOFS);
+ if (!name_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ read_extent_buffer(leaf, name_buf, name_ptr, name_len);
+ name_buf[name_len] = '\0';
+
+ handler = find_prop_handler(name_buf, handlers);
+ if (!handler)
+ goto next_dir_item;
+
+ if (data_len > value_buf_len) {
+ kfree(value_buf);
+ value_buf_len = data_len;
+ value_buf = kmalloc(data_len, GFP_NOFS);
+ if (!value_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ read_extent_buffer(leaf, value_buf, data_ptr, data_len);
+
+ iterator(ctx, handler, value_buf, data_len);
+next_dir_item:
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *) di + this_len);
+ }
+
+next_slot:
+ path->slots[0]++;
+ }
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ kfree(name_buf);
+ kfree(value_buf);
+
+ return ret;
+}
+
+static void inode_prop_iterator(void *ctx,
+ const struct prop_handler *handler,
+ const char *value,
+ size_t len)
+{
+ struct inode *inode = ctx;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ ret = handler->apply(inode, value, len);
+ if (unlikely(ret))
+ btrfs_warn(root->fs_info,
+ "error applying prop %s to ino %llu (root %llu): %d",
+ handler->xattr_name, btrfs_ino(inode),
+ root->root_key.objectid, ret);
+ else
+ set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+}
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
+
+ return ret;
+}
+
+static int inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *parent)
+{
+ const struct prop_handler *h;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (!test_bit(BTRFS_INODE_HAS_PROPS,
+ &BTRFS_I(parent)->runtime_flags))
+ return 0;
+
+ for (h = &prop_handlers[0]; h->xattr_name; h++) {
+ const char *value;
+ u64 num_bytes;
+
+ if (!h->inheritable)
+ continue;
+
+ value = h->extract(parent);
+ if (!value)
+ continue;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ ret = btrfs_block_rsv_add(root, trans->block_rsv,
+ num_bytes, BTRFS_RESERVE_NO_FLUSH);
+ if (ret)
+ goto out;
+ ret = __btrfs_set_prop(trans, inode, h->xattr_name,
+ value, strlen(value), 0);
+ btrfs_block_rsv_release(root, trans->block_rsv, num_bytes);
+ if (ret)
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *dir)
+{
+ if (!dir)
+ return 0;
+
+ return inherit_props(trans, inode, dir);
+}
+
+int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *parent_root)
+{
+ struct btrfs_key key;
+ struct inode *parent_inode, *child_inode;
+ int ret;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ parent_inode = btrfs_iget(parent_root->fs_info->sb, &key,
+ parent_root, NULL);
+ if (IS_ERR(parent_inode))
+ return PTR_ERR(parent_inode);
+
+ child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ if (IS_ERR(child_inode)) {
+ iput(parent_inode);
+ return PTR_ERR(child_inode);
+ }
+
+ ret = inherit_props(trans, child_inode, parent_inode);
+ iput(child_inode);
+ iput(parent_inode);
+
+ return ret;
+}
+
+static int prop_compression_validate(const char *value, size_t len)
+{
+ if (!strncmp("lzo", value, len))
+ return 0;
+ else if (!strncmp("zlib", value, len))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int prop_compression_apply(struct inode *inode,
+ const char *value,
+ size_t len)
+{
+ int type;
+
+ if (len == 0) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+
+ return 0;
+ }
+
+ if (!strncmp("lzo", value, len))
+ type = BTRFS_COMPRESS_LZO;
+ else if (!strncmp("zlib", value, len))
+ type = BTRFS_COMPRESS_ZLIB;
+ else
+ return -EINVAL;
+
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->force_compress = type;
+
+ return 0;
+}
+
+static const char *prop_compression_extract(struct inode *inode)
+{
+ switch (BTRFS_I(inode)->force_compress) {
+ case BTRFS_COMPRESS_ZLIB:
+ return "zlib";
+ case BTRFS_COMPRESS_LZO:
+ return "lzo";
+ }
+
+ return NULL;
+}
+
+
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
new file mode 100644
index 000000000..100f18829
--- /dev/null
+++ b/fs/btrfs/props.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_PROPS_H
+#define __BTRFS_PROPS_H
+
+#include "ctree.h"
+
+void __init btrfs_props_init(void);
+
+int btrfs_set_prop(struct inode *inode,
+ const char *name,
+ const char *value,
+ size_t value_len,
+ int flags);
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *dir);
+
+int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *parent_root);
+
+#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
new file mode 100644
index 000000000..3d6546581
--- /dev/null
+++ b/fs/btrfs/qgroup.c
@@ -0,0 +1,2966 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/btrfs.h>
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "ulist.h"
+#include "backref.h"
+#include "extent_io.h"
+#include "qgroup.h"
+
+/* TODO XXX FIXME
+ * - subvol delete -> delete when ref goes to 0? delete limits also?
+ * - reorganize keys
+ * - compressed
+ * - sync
+ * - copy also limits on subvol creation
+ * - limit
+ * - caches fuer ulists
+ * - performance benchmarks
+ * - check all ioctl parameters
+ */
+
+/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+ u64 qgroupid;
+
+ /*
+ * state
+ */
+ u64 rfer; /* referenced */
+ u64 rfer_cmpr; /* referenced compressed */
+ u64 excl; /* exclusive */
+ u64 excl_cmpr; /* exclusive compressed */
+
+ /*
+ * limits
+ */
+ u64 lim_flags; /* which limits are set */
+ u64 max_rfer;
+ u64 max_excl;
+ u64 rsv_rfer;
+ u64 rsv_excl;
+
+ /*
+ * reservation tracking
+ */
+ u64 reserved;
+
+ /*
+ * lists
+ */
+ struct list_head groups; /* groups this group is member of */
+ struct list_head members; /* groups that are members of this group */
+ struct list_head dirty; /* dirty groups */
+ struct rb_node node; /* tree of qgroups */
+
+ /*
+ * temp variables for accounting operations
+ */
+ u64 old_refcnt;
+ u64 new_refcnt;
+};
+
+/*
+ * glue structure to represent the relations between qgroups.
+ */
+struct btrfs_qgroup_list {
+ struct list_head next_group;
+ struct list_head next_member;
+ struct btrfs_qgroup *group;
+ struct btrfs_qgroup *member;
+};
+
+#define ptr_to_u64(x) ((u64)(uintptr_t)x)
+#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
+
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags);
+static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+
+/* must be called with qgroup_ioctl_lock held */
+static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+ u64 qgroupid)
+{
+ struct rb_node *n = fs_info->qgroup_tree.rb_node;
+ struct btrfs_qgroup *qgroup;
+
+ while (n) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ if (qgroup->qgroupid < qgroupid)
+ n = n->rb_left;
+ else if (qgroup->qgroupid > qgroupid)
+ n = n->rb_right;
+ else
+ return qgroup;
+ }
+ return NULL;
+}
+
+/* must be called with qgroup_lock held */
+static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+ u64 qgroupid)
+{
+ struct rb_node **p = &fs_info->qgroup_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_qgroup *qgroup;
+
+ while (*p) {
+ parent = *p;
+ qgroup = rb_entry(parent, struct btrfs_qgroup, node);
+
+ if (qgroup->qgroupid < qgroupid)
+ p = &(*p)->rb_left;
+ else if (qgroup->qgroupid > qgroupid)
+ p = &(*p)->rb_right;
+ else
+ return qgroup;
+ }
+
+ qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
+ if (!qgroup)
+ return ERR_PTR(-ENOMEM);
+
+ qgroup->qgroupid = qgroupid;
+ INIT_LIST_HEAD(&qgroup->groups);
+ INIT_LIST_HEAD(&qgroup->members);
+ INIT_LIST_HEAD(&qgroup->dirty);
+
+ rb_link_node(&qgroup->node, parent, p);
+ rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
+
+ return qgroup;
+}
+
+static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
+{
+ struct btrfs_qgroup_list *list;
+
+ list_del(&qgroup->dirty);
+ while (!list_empty(&qgroup->groups)) {
+ list = list_first_entry(&qgroup->groups,
+ struct btrfs_qgroup_list, next_group);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+
+ while (!list_empty(&qgroup->members)) {
+ list = list_first_entry(&qgroup->members,
+ struct btrfs_qgroup_list, next_member);
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ }
+ kfree(qgroup);
+}
+
+/* must be called with qgroup_lock held */
+static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+ struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
+
+ if (!qgroup)
+ return -ENOENT;
+
+ rb_erase(&qgroup->node, &fs_info->qgroup_tree);
+ __del_qgroup_rb(qgroup);
+ return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+ u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+ if (!member || !parent)
+ return -ENOENT;
+
+ list = kzalloc(sizeof(*list), GFP_ATOMIC);
+ if (!list)
+ return -ENOMEM;
+
+ list->group = parent;
+ list->member = member;
+ list_add_tail(&list->next_group, &member->groups);
+ list_add_tail(&list->next_member, &parent->members);
+
+ return 0;
+}
+
+/* must be called with qgroup_lock held */
+static int del_relation_rb(struct btrfs_fs_info *fs_info,
+ u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup_list *list;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+ if (!member || !parent)
+ return -ENOENT;
+
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent) {
+ list_del(&list->next_group);
+ list_del(&list->next_member);
+ kfree(list);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl)
+{
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup)
+ return -EINVAL;
+ if (qgroup->rfer != rfer || qgroup->excl != excl)
+ return -EINVAL;
+ return 0;
+}
+#endif
+
+/*
+ * The full config is read in one go, only called from open_ctree()
+ * It doesn't use any locking, as at this point we're still single-threaded
+ */
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *l;
+ int slot;
+ int ret = 0;
+ u64 flags = 0;
+ u64 rescan_progress = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* default this to quota off, in case no status key is found */
+ fs_info->qgroup_flags = 0;
+
+ /*
+ * pass 1: read status, all qgroup infos and limits
+ */
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+ ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
+ if (ret)
+ goto out;
+
+ while (1) {
+ struct btrfs_qgroup *qgroup;
+
+ slot = path->slots[0];
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
+ struct btrfs_qgroup_status_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_status_item);
+
+ if (btrfs_qgroup_status_version(l, ptr) !=
+ BTRFS_QGROUP_STATUS_VERSION) {
+ btrfs_err(fs_info,
+ "old qgroup version, quota disabled");
+ goto out;
+ }
+ if (btrfs_qgroup_status_generation(l, ptr) !=
+ fs_info->generation) {
+ flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_err(fs_info,
+ "qgroup generation mismatch, "
+ "marked as inconsistent");
+ }
+ fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
+ ptr);
+ rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
+ goto next1;
+ }
+
+ if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
+ found_key.type != BTRFS_QGROUP_LIMIT_KEY)
+ goto next1;
+
+ qgroup = find_qgroup_rb(fs_info, found_key.offset);
+ if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
+ (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
+ btrfs_err(fs_info, "inconsitent qgroup config");
+ flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ if (!qgroup) {
+ qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out;
+ }
+ }
+ switch (found_key.type) {
+ case BTRFS_QGROUP_INFO_KEY: {
+ struct btrfs_qgroup_info_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_info_item);
+ qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
+ qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
+ qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
+ qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
+ /* generation currently unused */
+ break;
+ }
+ case BTRFS_QGROUP_LIMIT_KEY: {
+ struct btrfs_qgroup_limit_item *ptr;
+
+ ptr = btrfs_item_ptr(l, slot,
+ struct btrfs_qgroup_limit_item);
+ qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
+ qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
+ qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
+ qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
+ qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
+ break;
+ }
+ }
+next1:
+ ret = btrfs_next_item(quota_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * pass 2: read all qgroup relations
+ */
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
+ if (ret)
+ goto out;
+ while (1) {
+ slot = path->slots[0];
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
+ goto next2;
+
+ if (found_key.objectid > found_key.offset) {
+ /* parent <- member, not needed to build config */
+ /* FIXME should we omit the key completely? */
+ goto next2;
+ }
+
+ ret = add_relation_rb(fs_info, found_key.objectid,
+ found_key.offset);
+ if (ret == -ENOENT) {
+ btrfs_warn(fs_info,
+ "orphan qgroup relation 0x%llx->0x%llx",
+ found_key.objectid, found_key.offset);
+ ret = 0; /* ignore the error */
+ }
+ if (ret)
+ goto out;
+next2:
+ ret = btrfs_next_item(quota_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+ }
+out:
+ fs_info->qgroup_flags |= flags;
+ if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+ ret >= 0) {
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+ }
+ btrfs_free_path(path);
+
+ if (ret < 0) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ }
+
+ return ret < 0 ? ret : 0;
+}
+
+/*
+ * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
+ * first two are in single-threaded paths.And for the third one, we have set
+ * quota_root to be null with qgroup_lock held before, so it is safe to clean
+ * up the in-memory structures without qgroup_lock held.
+ */
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+
+ while ((n = rb_first(&fs_info->qgroup_tree))) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ rb_erase(n, &fs_info->qgroup_tree);
+ __del_qgroup_rb(qgroup);
+ }
+ /*
+ * we call btrfs_free_qgroup_config() when umounting
+ * filesystem and disabling quota, so we set qgroup_ulit
+ * to be null here to avoid double free.
+ */
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+}
+
+static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root,
+ u64 src, u64 dst)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = src;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = dst;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root,
+ u64 src, u64 dst)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = src;
+ key.type = BTRFS_QGROUP_RELATION_KEY;
+ key.offset = dst;
+
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int add_qgroup_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root, u64 qgroupid)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_qgroup_info_item *qgroup_info;
+ struct btrfs_qgroup_limit_item *qgroup_limit;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ if (btrfs_test_is_dummy_root(quota_root))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroupid;
+
+ /*
+ * Avoid a transaction abort by catching -EEXIST here. In that
+ * case, we proceed by re-initializing the existing structure
+ * on disk.
+ */
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*qgroup_info));
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ leaf = path->nodes[0];
+ qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_info_item);
+ btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
+ btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
+ btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*qgroup_limit));
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ leaf = path->nodes[0];
+ qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_limit_item);
+ btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
+ btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int del_qgroup_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *quota_root, u64 qgroupid)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroupid;
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, quota_root, path);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_qgroup *qgroup)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_limit_item *qgroup_limit;
+ int ret;
+ int slot;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_LIMIT_KEY;
+ key.offset = qgroup->qgroupid;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
+ btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
+ btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
+ btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
+ btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
+ btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_qgroup *qgroup)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_info_item *qgroup_info;
+ int ret;
+ int slot;
+
+ if (btrfs_test_is_dummy_root(root))
+ return 0;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = qgroup->qgroupid;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
+ btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
+ btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
+ btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
+ btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
+ btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_qgroup_status_item *ptr;
+ int ret;
+ int slot;
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_STATUS_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+
+ if (ret)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
+ btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
+ btrfs_set_qgroup_status_rescan(l, ptr,
+ fs_info->qgroup_rescan_progress.objectid);
+
+ btrfs_mark_buffer_dirty(l);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called with qgroup_lock held
+ */
+static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf = NULL;
+ int ret;
+ int nr = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ key.objectid = 0;
+ key.offset = 0;
+ key.type = 0;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ leaf = path->nodes[0];
+ nr = btrfs_header_nritems(leaf);
+ if (!nr)
+ break;
+ /*
+ * delete the leaf one by one
+ * since the whole tree is going
+ * to be deleted.
+ */
+ path->slots[0] = 0;
+ ret = btrfs_del_items(trans, root, path, 0, nr);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ }
+ ret = 0;
+out:
+ root->fs_info->pending_quota_state = 0;
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_path *path = NULL;
+ struct btrfs_qgroup_status_item *ptr;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_qgroup *qgroup = NULL;
+ int ret = 0;
+ int slot;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (fs_info->quota_root) {
+ fs_info->pending_quota_state = 1;
+ goto out;
+ }
+
+ fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+ if (!fs_info->qgroup_ulist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * initially create the quota tree
+ */
+ quota_root = btrfs_create_tree(trans, fs_info,
+ BTRFS_QUOTA_TREE_OBJECTID);
+ if (IS_ERR(quota_root)) {
+ ret = PTR_ERR(quota_root);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out_free_root;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_STATUS_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
+ sizeof(*ptr));
+ if (ret)
+ goto out_free_path;
+
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_qgroup_status_item);
+ btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
+ btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
+ fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ key.objectid = 0;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = 0;
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
+ if (ret > 0)
+ goto out_add_root;
+ if (ret < 0)
+ goto out_free_path;
+
+
+ while (1) {
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.type == BTRFS_ROOT_REF_KEY) {
+ ret = add_qgroup_item(trans, quota_root,
+ found_key.offset);
+ if (ret)
+ goto out_free_path;
+
+ qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out_free_path;
+ }
+ }
+ ret = btrfs_next_item(tree_root, path);
+ if (ret < 0)
+ goto out_free_path;
+ if (ret)
+ break;
+ }
+
+out_add_root:
+ btrfs_release_path(path);
+ ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
+ if (ret)
+ goto out_free_path;
+
+ qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
+ if (IS_ERR(qgroup)) {
+ ret = PTR_ERR(qgroup);
+ goto out_free_path;
+ }
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->quota_root = quota_root;
+ fs_info->pending_quota_state = 1;
+ spin_unlock(&fs_info->qgroup_lock);
+out_free_path:
+ btrfs_free_path(path);
+out_free_root:
+ if (ret) {
+ free_extent_buffer(quota_root->node);
+ free_extent_buffer(quota_root->commit_root);
+ kfree(quota_root);
+ }
+out:
+ if (ret) {
+ ulist_free(fs_info->qgroup_ulist);
+ fs_info->qgroup_ulist = NULL;
+ }
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *quota_root;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root)
+ goto out;
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ quota_root = fs_info->quota_root;
+ fs_info->quota_root = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ btrfs_free_qgroup_config(fs_info);
+
+ ret = btrfs_clean_quota_tree(trans, quota_root);
+ if (ret)
+ goto out;
+
+ ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
+ if (ret)
+ goto out;
+
+ list_del(&quota_root->dirty_list);
+
+ btrfs_tree_lock(quota_root->node);
+ clean_tree_block(trans, tree_root->fs_info, quota_root->node);
+ btrfs_tree_unlock(quota_root->node);
+ btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
+
+ free_extent_buffer(quota_root->node);
+ free_extent_buffer(quota_root->commit_root);
+ kfree(quota_root);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ if (list_empty(&qgroup->dirty))
+ list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
+}
+
+/*
+ * The easy accounting, if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their refrence and
+ * exclusive counts adjusted.
+ *
+ * Caller should hold fs_info->qgroup_lock.
+ */
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+ struct ulist *tmp, u64 ref_root,
+ u64 num_bytes, int sign)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup_list *glist;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret = 0;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ qgroup->rfer += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes;
+
+ WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ qgroup->excl += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+ if (sign > 0)
+ qgroup->reserved -= num_bytes;
+
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Get all of the parent groups that contain this qgroup */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+
+ /* Iterate all of the parents and adjust their reference counts */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ qgroup = u64_to_ptr(unode->aux);
+ qgroup->rfer += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes;
+ WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ qgroup->excl += sign * num_bytes;
+ if (sign > 0)
+ qgroup->reserved -= num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+
+/*
+ * Quick path for updating qgroup with only excl refs.
+ *
+ * In that case, just update all parent will be enough.
+ * Or we needs to do a full rescan.
+ * Caller should also hold fs_info->qgroup_lock.
+ *
+ * Return 0 for quick update, return >0 for need to full rescan
+ * and mark INCONSISTENT flag.
+ * Return < 0 for other error.
+ */
+static int quick_update_accounting(struct btrfs_fs_info *fs_info,
+ struct ulist *tmp, u64 src, u64 dst,
+ int sign)
+{
+ struct btrfs_qgroup *qgroup;
+ int ret = 1;
+ int err = 0;
+
+ qgroup = find_qgroup_rb(fs_info, src);
+ if (!qgroup)
+ goto out;
+ if (qgroup->excl == qgroup->rfer) {
+ ret = 0;
+ err = __qgroup_excl_accounting(fs_info, tmp, dst,
+ qgroup->excl, sign);
+ if (err < 0) {
+ ret = err;
+ goto out;
+ }
+ }
+out:
+ if (ret)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup_list *list;
+ struct ulist *tmp;
+ int ret = 0;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ /* Check the level of src and dst first */
+ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+ return -EINVAL;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+ member = find_qgroup_rb(fs_info, src);
+ parent = find_qgroup_rb(fs_info, dst);
+ if (!member || !parent) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* check if such qgroup relation exist firstly */
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+
+ ret = add_qgroup_relation_item(trans, quota_root, src, dst);
+ if (ret)
+ goto out;
+
+ ret = add_qgroup_relation_item(trans, quota_root, dst, src);
+ if (ret) {
+ del_qgroup_relation_item(trans, quota_root, src, dst);
+ goto out;
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+ ret = add_relation_rb(quota_root->fs_info, src, dst);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ goto out;
+ }
+ ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ ulist_free(tmp);
+ return ret;
+}
+
+int __del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *parent;
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup_list *list;
+ struct ulist *tmp;
+ int ret = 0;
+ int err;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ member = find_qgroup_rb(fs_info, src);
+ parent = find_qgroup_rb(fs_info, dst);
+ if (!member || !parent) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* check if such qgroup relation exist firstly */
+ list_for_each_entry(list, &member->groups, next_group) {
+ if (list->group == parent)
+ goto exist;
+ }
+ ret = -ENOENT;
+ goto out;
+exist:
+ ret = del_qgroup_relation_item(trans, quota_root, src, dst);
+ err = del_qgroup_relation_item(trans, quota_root, dst, src);
+ if (err && !ret)
+ ret = err;
+
+ spin_lock(&fs_info->qgroup_lock);
+ del_relation_rb(fs_info, src, dst);
+ ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ ulist_free(tmp);
+ return ret;
+}
+
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ ret = __del_qgroup_relation(trans, fs_info, src, dst);
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ return ret;
+}
+
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (qgroup) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = add_qgroup_item(trans, quota_root, qgroupid);
+ if (ret)
+ goto out;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = add_qgroup_rb(fs_info, qgroupid);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ if (IS_ERR(qgroup))
+ ret = PTR_ERR(qgroup);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup_list *list;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ } else {
+ /* check if there are no children of this qgroup */
+ if (!list_empty(&qgroup->members)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+ ret = del_qgroup_item(trans, quota_root, qgroupid);
+
+ while (!list_empty(&qgroup->groups)) {
+ list = list_first_entry(&qgroup->groups,
+ struct btrfs_qgroup_list, next_group);
+ ret = __del_qgroup_relation(trans, fs_info,
+ qgroupid,
+ list->group->qgroupid);
+ if (ret)
+ goto out;
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+ del_qgroup_rb(quota_root->fs_info, qgroupid);
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ spin_lock(&fs_info->qgroup_lock);
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
+ qgroup->max_rfer = limit->max_rfer;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
+ qgroup->max_excl = limit->max_excl;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
+ qgroup->rsv_rfer = limit->rsv_rfer;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
+ qgroup->rsv_excl = limit->rsv_excl;
+ qgroup->lim_flags |= limit->flags;
+
+ spin_unlock(&fs_info->qgroup_lock);
+
+ ret = update_qgroup_limit_item(trans, quota_root, qgroup);
+ if (ret) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_info(fs_info, "unable to update quota limit for %llu",
+ qgroupid);
+ }
+
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
+ struct btrfs_qgroup_operation *oper2)
+{
+ /*
+ * Ignore seq and type here, we're looking for any operation
+ * at all related to this extent on that root.
+ */
+ if (oper1->bytenr < oper2->bytenr)
+ return -1;
+ if (oper1->bytenr > oper2->bytenr)
+ return 1;
+ if (oper1->ref_root < oper2->ref_root)
+ return -1;
+ if (oper1->ref_root > oper2->ref_root)
+ return 1;
+ return 0;
+}
+
+static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup_operation *cur;
+ int cmp;
+
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = fs_info->qgroup_op_tree.rb_node;
+ while (n) {
+ cur = rb_entry(n, struct btrfs_qgroup_operation, n);
+ cmp = comp_oper_exist(cur, oper);
+ if (cmp < 0) {
+ n = n->rb_right;
+ } else if (cmp) {
+ n = n->rb_left;
+ } else {
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return -EEXIST;
+ }
+ }
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return 0;
+}
+
+static int comp_oper(struct btrfs_qgroup_operation *oper1,
+ struct btrfs_qgroup_operation *oper2)
+{
+ if (oper1->bytenr < oper2->bytenr)
+ return -1;
+ if (oper1->bytenr > oper2->bytenr)
+ return 1;
+ if (oper1->ref_root < oper2->ref_root)
+ return -1;
+ if (oper1->ref_root > oper2->ref_root)
+ return 1;
+ if (oper1->seq < oper2->seq)
+ return -1;
+ if (oper1->seq > oper2->seq)
+ return 1;
+ if (oper1->type < oper2->type)
+ return -1;
+ if (oper1->type > oper2->type)
+ return 1;
+ return 0;
+}
+
+static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct btrfs_qgroup_operation *cur;
+ int cmp;
+
+ spin_lock(&fs_info->qgroup_op_lock);
+ p = &fs_info->qgroup_op_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
+ cmp = comp_oper(cur, oper);
+ if (cmp < 0) {
+ p = &(*p)->rb_right;
+ } else if (cmp) {
+ p = &(*p)->rb_left;
+ } else {
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&oper->n, parent, p);
+ rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return 0;
+}
+
+/*
+ * Record a quota operation for processing later on.
+ * @trans: the transaction we are adding the delayed op to.
+ * @fs_info: the fs_info for this fs.
+ * @ref_root: the root of the reference we are acting on,
+ * @bytenr: the bytenr we are acting on.
+ * @num_bytes: the number of bytes in the reference.
+ * @type: the type of operation this is.
+ * @mod_seq: do we need to get a sequence number for looking up roots.
+ *
+ * We just add it to our trans qgroup_ref_list and carry on and process these
+ * operations in order at some later point. If the reference root isn't a fs
+ * root then we don't bother with doing anything.
+ *
+ * MUST BE HOLDING THE REF LOCK.
+ */
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type, int mod_seq)
+{
+ struct btrfs_qgroup_operation *oper;
+ int ret;
+
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ return 0;
+
+ oper = kmalloc(sizeof(*oper), GFP_NOFS);
+ if (!oper)
+ return -ENOMEM;
+
+ oper->ref_root = ref_root;
+ oper->bytenr = bytenr;
+ oper->num_bytes = num_bytes;
+ oper->type = type;
+ oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
+ INIT_LIST_HEAD(&oper->elem.list);
+ oper->elem.seq = 0;
+
+ trace_btrfs_qgroup_record_ref(oper);
+
+ if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
+ /*
+ * If any operation for this bytenr/ref_root combo
+ * exists, then we know it's not exclusively owned and
+ * shouldn't be queued up.
+ *
+ * This also catches the case where we have a cloned
+ * extent that gets queued up multiple times during
+ * drop snapshot.
+ */
+ if (qgroup_oper_exists(fs_info, oper)) {
+ kfree(oper);
+ return 0;
+ }
+ }
+
+ ret = insert_qgroup_oper(fs_info, oper);
+ if (ret) {
+ /* Shouldn't happen so have an assert for developers */
+ ASSERT(0);
+ kfree(oper);
+ return ret;
+ }
+ list_add_tail(&oper->list, &trans->qgroup_ref_list);
+
+ if (mod_seq)
+ btrfs_get_tree_mod_seq(fs_info, &oper->elem);
+
+ return 0;
+}
+
+static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *tmp;
+ int sign = 0;
+ int ret = 0;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ spin_lock(&fs_info->qgroup_lock);
+ if (!fs_info->quota_root)
+ goto out;
+
+ switch (oper->type) {
+ case BTRFS_QGROUP_OPER_ADD_EXCL:
+ sign = 1;
+ break;
+ case BTRFS_QGROUP_OPER_SUB_EXCL:
+ sign = -1;
+ break;
+ default:
+ ASSERT(0);
+ }
+ ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
+ oper->num_bytes, sign);
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(tmp);
+ return ret;
+}
+
+/*
+ * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
+ * properly.
+ */
+static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, struct ulist *tmp,
+ struct ulist *roots, struct ulist *qgroups,
+ u64 seq, int *old_roots, int rescan)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct ulist_node *tmp_unode;
+ struct ulist_iterator tmp_uiter;
+ struct btrfs_qgroup *qg;
+ int ret;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ /* We don't count our current root here */
+ if (unode->val == root_to_skip)
+ continue;
+ qg = find_qgroup_rb(fs_info, unode->val);
+ if (!qg)
+ continue;
+ /*
+ * We could have a pending removal of this same ref so we may
+ * not have actually found our ref root when doing
+ * btrfs_find_all_roots, so we need to keep track of how many
+ * old roots we find in case we removed ours and added a
+ * different one at the same time. I don't think this could
+ * happen in practice but that sort of thinking leads to pain
+ * and suffering and to the dark side.
+ */
+ (*old_roots)++;
+
+ ulist_reinit(tmp);
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ULIST_ITER_INIT(&tmp_uiter);
+ while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(tmp_unode->aux);
+ /*
+ * We use this sequence number to keep from having to
+ * run the whole list and 0 out the refcnt every time.
+ * We basically use sequnce as the known 0 count and
+ * then add 1 everytime we see a qgroup. This is how we
+ * get how many of the roots actually point up to the
+ * upper level qgroups in order to determine exclusive
+ * counts.
+ *
+ * For rescan we want to set old_refcnt to seq so our
+ * exclusive calculations end up correct.
+ */
+ if (rescan)
+ qg->old_refcnt = seq;
+ else if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * We need to walk forward in our operation tree and account for any roots that
+ * were deleted after we made this operation.
+ */
+static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct ulist *tmp,
+ struct ulist *qgroups, u64 seq,
+ int *old_roots)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_operation *tmp_oper;
+ struct rb_node *n;
+ int ret;
+
+ ulist_reinit(tmp);
+
+ /*
+ * We only walk forward in the tree since we're only interested in
+ * removals that happened _after_ our operation.
+ */
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ return 0;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ while (tmp_oper->bytenr == oper->bytenr) {
+ /*
+ * If it's not a removal we don't care, additions work out
+ * properly with our refcnt tracking.
+ */
+ if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
+ tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
+ goto next;
+ qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
+ if (!qg)
+ goto next;
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret) {
+ if (ret < 0)
+ return ret;
+ /*
+ * We only want to increase old_roots if this qgroup is
+ * not already in the list of qgroups. If it is already
+ * there then that means it must have been re-added or
+ * the delete will be discarded because we had an
+ * existing ref that we haven't looked up yet. In this
+ * case we don't want to increase old_roots. So if ret
+ * == 1 then we know that this is the first time we've
+ * seen this qgroup and we can bump the old_roots.
+ */
+ (*old_roots)++;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+next:
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&tmp_oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ break;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ }
+
+ /* Ok now process the qgroups we found */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/* Add refcnt for the newly added reference. */
+static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct btrfs_qgroup *qgroup,
+ struct ulist *tmp, struct ulist *qgroups,
+ u64 seq)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ int ret;
+
+ ulist_reinit(tmp);
+ ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ } else {
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ }
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * This adjusts the counters for all referenced qgroups if need be.
+ */
+static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, u64 num_bytes,
+ struct ulist *qgroups, u64 seq,
+ int old_roots, int new_roots, int rescan)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ u64 cur_new_count, cur_old_count;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(qgroups, &uiter))) {
+ bool dirty = false;
+
+ qg = u64_to_ptr(unode->aux);
+ /*
+ * Wasn't referenced before but is now, add to the reference
+ * counters.
+ */
+ if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+ qg->rfer += num_bytes;
+ qg->rfer_cmpr += num_bytes;
+ dirty = true;
+ }
+
+ /*
+ * Was referenced before but isn't now, subtract from the
+ * reference counters.
+ */
+ if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+ qg->rfer -= num_bytes;
+ qg->rfer_cmpr -= num_bytes;
+ dirty = true;
+ }
+
+ if (qg->old_refcnt < seq)
+ cur_old_count = 0;
+ else
+ cur_old_count = qg->old_refcnt - seq;
+ if (qg->new_refcnt < seq)
+ cur_new_count = 0;
+ else
+ cur_new_count = qg->new_refcnt - seq;
+
+ /*
+ * If our refcount was the same as the roots previously but our
+ * new count isn't the same as the number of roots now then we
+ * went from having a exclusive reference on this range to not.
+ */
+ if (old_roots && cur_old_count == old_roots &&
+ (cur_new_count != new_roots || new_roots == 0)) {
+ WARN_ON(cur_new_count != new_roots && new_roots == 0);
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+
+ /*
+ * If we didn't reference all the roots before but now we do we
+ * have an exclusive reference to this range.
+ */
+ if ((!old_roots || (old_roots && cur_old_count != old_roots))
+ && cur_new_count == new_roots) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
+
+ if (dirty)
+ qgroup_dirty(fs_info, qg);
+ }
+ return 0;
+}
+
+/*
+ * If we removed a data extent and there were other references for that bytenr
+ * then we need to lookup all referenced roots to make sure we still don't
+ * reference this bytenr. If we do then we can just discard this operation.
+ */
+static int check_existing_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret = 0;
+
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+ oper->elem.seq, &roots);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ if (unode->val == oper->ref_root) {
+ ret = 1;
+ break;
+ }
+ }
+ ulist_free(roots);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+
+ return ret;
+}
+
+/*
+ * If we share a reference across multiple roots then we may need to adjust
+ * various qgroups referenced and exclusive counters. The basic premise is this
+ *
+ * 1) We have seq to represent a 0 count. Instead of looping through all of the
+ * qgroups and resetting their refcount to 0 we just constantly bump this
+ * sequence number to act as the base reference count. This means that if
+ * anybody is equal to or below this sequence they were never referenced. We
+ * jack this sequence up by the number of roots we found each time in order to
+ * make sure we don't have any overlap.
+ *
+ * 2) We first search all the roots that reference the area _except_ the root
+ * we're acting on currently. This makes up the old_refcnt of all the qgroups
+ * before.
+ *
+ * 3) We walk all of the qgroups referenced by the root we are currently acting
+ * on, and will either adjust old_refcnt in the case of a removal or the
+ * new_refcnt in the case of an addition.
+ *
+ * 4) Finally we walk all the qgroups that are referenced by this range
+ * including the root we are acting on currently. We will adjust the counters
+ * based on the number of roots we had and will have after this operation.
+ *
+ * Take this example as an illustration
+ *
+ * [qgroup 1/0]
+ * / | \
+ * [qg 0/0] [qg 0/1] [qg 0/2]
+ * \ | /
+ * [ extent ]
+ *
+ * Say we are adding a reference that is covered by qg 0/0. The first step
+ * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
+ * old_roots being 2. Because it is adding new_roots will be 1. We then go
+ * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
+ * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
+ * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
+ * reference and thus must add the size to the referenced bytes. Everything
+ * else is the same so nothing else changes.
+ */
+static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist *qgroups, *tmp;
+ struct btrfs_qgroup *qgroup;
+ struct seq_list elem = SEQ_LIST_INIT(elem);
+ u64 seq;
+ int old_roots = 0;
+ int new_roots = 0;
+ int ret = 0;
+
+ if (oper->elem.seq) {
+ ret = check_existing_refs(trans, fs_info, oper);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ return -ENOMEM;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp) {
+ ulist_free(qgroups);
+ return -ENOMEM;
+ }
+
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
+ &roots);
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ if (ret < 0) {
+ ulist_free(qgroups);
+ ulist_free(tmp);
+ return ret;
+ }
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, oper->ref_root);
+ if (!qgroup)
+ goto out;
+ seq = fs_info->qgroup_seq;
+
+ /*
+ * So roots is the list of all the roots currently pointing at the
+ * bytenr, including the ref we are adding if we are adding, or not if
+ * we are removing a ref. So we pass in the ref_root to skip that root
+ * in our calculations. We set old_refnct and new_refcnt cause who the
+ * hell knows what everything looked like before, and it doesn't matter
+ * except...
+ */
+ ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
+ seq, &old_roots, 0);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Now adjust the refcounts of the qgroups that care about this
+ * reference, either the old_count in the case of removal or new_count
+ * in the case of an addition.
+ */
+ ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
+ seq);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * ...in the case of removals. If we had a removal before we got around
+ * to processing this operation then we need to find that guy and count
+ * his references as if they really existed so we don't end up screwing
+ * up the exclusive counts. Then whenever we go to process the delete
+ * everything will be grand and we can account for whatever exclusive
+ * changes need to be made there. We also have to pass in old_roots so
+ * we have an accurate count of the roots as it pertains to this
+ * operations view of the world.
+ */
+ ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
+ &old_roots);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We are adding our root, need to adjust up the number of roots,
+ * otherwise old_roots is the number of roots we want.
+ */
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ new_roots = old_roots + 1;
+ } else {
+ new_roots = old_roots;
+ old_roots++;
+ }
+ fs_info->qgroup_seq += old_roots + 1;
+
+
+ /*
+ * And now the magic happens, bless Arne for having a pretty elegant
+ * solution for this.
+ */
+ qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
+ qgroups, seq, old_roots, new_roots, 0);
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(qgroups);
+ ulist_free(roots);
+ ulist_free(tmp);
+ return ret;
+}
+
+/*
+ * Process a reference to a shared subtree. This type of operation is
+ * queued during snapshot removal when we encounter extents which are
+ * shared between more than one root.
+ */
+static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup_list *glist;
+ struct ulist *parents;
+ int ret = 0;
+ int err;
+ struct btrfs_qgroup *qg;
+ u64 root_obj = 0;
+ struct seq_list elem = SEQ_LIST_INIT(elem);
+
+ parents = ulist_alloc(GFP_NOFS);
+ if (!parents)
+ return -ENOMEM;
+
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+ elem.seq, &roots);
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ if (ret < 0)
+ goto out;
+
+ if (roots->nnodes != 1)
+ goto out;
+
+ ULIST_ITER_INIT(&uiter);
+ unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
+ /*
+ * If we find our ref root then that means all refs
+ * this extent has to the root have not yet been
+ * deleted. In that case, we do nothing and let the
+ * last ref for this bytenr drive our update.
+ *
+ * This can happen for example if an extent is
+ * referenced multiple times in a snapshot (clone,
+ * etc). If we are in the middle of snapshot removal,
+ * queued updates for such an extent will find the
+ * root if we have not yet finished removing the
+ * snapshot.
+ */
+ if (unode->val == oper->ref_root)
+ goto out;
+
+ root_obj = unode->val;
+ BUG_ON(!root_obj);
+
+ spin_lock(&fs_info->qgroup_lock);
+ qg = find_qgroup_rb(fs_info, root_obj);
+ if (!qg)
+ goto out_unlock;
+
+ qg->excl += oper->num_bytes;
+ qg->excl_cmpr += oper->num_bytes;
+ qgroup_dirty(fs_info, qg);
+
+ /*
+ * Adjust counts for parent groups. First we find all
+ * parents, then in the 2nd loop we do the adjustment
+ * while adding parents of the parents to our ulist.
+ */
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ err = ulist_add(parents, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (err < 0) {
+ ret = err;
+ goto out_unlock;
+ }
+ }
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(parents, &uiter))) {
+ qg = u64_to_ptr(unode->aux);
+ qg->excl += oper->num_bytes;
+ qg->excl_cmpr += oper->num_bytes;
+ qgroup_dirty(fs_info, qg);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ err = ulist_add(parents, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (err < 0) {
+ ret = err;
+ goto out_unlock;
+ }
+ }
+ }
+
+out_unlock:
+ spin_unlock(&fs_info->qgroup_lock);
+
+out:
+ ulist_free(roots);
+ ulist_free(parents);
+ return ret;
+}
+
+/*
+ * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
+ * from the fs. First, all roots referencing the extent are searched, and
+ * then the space is accounted accordingly to the different roots. The
+ * accounting algorithm works in 3 steps documented inline.
+ */
+static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ int ret = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ BUG_ON(!fs_info->quota_root);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ ASSERT(is_fstree(oper->ref_root));
+
+ trace_btrfs_qgroup_account(oper);
+
+ switch (oper->type) {
+ case BTRFS_QGROUP_OPER_ADD_EXCL:
+ case BTRFS_QGROUP_OPER_SUB_EXCL:
+ ret = qgroup_excl_accounting(fs_info, oper);
+ break;
+ case BTRFS_QGROUP_OPER_ADD_SHARED:
+ case BTRFS_QGROUP_OPER_SUB_SHARED:
+ ret = qgroup_shared_accounting(trans, fs_info, oper);
+ break;
+ case BTRFS_QGROUP_OPER_SUB_SUBTREE:
+ ret = qgroup_subtree_accounting(trans, fs_info, oper);
+ break;
+ default:
+ ASSERT(0);
+ }
+ return ret;
+}
+
+/*
+ * Needs to be called everytime we run delayed refs, even if there is an error
+ * in order to cleanup outstanding operations.
+ */
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup_operation *oper;
+ int ret = 0;
+
+ while (!list_empty(&trans->qgroup_ref_list)) {
+ oper = list_first_entry(&trans->qgroup_ref_list,
+ struct btrfs_qgroup_operation, list);
+ list_del_init(&oper->list);
+ if (!ret || !trans->aborted)
+ ret = btrfs_qgroup_account(trans, fs_info, oper);
+ spin_lock(&fs_info->qgroup_op_lock);
+ rb_erase(&oper->n, &fs_info->qgroup_op_tree);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+ kfree(oper);
+ }
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed qgroups to disk.
+ */
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ int ret = 0;
+ int start_rescan_worker = 0;
+
+ if (!quota_root)
+ goto out;
+
+ if (!fs_info->quota_enabled && fs_info->pending_quota_state)
+ start_rescan_worker = 1;
+
+ fs_info->quota_enabled = fs_info->pending_quota_state;
+
+ spin_lock(&fs_info->qgroup_lock);
+ while (!list_empty(&fs_info->dirty_qgroups)) {
+ struct btrfs_qgroup *qgroup;
+ qgroup = list_first_entry(&fs_info->dirty_qgroups,
+ struct btrfs_qgroup, dirty);
+ list_del_init(&qgroup->dirty);
+ spin_unlock(&fs_info->qgroup_lock);
+ ret = update_qgroup_info_item(trans, quota_root, qgroup);
+ if (ret)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ ret = update_qgroup_limit_item(trans, quota_root, qgroup);
+ if (ret)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ spin_lock(&fs_info->qgroup_lock);
+ }
+ if (fs_info->quota_enabled)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
+ else
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ ret = update_qgroup_status_item(trans, fs_info, quota_root);
+ if (ret)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+
+ if (!ret && start_rescan_worker) {
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (!ret) {
+ qgroup_rescan_zero_tracking(fs_info);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ }
+ ret = 0;
+ }
+
+out:
+
+ return ret;
+}
+
+/*
+ * copy the acounting information between qgroups. This is necessary when a
+ * snapshot or a subvolume is created
+ */
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit)
+{
+ int ret = 0;
+ int i;
+ u64 *i_qgroups;
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_qgroup *srcgroup;
+ struct btrfs_qgroup *dstgroup;
+ u32 level_size = 0;
+ u64 nums;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_enabled)
+ goto out;
+
+ if (!quota_root) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (inherit) {
+ i_qgroups = (u64 *)(inherit + 1);
+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+ 2 * inherit->num_excl_copies;
+ for (i = 0; i < nums; ++i) {
+ srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
+ if (!srcgroup) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ++i_qgroups;
+ }
+ }
+
+ /*
+ * create a tracking group for the subvol itself
+ */
+ ret = add_qgroup_item(trans, quota_root, objectid);
+ if (ret)
+ goto out;
+
+ if (srcid) {
+ struct btrfs_root *srcroot;
+ struct btrfs_key srckey;
+
+ srckey.objectid = srcid;
+ srckey.type = BTRFS_ROOT_ITEM_KEY;
+ srckey.offset = (u64)-1;
+ srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
+ if (IS_ERR(srcroot)) {
+ ret = PTR_ERR(srcroot);
+ goto out;
+ }
+
+ rcu_read_lock();
+ level_size = srcroot->nodesize;
+ rcu_read_unlock();
+ }
+
+ /*
+ * add qgroup to all inherited groups
+ */
+ if (inherit) {
+ i_qgroups = (u64 *)(inherit + 1);
+ for (i = 0; i < inherit->num_qgroups; ++i) {
+ ret = add_qgroup_relation_item(trans, quota_root,
+ objectid, *i_qgroups);
+ if (ret)
+ goto out;
+ ret = add_qgroup_relation_item(trans, quota_root,
+ *i_qgroups, objectid);
+ if (ret)
+ goto out;
+ ++i_qgroups;
+ }
+ }
+
+
+ spin_lock(&fs_info->qgroup_lock);
+
+ dstgroup = add_qgroup_rb(fs_info, objectid);
+ if (IS_ERR(dstgroup)) {
+ ret = PTR_ERR(dstgroup);
+ goto unlock;
+ }
+
+ if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+ dstgroup->lim_flags = inherit->lim.flags;
+ dstgroup->max_rfer = inherit->lim.max_rfer;
+ dstgroup->max_excl = inherit->lim.max_excl;
+ dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
+ dstgroup->rsv_excl = inherit->lim.rsv_excl;
+
+ ret = update_qgroup_limit_item(trans, quota_root, dstgroup);
+ if (ret) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_info(fs_info, "unable to update quota limit for %llu",
+ dstgroup->qgroupid);
+ goto unlock;
+ }
+ }
+
+ if (srcid) {
+ srcgroup = find_qgroup_rb(fs_info, srcid);
+ if (!srcgroup)
+ goto unlock;
+
+ /*
+ * We call inherit after we clone the root in order to make sure
+ * our counts don't go crazy, so at this point the only
+ * difference between the two roots should be the root node.
+ */
+ dstgroup->rfer = srcgroup->rfer;
+ dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
+ dstgroup->excl = level_size;
+ dstgroup->excl_cmpr = level_size;
+ srcgroup->excl = level_size;
+ srcgroup->excl_cmpr = level_size;
+
+ /* inherit the limit info */
+ dstgroup->lim_flags = srcgroup->lim_flags;
+ dstgroup->max_rfer = srcgroup->max_rfer;
+ dstgroup->max_excl = srcgroup->max_excl;
+ dstgroup->rsv_rfer = srcgroup->rsv_rfer;
+ dstgroup->rsv_excl = srcgroup->rsv_excl;
+
+ qgroup_dirty(fs_info, dstgroup);
+ qgroup_dirty(fs_info, srcgroup);
+ }
+
+ if (!inherit)
+ goto unlock;
+
+ i_qgroups = (u64 *)(inherit + 1);
+ for (i = 0; i < inherit->num_qgroups; ++i) {
+ ret = add_relation_rb(quota_root->fs_info, objectid,
+ *i_qgroups);
+ if (ret)
+ goto unlock;
+ ++i_qgroups;
+ }
+
+ for (i = 0; i < inherit->num_ref_copies; ++i) {
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *dst;
+
+ src = find_qgroup_rb(fs_info, i_qgroups[0]);
+ dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+ if (!src || !dst) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ dst->rfer = src->rfer - level_size;
+ dst->rfer_cmpr = src->rfer_cmpr - level_size;
+ i_qgroups += 2;
+ }
+ for (i = 0; i < inherit->num_excl_copies; ++i) {
+ struct btrfs_qgroup *src;
+ struct btrfs_qgroup *dst;
+
+ src = find_qgroup_rb(fs_info, i_qgroups[0]);
+ dst = find_qgroup_rb(fs_info, i_qgroups[1]);
+
+ if (!src || !dst) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ dst->excl = src->excl + level_size;
+ dst->excl_cmpr = src->excl_cmpr + level_size;
+ i_qgroups += 2;
+ }
+
+unlock:
+ spin_unlock(&fs_info->qgroup_lock);
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
+}
+
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 ref_root = root->root_key.objectid;
+ int ret = 0;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+
+ if (!is_fstree(ref_root))
+ return 0;
+
+ if (num_bytes == 0)
+ return 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ goto out;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ /*
+ * in a first step, we check all affected qgroups if any limits would
+ * be exceeded
+ */
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ (uintptr_t)qgroup, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+
+ if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
+ qg->reserved + (s64)qg->rfer + num_bytes >
+ qg->max_rfer) {
+ ret = -EDQUOT;
+ goto out;
+ }
+
+ if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
+ qg->reserved + (s64)qg->excl + num_bytes >
+ qg->max_excl) {
+ ret = -EDQUOT;
+ goto out;
+ }
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+ /*
+ * no limits exceeded, now record the reservation into all qgroups
+ */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+
+ qg = u64_to_ptr(unode->aux);
+
+ qg->reserved += num_bytes;
+ }
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ return ret;
+}
+
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+ struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ u64 ref_root = root->root_key.objectid;
+ int ret = 0;
+
+ if (!is_fstree(ref_root))
+ return;
+
+ if (num_bytes == 0)
+ return;
+
+ spin_lock(&fs_info->qgroup_lock);
+
+ quota_root = fs_info->quota_root;
+ if (!quota_root)
+ goto out;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ (uintptr_t)qgroup, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+
+ qg->reserved -= num_bytes;
+
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ (uintptr_t)glist->group, GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+}
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
+{
+ if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
+ return;
+ btrfs_err(trans->root->fs_info,
+ "qgroups not uptodate in trans handle %p: list is%s empty, "
+ "seq is %#x.%x",
+ trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
+ (u32)(trans->delayed_ref_elem.seq >> 32),
+ (u32)trans->delayed_ref_elem.seq);
+ BUG();
+}
+
+/*
+ * returns < 0 on error, 0 when more leafs are to be scanned.
+ * returns 1 when done.
+ */
+static int
+qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+ struct btrfs_trans_handle *trans, struct ulist *qgroups,
+ struct ulist *tmp, struct extent_buffer *scratch_leaf)
+{
+ struct btrfs_key found;
+ struct ulist *roots = NULL;
+ struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+ u64 num_bytes;
+ u64 seq;
+ int new_roots;
+ int slot;
+ int ret;
+
+ path->leave_spinning = 1;
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ ret = btrfs_search_slot_for_read(fs_info->extent_root,
+ &fs_info->qgroup_rescan_progress,
+ path, 1, 0);
+
+ pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n",
+ fs_info->qgroup_rescan_progress.objectid,
+ fs_info->qgroup_rescan_progress.type,
+ fs_info->qgroup_rescan_progress.offset, ret);
+
+ if (ret) {
+ /*
+ * The rescan is about to end, we will not be scanning any
+ * further blocks. We cannot unset the RESCAN flag here, because
+ * we want to commit the transaction if everything went well.
+ * To make the live accounting work in this phase, we set our
+ * scan progress pointer such that every real extent objectid
+ * will be smaller.
+ */
+ fs_info->qgroup_rescan_progress.objectid = (u64)-1;
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return ret;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found,
+ btrfs_header_nritems(path->nodes[0]) - 1);
+ fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
+
+ btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
+ btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
+ if (found.type != BTRFS_EXTENT_ITEM_KEY &&
+ found.type != BTRFS_METADATA_ITEM_KEY)
+ continue;
+ if (found.type == BTRFS_METADATA_ITEM_KEY)
+ num_bytes = fs_info->extent_root->nodesize;
+ else
+ num_bytes = found.offset;
+
+ ulist_reinit(qgroups);
+ ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+ &roots);
+ if (ret < 0)
+ goto out;
+ spin_lock(&fs_info->qgroup_lock);
+ seq = fs_info->qgroup_seq;
+ fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+
+ new_roots = 0;
+ ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
+ seq, &new_roots, 1);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
+ }
+
+ ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
+ seq, 0, new_roots, 1);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ }
+out:
+ btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+
+ return ret;
+}
+
+static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
+ qgroup_rescan_work);
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans = NULL;
+ struct ulist *tmp = NULL, *qgroups = NULL;
+ struct extent_buffer *scratch_leaf = NULL;
+ int err = -ENOMEM;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ goto out;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ goto out;
+ scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
+ if (!scratch_leaf)
+ goto out;
+
+ err = 0;
+ while (!err) {
+ trans = btrfs_start_transaction(fs_info->fs_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ break;
+ }
+ if (!fs_info->quota_enabled) {
+ err = -EINTR;
+ } else {
+ err = qgroup_rescan_leaf(fs_info, path, trans,
+ qgroups, tmp, scratch_leaf);
+ }
+ if (err > 0)
+ btrfs_commit_transaction(trans, fs_info->fs_root);
+ else
+ btrfs_end_transaction(trans, fs_info->fs_root);
+ }
+
+out:
+ kfree(scratch_leaf);
+ ulist_free(qgroups);
+ ulist_free(tmp);
+ btrfs_free_path(path);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+
+ if (err > 0 &&
+ fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ } else if (err < 0) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ /*
+ * only update status, since the previous part has alreay updated the
+ * qgroup info.
+ */
+ trans = btrfs_start_transaction(fs_info->quota_root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "fail to start transaction for status update: %d\n",
+ err);
+ goto done;
+ }
+ ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
+ if (ret < 0) {
+ err = ret;
+ btrfs_err(fs_info, "fail to update qgroup status: %d\n", err);
+ }
+ btrfs_end_transaction(trans, fs_info->quota_root);
+
+ if (err >= 0) {
+ btrfs_info(fs_info, "qgroup scan completed%s",
+ err > 0 ? " (inconsistency flag cleared)" : "");
+ } else {
+ btrfs_err(fs_info, "qgroup scan failed with %d", err);
+ }
+
+done:
+ complete_all(&fs_info->qgroup_rescan_completion);
+}
+
+/*
+ * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
+ * memory required for the rescan context.
+ */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+ int init_flags)
+{
+ int ret = 0;
+
+ if (!init_flags &&
+ (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
+ !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ spin_lock(&fs_info->qgroup_lock);
+
+ if (init_flags) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ ret = -EINPROGRESS;
+ else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+ ret = -EINVAL;
+
+ if (ret) {
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto err;
+ }
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ }
+
+ memset(&fs_info->qgroup_rescan_progress, 0,
+ sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ init_completion(&fs_info->qgroup_rescan_completion);
+
+ memset(&fs_info->qgroup_rescan_work, 0,
+ sizeof(fs_info->qgroup_rescan_work));
+ btrfs_init_work(&fs_info->qgroup_rescan_work,
+ btrfs_qgroup_rescan_helper,
+ btrfs_qgroup_rescan_worker, NULL, NULL);
+
+ if (ret) {
+err:
+ btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void
+qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *n;
+ struct btrfs_qgroup *qgroup;
+
+ spin_lock(&fs_info->qgroup_lock);
+ /* clear all current qgroup tracking information */
+ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
+ qgroup = rb_entry(n, struct btrfs_qgroup, node);
+ qgroup->rfer = 0;
+ qgroup->rfer_cmpr = 0;
+ qgroup->excl = 0;
+ qgroup->excl_cmpr = 0;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+}
+
+int
+btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+ struct btrfs_trans_handle *trans;
+
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * We have set the rescan_progress to 0, which means no more
+ * delayed refs will be accounted by btrfs_qgroup_account_ref.
+ * However, btrfs_qgroup_account_ref may be right after its call
+ * to btrfs_find_all_roots, in which case it would still do the
+ * accounting.
+ * To solve this, we're committing the transaction, which will
+ * ensure we run all delayed refs and only after that, we are
+ * going to clear all tracking information for a clean start.
+ */
+
+ trans = btrfs_join_transaction(fs_info->fs_root);
+ if (IS_ERR(trans)) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+ if (ret) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return ret;
+ }
+
+ qgroup_rescan_zero_tracking(fs_info);
+
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+
+ return 0;
+}
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+{
+ int running;
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ spin_lock(&fs_info->qgroup_lock);
+ running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ spin_unlock(&fs_info->qgroup_lock);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (running)
+ ret = wait_for_completion_interruptible(
+ &fs_info->qgroup_rescan_completion);
+
+ return ret;
+}
+
+/*
+ * this is only called from open_ctree where we're still single threaded, thus
+ * locking is omitted here.
+ */
+void
+btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
new file mode 100644
index 000000000..c5242aa9a
--- /dev/null
+++ b/fs/btrfs/qgroup.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_QGROUP__
+#define __BTRFS_QGROUP__
+
+/*
+ * A description of the operations, all of these operations only happen when we
+ * are adding the 1st reference for that subvolume in the case of adding space
+ * or on the last reference delete in the case of subtraction. The only
+ * exception is the last one, which is added for confusion.
+ *
+ * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
+ * one pointing at the bytes we are adding. This is called on the first
+ * allocation.
+ *
+ * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
+ * shared between subvols. This is called on the creation of a ref that already
+ * has refs from a different subvolume, so basically reflink.
+ *
+ * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
+ * one referencing the range.
+ *
+ * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
+ * refs with other subvolumes.
+ */
+enum btrfs_qgroup_operation_type {
+ BTRFS_QGROUP_OPER_ADD_EXCL,
+ BTRFS_QGROUP_OPER_ADD_SHARED,
+ BTRFS_QGROUP_OPER_SUB_EXCL,
+ BTRFS_QGROUP_OPER_SUB_SHARED,
+ BTRFS_QGROUP_OPER_SUB_SUBTREE,
+};
+
+struct btrfs_qgroup_operation {
+ u64 ref_root;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 seq;
+ enum btrfs_qgroup_operation_type type;
+ struct seq_list elem;
+ struct rb_node n;
+ struct list_head list;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type,
+ int mod_seq);
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
+#endif
+
+#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000..fa72068bd
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2670 @@
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/hash.h>
+#include <linux/list_sort.h>
+#include <linux/raid/xor.h>
+#include <linux/vmalloc.h>
+#include <asm/div64.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+/* set when additional merges to this rbio are not allowed */
+#define RBIO_RMW_LOCKED_BIT 1
+
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT 2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT 3
+
+#define RBIO_CACHE_SIZE 1024
+
+enum btrfs_rbio_ops {
+ BTRFS_RBIO_WRITE = 0,
+ BTRFS_RBIO_READ_REBUILD = 1,
+ BTRFS_RBIO_PARITY_SCRUB = 2,
+};
+
+struct btrfs_raid_bio {
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_bio *bbio;
+
+ /* while we're doing rmw on a stripe
+ * we put it into a hash table so we can
+ * lock the stripe and merge more rbios
+ * into it.
+ */
+ struct list_head hash_list;
+
+ /*
+ * LRU list for the stripe cache
+ */
+ struct list_head stripe_cache;
+
+ /*
+ * for scheduling work in the helper threads
+ */
+ struct btrfs_work work;
+
+ /*
+ * bio list and bio_list_lock are used
+ * to add more bios into the stripe
+ * in hopes of avoiding the full rmw
+ */
+ struct bio_list bio_list;
+ spinlock_t bio_list_lock;
+
+ /* also protected by the bio_list_lock, the
+ * plug list is used by the plugging code
+ * to collect partial bios while plugged. The
+ * stripe locking code also uses it to hand off
+ * the stripe lock to the next pending IO
+ */
+ struct list_head plug_list;
+
+ /*
+ * flags that tell us if it is safe to
+ * merge with this bio
+ */
+ unsigned long flags;
+
+ /* size of each individual stripe on disk */
+ int stripe_len;
+
+ /* number of data stripes (no p/q) */
+ int nr_data;
+
+ int real_stripes;
+
+ int stripe_npages;
+ /*
+ * set if we're doing a parity rebuild
+ * for a read from higher up, which is handled
+ * differently from a parity rebuild as part of
+ * rmw
+ */
+ enum btrfs_rbio_ops operation;
+
+ /* first bad stripe */
+ int faila;
+
+ /* second bad stripe (for raid6 use) */
+ int failb;
+
+ int scrubp;
+ /*
+ * number of pages needed to represent the full
+ * stripe
+ */
+ int nr_pages;
+
+ /*
+ * size of all the bios in the bio_list. This
+ * helps us decide if the rbio maps to a full
+ * stripe or not
+ */
+ int bio_list_bytes;
+
+ int generic_bio_cnt;
+
+ atomic_t refs;
+
+ atomic_t stripes_pending;
+
+ atomic_t error;
+ /*
+ * these are two arrays of pointers. We allocate the
+ * rbio big enough to hold them both and setup their
+ * locations when the rbio is allocated
+ */
+
+ /* pointers to pages that we allocated for
+ * reading/writing stripes directly from the disk (including P/Q)
+ */
+ struct page **stripe_pages;
+
+ /*
+ * pointers to the pages in the bio_list. Stored
+ * here for faster lookup
+ */
+ struct page **bio_pages;
+
+ /*
+ * bitmap to record which horizontal stripe has data
+ */
+ unsigned long *dbitmap;
+};
+
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
+static void rmw_work(struct btrfs_work *work);
+static void read_rebuild_work(struct btrfs_work *work);
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
+static void async_read_rebuild(struct btrfs_raid_bio *rbio);
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
+static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void index_rbio_pages(struct btrfs_raid_bio *rbio);
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+ int need_check);
+static void async_scrub_parity(struct btrfs_raid_bio *rbio);
+
+/*
+ * the stripe hash table is used for locking, and to collect
+ * bios in hopes of making a full stripe
+ */
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
+{
+ struct btrfs_stripe_hash_table *table;
+ struct btrfs_stripe_hash_table *x;
+ struct btrfs_stripe_hash *cur;
+ struct btrfs_stripe_hash *h;
+ int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
+ int i;
+ int table_size;
+
+ if (info->stripe_hash_table)
+ return 0;
+
+ /*
+ * The table is large, starting with order 4 and can go as high as
+ * order 7 in case lock debugging is turned on.
+ *
+ * Try harder to allocate and fallback to vmalloc to lower the chance
+ * of a failing mount.
+ */
+ table_size = sizeof(*table) + sizeof(*h) * num_entries;
+ table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!table) {
+ table = vzalloc(table_size);
+ if (!table)
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&table->cache_lock);
+ INIT_LIST_HEAD(&table->stripe_cache);
+
+ h = table->table;
+
+ for (i = 0; i < num_entries; i++) {
+ cur = h + i;
+ INIT_LIST_HEAD(&cur->hash_list);
+ spin_lock_init(&cur->lock);
+ init_waitqueue_head(&cur->wait);
+ }
+
+ x = cmpxchg(&info->stripe_hash_table, NULL, table);
+ if (x)
+ kvfree(x);
+ return 0;
+}
+
+/*
+ * caching an rbio means to copy anything from the
+ * bio_pages array into the stripe_pages array. We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ char *s;
+ char *d;
+ int ret;
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ return;
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (!rbio->bio_pages[i])
+ continue;
+
+ s = kmap(rbio->bio_pages[i]);
+ d = kmap(rbio->stripe_pages[i]);
+
+ memcpy(d, s, PAGE_CACHE_SIZE);
+
+ kunmap(rbio->bio_pages[i]);
+ kunmap(rbio->stripe_pages[i]);
+ SetPageUptodate(rbio->stripe_pages[i]);
+ }
+ set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
+ * we hash on the first logical address of the stripe
+ */
+static int rbio_bucket(struct btrfs_raid_bio *rbio)
+{
+ u64 num = rbio->bbio->raid_map[0];
+
+ /*
+ * we shift down quite a bit. We're using byte
+ * addressing, and most of the lower bits are zeros.
+ * This tends to upset hash_64, and it consistently
+ * returns just one or two different values.
+ *
+ * shifting off the lower bits fixes things.
+ */
+ return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
+}
+
+/*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+ int i;
+ struct page *s;
+ struct page *d;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+ return;
+
+ for (i = 0; i < dest->nr_pages; i++) {
+ s = src->stripe_pages[i];
+ if (!s || !PageUptodate(s)) {
+ continue;
+ }
+
+ d = dest->stripe_pages[i];
+ if (d)
+ __free_page(d);
+
+ dest->stripe_pages[i] = s;
+ src->stripe_pages[i] = NULL;
+ }
+}
+
+/*
+ * merging means we take the bio_list from the victim and
+ * splice it into the destination. The victim should
+ * be discarded afterwards.
+ *
+ * must be called with dest->rbio_list_lock held
+ */
+static void merge_rbio(struct btrfs_raid_bio *dest,
+ struct btrfs_raid_bio *victim)
+{
+ bio_list_merge(&dest->bio_list, &victim->bio_list);
+ dest->bio_list_bytes += victim->bio_list_bytes;
+ dest->generic_bio_cnt += victim->generic_bio_cnt;
+ bio_list_init(&victim->bio_list);
+}
+
+/*
+ * used to prune items that are in the cache. The caller
+ * must hold the hash table lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ int bucket = rbio_bucket(rbio);
+ struct btrfs_stripe_hash_table *table;
+ struct btrfs_stripe_hash *h;
+ int freeit = 0;
+
+ /*
+ * check the bit again under the hash table lock.
+ */
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+ h = table->table + bucket;
+
+ /* hold the lock for the bucket because we may be
+ * removing it from the hash table
+ */
+ spin_lock(&h->lock);
+
+ /*
+ * hold the lock for the bio list because we need
+ * to make sure the bio list is empty
+ */
+ spin_lock(&rbio->bio_list_lock);
+
+ if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ list_del_init(&rbio->stripe_cache);
+ table->cache_size -= 1;
+ freeit = 1;
+
+ /* if the bio list isn't empty, this rbio is
+ * still involved in an IO. We take it out
+ * of the cache list, and drop the ref that
+ * was held for the list.
+ *
+ * If the bio_list was empty, we also remove
+ * the rbio from the hash_table, and drop
+ * the corresponding ref
+ */
+ if (bio_list_empty(&rbio->bio_list)) {
+ if (!list_empty(&rbio->hash_list)) {
+ list_del_init(&rbio->hash_list);
+ atomic_dec(&rbio->refs);
+ BUG_ON(!list_empty(&rbio->plug_list));
+ }
+ }
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock(&h->lock);
+
+ if (freeit)
+ __free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ __remove_rbio_from_cache(rbio);
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+ struct btrfs_raid_bio *rbio;
+
+ table = info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ while (!list_empty(&table->stripe_cache)) {
+ rbio = list_entry(table->stripe_cache.next,
+ struct btrfs_raid_bio,
+ stripe_cache);
+ __remove_rbio_from_cache(rbio);
+ }
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
+ */
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
+{
+ if (!info->stripe_hash_table)
+ return;
+ btrfs_clear_rbio_cache(info);
+ kvfree(info->stripe_hash_table);
+ info->stripe_hash_table = NULL;
+}
+
+/*
+ * insert an rbio into the stripe cache. It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_stripe_hash_table *table;
+ unsigned long flags;
+
+ if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+ return;
+
+ table = rbio->fs_info->stripe_hash_table;
+
+ spin_lock_irqsave(&table->cache_lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ /* bump our ref if we were not in the list before */
+ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+ atomic_inc(&rbio->refs);
+
+ if (!list_empty(&rbio->stripe_cache)){
+ list_move(&rbio->stripe_cache, &table->stripe_cache);
+ } else {
+ list_add(&rbio->stripe_cache, &table->stripe_cache);
+ table->cache_size += 1;
+ }
+
+ spin_unlock(&rbio->bio_list_lock);
+
+ if (table->cache_size > RBIO_CACHE_SIZE) {
+ struct btrfs_raid_bio *found;
+
+ found = list_entry(table->stripe_cache.prev,
+ struct btrfs_raid_bio,
+ stripe_cache);
+
+ if (found != rbio)
+ __remove_rbio_from_cache(found);
+ }
+
+ spin_unlock_irqrestore(&table->cache_lock, flags);
+ return;
+}
+
+/*
+ * helper function to run the xor_blocks api. It is only
+ * able to do MAX_XOR_BLOCKS at a time, so we need to
+ * loop through.
+ */
+static void run_xor(void **pages, int src_cnt, ssize_t len)
+{
+ int src_off = 0;
+ int xor_src_cnt = 0;
+ void *dest = pages[src_cnt];
+
+ while(src_cnt > 0) {
+ xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+ xor_blocks(xor_src_cnt, len, dest, pages + src_off);
+
+ src_cnt -= xor_src_cnt;
+ src_off += xor_src_cnt;
+ }
+}
+
+/*
+ * returns true if the bio list inside this rbio
+ * covers an entire stripe (no rmw required).
+ * Must be called with the bio list lock held, or
+ * at a time when you know it is impossible to add
+ * new bios into the list
+ */
+static int __rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+ unsigned long size = rbio->bio_list_bytes;
+ int ret = 1;
+
+ if (size != rbio->nr_data * rbio->stripe_len)
+ ret = 0;
+
+ BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+ return ret;
+}
+
+static int rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&rbio->bio_list_lock, flags);
+ ret = __rbio_is_full(rbio);
+ spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+ return ret;
+}
+
+/*
+ * returns 1 if it is safe to merge two rbios together.
+ * The merging is safe if the two rbios correspond to
+ * the same stripe and if they are both going in the same
+ * direction (read vs write), and if neither one is
+ * locked for final IO
+ *
+ * The caller is responsible for locking such that
+ * rmw_locked is safe to test
+ */
+static int rbio_can_merge(struct btrfs_raid_bio *last,
+ struct btrfs_raid_bio *cur)
+{
+ if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
+ test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
+ return 0;
+
+ /*
+ * we can't merge with cached rbios, since the
+ * idea is that when we merge the destination
+ * rbio is going to run our IO for us. We can
+ * steal from cached rbio's though, other functions
+ * handle that.
+ */
+ if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+ test_bit(RBIO_CACHE_BIT, &cur->flags))
+ return 0;
+
+ if (last->bbio->raid_map[0] !=
+ cur->bbio->raid_map[0])
+ return 0;
+
+ /* we can't merge with different operations */
+ if (last->operation != cur->operation)
+ return 0;
+ /*
+ * We've need read the full stripe from the drive.
+ * check and repair the parity and write the new results.
+ *
+ * We're not allowed to add any new bios to the
+ * bio list here, anyone else that wants to
+ * change this stripe needs to do their own rmw.
+ */
+ if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
+ cur->operation == BTRFS_RBIO_PARITY_SCRUB)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * helper to index into the pstripe
+ */
+static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+ index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * helper to index into the qstripe, returns null
+ * if there is no qstripe
+ */
+static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+ if (rbio->nr_data + 1 == rbio->real_stripes)
+ return NULL;
+
+ index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
+ PAGE_CACHE_SHIFT;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * The first stripe in the table for a logical address
+ * has the lock. rbios are added in one of three ways:
+ *
+ * 1) Nobody has the stripe locked yet. The rbio is given
+ * the lock and 0 is returned. The caller must start the IO
+ * themselves.
+ *
+ * 2) Someone has the stripe locked, but we're able to merge
+ * with the lock owner. The rbio is freed and the IO will
+ * start automatically along with the existing rbio. 1 is returned.
+ *
+ * 3) Someone has the stripe locked, but we're not able to merge.
+ * The rbio is added to the lock owner's plug list, or merged into
+ * an rbio already on the plug list. When the lock owner unlocks,
+ * the next rbio on the list is run and the IO is started automatically.
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission. If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+ int bucket = rbio_bucket(rbio);
+ struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *pending;
+ unsigned long flags;
+ DEFINE_WAIT(wait);
+ struct btrfs_raid_bio *freeit = NULL;
+ struct btrfs_raid_bio *cache_drop = NULL;
+ int ret = 0;
+ int walk = 0;
+
+ spin_lock_irqsave(&h->lock, flags);
+ list_for_each_entry(cur, &h->hash_list, hash_list) {
+ walk++;
+ if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
+ spin_lock(&cur->bio_list_lock);
+
+ /* can we steal this cached rbio's pages? */
+ if (bio_list_empty(&cur->bio_list) &&
+ list_empty(&cur->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+ !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+ list_del_init(&cur->hash_list);
+ atomic_dec(&cur->refs);
+
+ steal_rbio(cur, rbio);
+ cache_drop = cur;
+ spin_unlock(&cur->bio_list_lock);
+
+ goto lockit;
+ }
+
+ /* can we merge into the lock owner? */
+ if (rbio_can_merge(cur, rbio)) {
+ merge_rbio(cur, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+
+
+ /*
+ * we couldn't merge with the running
+ * rbio, see if we can merge with the
+ * pending ones. We don't have to
+ * check for rmw_locked because there
+ * is no way they are inside finish_rmw
+ * right now
+ */
+ list_for_each_entry(pending, &cur->plug_list,
+ plug_list) {
+ if (rbio_can_merge(pending, rbio)) {
+ merge_rbio(pending, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+ }
+
+ /* no merging, put us on the tail of the plug list,
+ * our rbio will be started with the currently
+ * running rbio unlocks
+ */
+ list_add_tail(&rbio->plug_list, &cur->plug_list);
+ spin_unlock(&cur->bio_list_lock);
+ ret = 1;
+ goto out;
+ }
+ }
+lockit:
+ atomic_inc(&rbio->refs);
+ list_add(&rbio->hash_list, &h->hash_list);
+out:
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (cache_drop)
+ remove_rbio_from_cache(cache_drop);
+ if (freeit)
+ __free_raid_bio(freeit);
+ return ret;
+}
+
+/*
+ * called as rmw or parity rebuild is completed. If the plug list has more
+ * rbios waiting for this stripe, the next one on the list will be started
+ */
+static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bucket;
+ struct btrfs_stripe_hash *h;
+ unsigned long flags;
+ int keep_cache = 0;
+
+ bucket = rbio_bucket(rbio);
+ h = rbio->fs_info->stripe_hash_table->table + bucket;
+
+ if (list_empty(&rbio->plug_list))
+ cache_rbio(rbio);
+
+ spin_lock_irqsave(&h->lock, flags);
+ spin_lock(&rbio->bio_list_lock);
+
+ if (!list_empty(&rbio->hash_list)) {
+ /*
+ * if we're still cached and there is no other IO
+ * to perform, just leave this rbio here for others
+ * to steal from later
+ */
+ if (list_empty(&rbio->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+ keep_cache = 1;
+ clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ BUG_ON(!bio_list_empty(&rbio->bio_list));
+ goto done;
+ }
+
+ list_del_init(&rbio->hash_list);
+ atomic_dec(&rbio->refs);
+
+ /*
+ * we use the plug list to hold all the rbios
+ * waiting for the chance to lock this stripe.
+ * hand the lock over to one of them.
+ */
+ if (!list_empty(&rbio->plug_list)) {
+ struct btrfs_raid_bio *next;
+ struct list_head *head = rbio->plug_list.next;
+
+ next = list_entry(head, struct btrfs_raid_bio,
+ plug_list);
+
+ list_del_init(&rbio->plug_list);
+
+ list_add(&next->hash_list, &h->hash_list);
+ atomic_inc(&next->refs);
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (next->operation == BTRFS_RBIO_READ_REBUILD)
+ async_read_rebuild(next);
+ else if (next->operation == BTRFS_RBIO_WRITE) {
+ steal_rbio(rbio, next);
+ async_rmw_stripe(next);
+ } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
+ steal_rbio(rbio, next);
+ async_scrub_parity(next);
+ }
+
+ goto done_nolock;
+ } else if (waitqueue_active(&h->wait)) {
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+ wake_up(&h->wait);
+ goto done_nolock;
+ }
+ }
+done:
+ spin_unlock(&rbio->bio_list_lock);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+done_nolock:
+ if (!keep_cache)
+ remove_rbio_from_cache(rbio);
+}
+
+static void __free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ int i;
+
+ WARN_ON(atomic_read(&rbio->refs) < 0);
+ if (!atomic_dec_and_test(&rbio->refs))
+ return;
+
+ WARN_ON(!list_empty(&rbio->stripe_cache));
+ WARN_ON(!list_empty(&rbio->hash_list));
+ WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i]) {
+ __free_page(rbio->stripe_pages[i]);
+ rbio->stripe_pages[i] = NULL;
+ }
+ }
+
+ btrfs_put_bbio(rbio->bbio);
+ kfree(rbio);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+ unlock_stripe(rbio);
+ __free_raid_bio(rbio);
+}
+
+/*
+ * this frees the rbio and runs through all the bios in the
+ * bio_list and calls end_io on them
+ */
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+{
+ struct bio *cur = bio_list_get(&rbio->bio_list);
+ struct bio *next;
+
+ if (rbio->generic_bio_cnt)
+ btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+
+ free_raid_bio(rbio);
+
+ while (cur) {
+ next = cur->bi_next;
+ cur->bi_next = NULL;
+ if (uptodate)
+ set_bit(BIO_UPTODATE, &cur->bi_flags);
+ bio_endio(cur, err);
+ cur = next;
+ }
+}
+
+/*
+ * end io function used by finish_rmw. When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
+ return;
+
+ err = 0;
+
+ /* OK, we have read all the stripes we need to. */
+ if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ err = -EIO;
+
+ rbio_orig_end_io(rbio, err, 0);
+ return;
+}
+
+/*
+ * the read/modify/write code wants to use the original bio for
+ * any pages it included, and then use the rbio for everything
+ * else. This function decides if a given index (stripe number)
+ * and page number in that stripe fall inside the original bio
+ * or the rbio.
+ *
+ * if you set bio_list_only, you'll get a NULL back for any ranges
+ * that are outside the bio_list
+ *
+ * This doesn't take any refs on anything, you get a bare page pointer
+ * and the caller must bump refs as required.
+ *
+ * You must call index_rbio_pages once before you can trust
+ * the answers from this function.
+ */
+static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
+ int index, int pagenr, int bio_list_only)
+{
+ int chunk_page;
+ struct page *p = NULL;
+
+ chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ p = rbio->bio_pages[chunk_page];
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ if (p || bio_list_only)
+ return p;
+
+ return rbio->stripe_pages[chunk_page];
+}
+
+/*
+ * number of pages we need for the entire stripe across all the
+ * drives
+ */
+static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
+{
+ unsigned long nr = stripe_len * nr_stripes;
+ return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
+}
+
+/*
+ * allocation and initial setup for the btrfs_raid_bio. Not
+ * this does not allocate any pages for rbio->pages.
+ */
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
+ struct btrfs_bio *bbio, u64 stripe_len)
+{
+ struct btrfs_raid_bio *rbio;
+ int nr_data = 0;
+ int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ int num_pages = rbio_nr_pages(stripe_len, real_stripes);
+ int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
+ void *p;
+
+ rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
+ DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
+ GFP_NOFS);
+ if (!rbio)
+ return ERR_PTR(-ENOMEM);
+
+ bio_list_init(&rbio->bio_list);
+ INIT_LIST_HEAD(&rbio->plug_list);
+ spin_lock_init(&rbio->bio_list_lock);
+ INIT_LIST_HEAD(&rbio->stripe_cache);
+ INIT_LIST_HEAD(&rbio->hash_list);
+ rbio->bbio = bbio;
+ rbio->fs_info = root->fs_info;
+ rbio->stripe_len = stripe_len;
+ rbio->nr_pages = num_pages;
+ rbio->real_stripes = real_stripes;
+ rbio->stripe_npages = stripe_npages;
+ rbio->faila = -1;
+ rbio->failb = -1;
+ atomic_set(&rbio->refs, 1);
+ atomic_set(&rbio->error, 0);
+ atomic_set(&rbio->stripes_pending, 0);
+
+ /*
+ * the stripe_pages and bio_pages array point to the extra
+ * memory we allocated past the end of the rbio
+ */
+ p = rbio + 1;
+ rbio->stripe_pages = p;
+ rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+ rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
+
+ if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ nr_data = real_stripes - 1;
+ else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ nr_data = real_stripes - 2;
+ else
+ BUG();
+
+ rbio->nr_data = nr_data;
+ return rbio;
+}
+
+/* allocate pages for all the stripes in the bio, including parity */
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i])
+ continue;
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[i] = page;
+ ClearPageUptodate(page);
+ }
+ return 0;
+}
+
+/* allocate pages for just the p/q stripes */
+static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ struct page *page;
+
+ i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+
+ for (; i < rbio->nr_pages; i++) {
+ if (rbio->stripe_pages[i])
+ continue;
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[i] = page;
+ }
+ return 0;
+}
+
+/*
+ * add a single page from a specific stripe into our list of bios for IO
+ * this will try to merge into existing bios if possible, and returns
+ * zero if all went well.
+ */
+static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct page *page,
+ int stripe_nr,
+ unsigned long page_index,
+ unsigned long bio_max_len)
+{
+ struct bio *last = bio_list->tail;
+ u64 last_end = 0;
+ int ret;
+ struct bio *bio;
+ struct btrfs_bio_stripe *stripe;
+ u64 disk_start;
+
+ stripe = &rbio->bbio->stripes[stripe_nr];
+ disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+
+ /* if the device is missing, just fail this stripe */
+ if (!stripe->dev->bdev)
+ return fail_rbio_index(rbio, stripe_nr);
+
+ /* see if we can add this page onto our existing bio */
+ if (last) {
+ last_end = (u64)last->bi_iter.bi_sector << 9;
+ last_end += last->bi_iter.bi_size;
+
+ /*
+ * we can't merge these if they are from different
+ * devices or if they are not contiguous
+ */
+ if (last_end == disk_start && stripe->dev->bdev &&
+ test_bit(BIO_UPTODATE, &last->bi_flags) &&
+ last->bi_bdev == stripe->dev->bdev) {
+ ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
+ if (ret == PAGE_CACHE_SIZE)
+ return 0;
+ }
+ }
+
+ /* put a new bio on the list */
+ bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_iter.bi_size = 0;
+ bio->bi_bdev = stripe->dev->bdev;
+ bio->bi_iter.bi_sector = disk_start >> 9;
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ bio_list_add(bio_list, bio);
+ return 0;
+}
+
+/*
+ * while we're doing the read/modify/write cycle, we could
+ * have errors in reading pages off the disk. This checks
+ * for errors and if we're not able to read the page it'll
+ * trigger parity reconstruction. The rmw will be finished
+ * after we've reconstructed the failed stripes
+ */
+static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
+{
+ if (rbio->faila >= 0 || rbio->failb >= 0) {
+ BUG_ON(rbio->faila == rbio->real_stripes - 1);
+ __raid56_parity_recover(rbio);
+ } else {
+ finish_rmw(rbio);
+ }
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
+{
+ int index;
+ index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
+ index += page;
+ return rbio->stripe_pages[index];
+}
+
+/*
+ * helper function to walk our bio list and populate the bio_pages array with
+ * the result. This seems expensive, but it is faster than constantly
+ * searching through the bio list as we setup the IO in finish_rmw or stripe
+ * reconstruction.
+ *
+ * This must be called before you trust the answers from page_in_rbio
+ */
+static void index_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+ struct bio *bio;
+ u64 start;
+ unsigned long stripe_offset;
+ unsigned long page_index;
+ struct page *p;
+ int i;
+
+ spin_lock_irq(&rbio->bio_list_lock);
+ bio_list_for_each(bio, &rbio->bio_list) {
+ start = (u64)bio->bi_iter.bi_sector << 9;
+ stripe_offset = start - rbio->bbio->raid_map[0];
+ page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ p = bio->bi_io_vec[i].bv_page;
+ rbio->bio_pages[page_index + i] = p;
+ }
+ }
+ spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+/*
+ * this is called from one of two situations. We either
+ * have a full stripe from the higher layers, or we've read all
+ * the missing bits off disk.
+ *
+ * This will calculate the parity and then send down any
+ * changed blocks.
+ */
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+{
+ struct btrfs_bio *bbio = rbio->bbio;
+ void *pointers[rbio->real_stripes];
+ int stripe_len = rbio->stripe_len;
+ int nr_data = rbio->nr_data;
+ int stripe;
+ int pagenr;
+ int p_stripe = -1;
+ int q_stripe = -1;
+ struct bio_list bio_list;
+ struct bio *bio;
+ int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ if (rbio->real_stripes - rbio->nr_data == 1) {
+ p_stripe = rbio->real_stripes - 1;
+ } else if (rbio->real_stripes - rbio->nr_data == 2) {
+ p_stripe = rbio->real_stripes - 2;
+ q_stripe = rbio->real_stripes - 1;
+ } else {
+ BUG();
+ }
+
+ /* at this point we either have a full stripe,
+ * or we've read the full stripe from the drive.
+ * recalculate the parity and write the new results.
+ *
+ * We're not allowed to add any new bios to the
+ * bio list here, anyone else that wants to
+ * change this stripe needs to do their own rmw.
+ */
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+
+ atomic_set(&rbio->error, 0);
+
+ /*
+ * now that we've set rmw_locked, run through the
+ * bio list one last time and map the page pointers
+ *
+ * We don't cache full rbios because we're assuming
+ * the higher layers are unlikely to use this area of
+ * the disk again soon. If they do use it again,
+ * hopefully they will send another full bio.
+ */
+ index_rbio_pages(rbio);
+ if (!rbio_is_full(rbio))
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ struct page *p;
+ /* first collect one page from each data stripe */
+ for (stripe = 0; stripe < nr_data; stripe++) {
+ p = page_in_rbio(rbio, stripe, pagenr, 0);
+ pointers[stripe] = kmap(p);
+ }
+
+ /* then add the parity stripe */
+ p = rbio_pstripe_page(rbio, pagenr);
+ SetPageUptodate(p);
+ pointers[stripe++] = kmap(p);
+
+ if (q_stripe != -1) {
+
+ /*
+ * raid6, add the qstripe and call the
+ * library function to fill in our p/q
+ */
+ p = rbio_qstripe_page(rbio, pagenr);
+ SetPageUptodate(p);
+ pointers[stripe++] = kmap(p);
+
+ raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+ run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+ }
+
+
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+ kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ }
+
+ /*
+ * time to start writing. Make bios for everything from the
+ * higher layers (the bio_list in our rbio) and our p/q. Ignore
+ * everything else.
+ */
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ struct page *page;
+ if (stripe < rbio->nr_data) {
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (!page)
+ continue;
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+
+ ret = rbio_add_io_page(rbio, &bio_list,
+ page, stripe, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+ if (likely(!bbio->num_tgtdevs))
+ goto write_data;
+
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ if (!bbio->tgtdev_map[stripe])
+ continue;
+
+ for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ struct page *page;
+ if (stripe < rbio->nr_data) {
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (!page)
+ continue;
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+
+ ret = rbio_add_io_page(rbio, &bio_list, page,
+ rbio->bbio->tgtdev_map[stripe],
+ pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+write_data:
+ atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
+ BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
+
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_write_end_io;
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(WRITE, bio);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * helper to find the stripe number for a given bio. Used to figure out which
+ * stripe has failed. This expects the bio to correspond to a physical disk,
+ * so it looks up based on physical sector numbers.
+ */
+static int find_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ u64 physical = bio->bi_iter.bi_sector;
+ u64 stripe_start;
+ int i;
+ struct btrfs_bio_stripe *stripe;
+
+ physical <<= 9;
+
+ for (i = 0; i < rbio->bbio->num_stripes; i++) {
+ stripe = &rbio->bbio->stripes[i];
+ stripe_start = stripe->physical;
+ if (physical >= stripe_start &&
+ physical < stripe_start + rbio->stripe_len &&
+ bio->bi_bdev == stripe->dev->bdev) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+/*
+ * helper to find the stripe number for a given
+ * bio (before mapping). Used to figure out which stripe has
+ * failed. This looks up based on logical block numbers.
+ */
+static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ u64 logical = bio->bi_iter.bi_sector;
+ u64 stripe_start;
+ int i;
+
+ logical <<= 9;
+
+ for (i = 0; i < rbio->nr_data; i++) {
+ stripe_start = rbio->bbio->raid_map[i];
+ if (logical >= stripe_start &&
+ logical < stripe_start + rbio->stripe_len) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+/*
+ * returns -EIO if we had too many failures
+ */
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&rbio->bio_list_lock, flags);
+
+ /* we already know this stripe is bad, move on */
+ if (rbio->faila == failed || rbio->failb == failed)
+ goto out;
+
+ if (rbio->faila == -1) {
+ /* first failure on this rbio */
+ rbio->faila = failed;
+ atomic_inc(&rbio->error);
+ } else if (rbio->failb == -1) {
+ /* second failure on this rbio */
+ rbio->failb = failed;
+ atomic_inc(&rbio->error);
+ } else {
+ ret = -EIO;
+ }
+out:
+ spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+
+ return ret;
+}
+
+/*
+ * helper to fail a stripe based on a physical disk
+ * bio.
+ */
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
+ struct bio *bio)
+{
+ int failed = find_bio_stripe(rbio, bio);
+
+ if (failed < 0)
+ return -EIO;
+
+ return fail_rbio_index(rbio, failed);
+}
+
+/*
+ * this sets each page in the bio uptodate. It should only be used on private
+ * rbio pages, nothing that comes in from the higher layers
+ */
+static void set_bio_pages_uptodate(struct bio *bio)
+{
+ int i;
+ struct page *p;
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ p = bio->bi_io_vec[i].bv_page;
+ SetPageUptodate(p);
+ }
+}
+
+/*
+ * end io for the read phase of the rmw cycle. All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid_rmw_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+ else
+ set_bio_pages_uptodate(bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
+ return;
+
+ err = 0;
+ if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ goto cleanup;
+
+ /*
+ * this will normally call finish_rmw to start our write
+ * but if there are any failed stripes we'll reconstruct
+ * from parity first
+ */
+ validate_rbio_for_rmw(rbio);
+ return;
+
+cleanup:
+
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ rmw_work, NULL, NULL);
+
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
+}
+
+static void async_read_rebuild(struct btrfs_raid_bio *rbio)
+{
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ read_rebuild_work, NULL, NULL);
+
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
+}
+
+/*
+ * the stripe must be locked by the caller. It will
+ * unlock after all the writes are done
+ */
+static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct bio_list bio_list;
+ int ret;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
+ int pagenr;
+ int stripe;
+ struct bio *bio;
+
+ bio_list_init(&bio_list);
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ index_rbio_pages(rbio);
+
+ atomic_set(&rbio->error, 0);
+ /*
+ * build a list of bios to read all the missing parts of this
+ * stripe
+ */
+ for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ struct page *page;
+ /*
+ * we want to find all the pages missing from
+ * the rbio and read them from the disk. If
+ * page_in_rbio finds a page in the bio list
+ * we don't need to read it off the stripe.
+ */
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (page)
+ continue;
+
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ /*
+ * the bio cache may have handed us an uptodate
+ * page. If so, be happy and use it
+ */
+ if (PageUptodate(page))
+ continue;
+
+ ret = rbio_add_io_page(rbio, &bio_list, page,
+ stripe, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * this can happen if others have merged with
+ * us, it means there is nothing left to read.
+ * But if there are missing devices it may not be
+ * safe to do the full stripe write yet.
+ */
+ goto finish;
+ }
+
+ /*
+ * the bbio may be freed once we submit the last bio. Make sure
+ * not to touch it after that
+ */
+ atomic_set(&rbio->stripes_pending, bios_to_read);
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_rmw_end_io;
+
+ btrfs_bio_wq_end_io(rbio->fs_info, bio,
+ BTRFS_WQ_ENDIO_RAID56);
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(READ, bio);
+ }
+ /* the actual write will happen once the reads are done */
+ return 0;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+ return -EIO;
+
+finish:
+ validate_rbio_for_rmw(rbio);
+ return 0;
+}
+
+/*
+ * if the upper layers pass in a full stripe, we thank them by only allocating
+ * enough pages to hold the parity, and sending it all down quickly.
+ */
+static int full_stripe_write(struct btrfs_raid_bio *rbio)
+{
+ int ret;
+
+ ret = alloc_rbio_parity_pages(rbio);
+ if (ret) {
+ __free_raid_bio(rbio);
+ return ret;
+ }
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0)
+ finish_rmw(rbio);
+ return 0;
+}
+
+/*
+ * partial stripe writes get handed over to async helpers.
+ * We're really hoping to merge a few more writes into this
+ * rbio before calculating new parity
+ */
+static int partial_stripe_write(struct btrfs_raid_bio *rbio)
+{
+ int ret;
+
+ ret = lock_stripe_add(rbio);
+ if (ret == 0)
+ async_rmw_stripe(rbio);
+ return 0;
+}
+
+/*
+ * sometimes while we were reading from the drive to
+ * recalculate parity, enough new bios come into create
+ * a full stripe. So we do a check here to see if we can
+ * go directly to finish_rmw
+ */
+static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
+{
+ /* head off into rmw land if we don't have a full stripe */
+ if (!rbio_is_full(rbio))
+ return partial_stripe_write(rbio);
+ return full_stripe_write(rbio);
+}
+
+/*
+ * We use plugging call backs to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list. When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+ struct blk_plug_cb cb;
+ struct btrfs_fs_info *info;
+ struct list_head rbio_list;
+ struct btrfs_work work;
+};
+
+/*
+ * rbios on the plug list are sorted for easier merging.
+ */
+static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+ plug_list);
+ struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+ plug_list);
+ u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
+ u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
+
+ if (a_sector < b_sector)
+ return -1;
+ if (a_sector > b_sector)
+ return 1;
+ return 0;
+}
+
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *last = NULL;
+
+ /*
+ * sort our plug list then try to merge
+ * everything we can in hopes of creating full
+ * stripes.
+ */
+ list_sort(NULL, &plug->rbio_list, plug_cmp);
+ while (!list_empty(&plug->rbio_list)) {
+ cur = list_entry(plug->rbio_list.next,
+ struct btrfs_raid_bio, plug_list);
+ list_del_init(&cur->plug_list);
+
+ if (rbio_is_full(cur)) {
+ /* we have a full stripe, send it down */
+ full_stripe_write(cur);
+ continue;
+ }
+ if (last) {
+ if (rbio_can_merge(last, cur)) {
+ merge_rbio(last, cur);
+ __free_raid_bio(cur);
+ continue;
+
+ }
+ __raid56_parity_write(last);
+ }
+ last = cur;
+ }
+ if (last) {
+ __raid56_parity_write(last);
+ }
+ kfree(plug);
+}
+
+/*
+ * if the unplug comes from schedule, we have to push the
+ * work off to a helper thread
+ */
+static void unplug_work(struct btrfs_work *work)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(work, struct btrfs_plug_cb, work);
+ run_plug(plug);
+}
+
+static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+
+ if (from_schedule) {
+ btrfs_init_work(&plug->work, btrfs_rmw_helper,
+ unplug_work, NULL, NULL);
+ btrfs_queue_work(plug->info->rmw_workers,
+ &plug->work);
+ return;
+ }
+ run_plug(plug);
+}
+
+/*
+ * our main entry point for writes from the rest of the FS.
+ */
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 stripe_len)
+{
+ struct btrfs_raid_bio *rbio;
+ struct btrfs_plug_cb *plug = NULL;
+ struct blk_plug_cb *cb;
+ int ret;
+
+ rbio = alloc_rbio(root, bbio, stripe_len);
+ if (IS_ERR(rbio)) {
+ btrfs_put_bbio(bbio);
+ return PTR_ERR(rbio);
+ }
+ bio_list_add(&rbio->bio_list, bio);
+ rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio->operation = BTRFS_RBIO_WRITE;
+
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+ rbio->generic_bio_cnt = 1;
+
+ /*
+ * don't plug on full rbios, just get them out the door
+ * as quickly as we can
+ */
+ if (rbio_is_full(rbio)) {
+ ret = full_stripe_write(rbio);
+ if (ret)
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
+ }
+
+ cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
+ sizeof(*plug));
+ if (cb) {
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+ if (!plug->info) {
+ plug->info = root->fs_info;
+ INIT_LIST_HEAD(&plug->rbio_list);
+ }
+ list_add_tail(&rbio->plug_list, &plug->rbio_list);
+ ret = 0;
+ } else {
+ ret = __raid56_parity_write(rbio);
+ if (ret)
+ btrfs_bio_counter_dec(root->fs_info);
+ }
+ return ret;
+}
+
+/*
+ * all parity reconstruction happens here. We've read in everything
+ * we can find from the drives and this does the heavy lifting of
+ * sorting the good from the bad.
+ */
+static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+{
+ int pagenr, stripe;
+ void **pointers;
+ int faila = -1, failb = -1;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
+ struct page *page;
+ int err;
+ int i;
+
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers) {
+ err = -ENOMEM;
+ goto cleanup_io;
+ }
+
+ faila = rbio->faila;
+ failb = rbio->failb;
+
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+ spin_lock_irq(&rbio->bio_list_lock);
+ set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+ spin_unlock_irq(&rbio->bio_list_lock);
+ }
+
+ index_rbio_pages(rbio);
+
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ /*
+ * Now we just use bitmap to mark the horizontal stripes in
+ * which we have data when doing parity scrub.
+ */
+ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+ !test_bit(pagenr, rbio->dbitmap))
+ continue;
+
+ /* setup our array of pointers with pages
+ * from each stripe
+ */
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ /*
+ * if we're rebuilding a read, we have to use
+ * pages from the bio list
+ */
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+ (stripe == faila || stripe == failb)) {
+ page = page_in_rbio(rbio, stripe, pagenr, 0);
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+ pointers[stripe] = kmap(page);
+ }
+
+ /* all raid6 handling here */
+ if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ /*
+ * single failure, rebuild from parity raid5
+ * style
+ */
+ if (failb < 0) {
+ if (faila == rbio->nr_data) {
+ /*
+ * Just the P stripe has failed, without
+ * a bad data or Q stripe.
+ * TODO, we should redo the xor here.
+ */
+ err = -EIO;
+ goto cleanup;
+ }
+ /*
+ * a single failure in raid6 is rebuilt
+ * in the pstripe code below
+ */
+ goto pstripe;
+ }
+
+ /* make sure our ps and qs are in order */
+ if (faila > failb) {
+ int tmp = failb;
+ failb = faila;
+ faila = tmp;
+ }
+
+ /* if the q stripe is failed, do a pstripe reconstruction
+ * from the xors.
+ * If both the q stripe and the P stripe are failed, we're
+ * here due to a crc mismatch and we can't give them the
+ * data they want
+ */
+ if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bbio->raid_map[faila] ==
+ RAID5_P_STRIPE) {
+ err = -EIO;
+ goto cleanup;
+ }
+ /*
+ * otherwise we have one bad data stripe and
+ * a good P stripe. raid5!
+ */
+ goto pstripe;
+ }
+
+ if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
+ raid6_datap_recov(rbio->real_stripes,
+ PAGE_SIZE, faila, pointers);
+ } else {
+ raid6_2data_recov(rbio->real_stripes,
+ PAGE_SIZE, faila, failb,
+ pointers);
+ }
+ } else {
+ void *p;
+
+ /* rebuild from P stripe here (raid5 or raid6) */
+ BUG_ON(failb != -1);
+pstripe:
+ /* Copy parity block into failed block to start with */
+ memcpy(pointers[faila],
+ pointers[rbio->nr_data],
+ PAGE_CACHE_SIZE);
+
+ /* rearrange the pointer array */
+ p = pointers[faila];
+ for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
+ pointers[stripe] = pointers[stripe + 1];
+ pointers[rbio->nr_data - 1] = p;
+
+ /* xor in the rest */
+ run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+ }
+ /* if we're doing this rebuild as part of an rmw, go through
+ * and set all of our private rbio pages in the
+ * failed stripes as uptodate. This way finish_rmw will
+ * know they can be trusted. If this was a read reconstruction,
+ * other endio functions will fiddle the uptodate bits
+ */
+ if (rbio->operation == BTRFS_RBIO_WRITE) {
+ for (i = 0; i < nr_pages; i++) {
+ if (faila != -1) {
+ page = rbio_stripe_page(rbio, faila, i);
+ SetPageUptodate(page);
+ }
+ if (failb != -1) {
+ page = rbio_stripe_page(rbio, failb, i);
+ SetPageUptodate(page);
+ }
+ }
+ }
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ /*
+ * if we're rebuilding a read, we have to use
+ * pages from the bio list
+ */
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+ (stripe == faila || stripe == failb)) {
+ page = page_in_rbio(rbio, stripe, pagenr, 0);
+ } else {
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ }
+ kunmap(page);
+ }
+ }
+
+ err = 0;
+cleanup:
+ kfree(pointers);
+
+cleanup_io:
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+ if (err == 0)
+ cache_rbio_pages(rbio);
+ else
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ rbio_orig_end_io(rbio, err, err == 0);
+ } else if (err == 0) {
+ rbio->faila = -1;
+ rbio->failb = -1;
+
+ if (rbio->operation == BTRFS_RBIO_WRITE)
+ finish_rmw(rbio);
+ else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
+ finish_parity_scrub(rbio, 0);
+ else
+ BUG();
+ } else {
+ rbio_orig_end_io(rbio, err, 0);
+ }
+}
+
+/*
+ * This is called only for stripes we've read from disk to
+ * reconstruct the parity.
+ */
+static void raid_recover_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ /*
+ * we only read stripe pages off the disk, set them
+ * up to date if there were no errors
+ */
+ if (err)
+ fail_bio_stripe(rbio, bio);
+ else
+ set_bio_pages_uptodate(bio);
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
+ return;
+
+ if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ rbio_orig_end_io(rbio, -EIO, 0);
+ else
+ __raid_recover_end_io(rbio);
+}
+
+/*
+ * reads everything we need off the disk to reconstruct
+ * the parity. endio handlers trigger final reconstruction
+ * when the IO is done.
+ *
+ * This is used both for reads from the higher layers and for
+ * parity construction required to finish a rmw cycle.
+ */
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct bio_list bio_list;
+ int ret;
+ int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
+ int pagenr;
+ int stripe;
+ struct bio *bio;
+
+ bio_list_init(&bio_list);
+
+ ret = alloc_rbio_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ atomic_set(&rbio->error, 0);
+
+ /*
+ * read everything that hasn't failed. Thanks to the
+ * stripe cache, it is possible that some or all of these
+ * pages are going to be uptodate.
+ */
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ if (rbio->faila == stripe || rbio->failb == stripe) {
+ atomic_inc(&rbio->error);
+ continue;
+ }
+
+ for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ struct page *p;
+
+ /*
+ * the rmw code may have already read this
+ * page in
+ */
+ p = rbio_stripe_page(rbio, stripe, pagenr);
+ if (PageUptodate(p))
+ continue;
+
+ ret = rbio_add_io_page(rbio, &bio_list,
+ rbio_stripe_page(rbio, stripe, pagenr),
+ stripe, pagenr, rbio->stripe_len);
+ if (ret < 0)
+ goto cleanup;
+ }
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * we might have no bios to read just because the pages
+ * were up to date, or we might have no bios to read because
+ * the devices were gone.
+ */
+ if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
+ __raid_recover_end_io(rbio);
+ goto out;
+ } else {
+ goto cleanup;
+ }
+ }
+
+ /*
+ * the bbio may be freed once we submit the last bio. Make sure
+ * not to touch it after that
+ */
+ atomic_set(&rbio->stripes_pending, bios_to_read);
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_recover_end_io;
+
+ btrfs_bio_wq_end_io(rbio->fs_info, bio,
+ BTRFS_WQ_ENDIO_RAID56);
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(READ, bio);
+ }
+out:
+ return 0;
+
+cleanup:
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
+ rbio_orig_end_io(rbio, -EIO, 0);
+ return -EIO;
+}
+
+/*
+ * the main entry point for reads from the higher layers. This
+ * is really only called when the normal read path had a failure,
+ * so we assume the bio they send down corresponds to a failed part
+ * of the drive.
+ */
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ int mirror_num, int generic_io)
+{
+ struct btrfs_raid_bio *rbio;
+ int ret;
+
+ rbio = alloc_rbio(root, bbio, stripe_len);
+ if (IS_ERR(rbio)) {
+ if (generic_io)
+ btrfs_put_bbio(bbio);
+ return PTR_ERR(rbio);
+ }
+
+ rbio->operation = BTRFS_RBIO_READ_REBUILD;
+ bio_list_add(&rbio->bio_list, bio);
+ rbio->bio_list_bytes = bio->bi_iter.bi_size;
+
+ rbio->faila = find_logical_bio_stripe(rbio, bio);
+ if (rbio->faila == -1) {
+ BUG();
+ if (generic_io)
+ btrfs_put_bbio(bbio);
+ kfree(rbio);
+ return -EIO;
+ }
+
+ if (generic_io) {
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+ rbio->generic_bio_cnt = 1;
+ } else {
+ btrfs_get_bbio(bbio);
+ }
+
+ /*
+ * reconstruct from the q stripe if they are
+ * asking for mirror 3
+ */
+ if (mirror_num == 3)
+ rbio->failb = rbio->real_stripes - 2;
+
+ ret = lock_stripe_add(rbio);
+
+ /*
+ * __raid56_parity_recover will end the bio with
+ * any errors it hits. We don't want to return
+ * its error value up the stack because our caller
+ * will end up calling bio_endio with any nonzero
+ * return
+ */
+ if (ret == 0)
+ __raid56_parity_recover(rbio);
+ /*
+ * our rbio has been added to the list of
+ * rbios that will be handled after the
+ * currently lock owner is done
+ */
+ return 0;
+
+}
+
+static void rmw_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ raid56_rmw_stripe(rbio);
+}
+
+static void read_rebuild_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ __raid56_parity_recover(rbio);
+}
+
+/*
+ * The following code is used to scrub/replace the parity stripe
+ *
+ * Note: We need make sure all the pages that add into the scrub/replace
+ * raid bio are correct and not be changed during the scrub/replace. That
+ * is those pages just hold metadata or file data with checksum.
+ */
+
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors)
+{
+ struct btrfs_raid_bio *rbio;
+ int i;
+
+ rbio = alloc_rbio(root, bbio, stripe_len);
+ if (IS_ERR(rbio))
+ return NULL;
+ bio_list_add(&rbio->bio_list, bio);
+ /*
+ * This is a special bio which is used to hold the completion handler
+ * and make the scrub rbio is similar to the other types
+ */
+ ASSERT(!bio->bi_iter.bi_size);
+ rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
+
+ for (i = 0; i < rbio->real_stripes; i++) {
+ if (bbio->stripes[i].dev == scrub_dev) {
+ rbio->scrubp = i;
+ break;
+ }
+ }
+
+ /* Now we just support the sectorsize equals to page size */
+ ASSERT(root->sectorsize == PAGE_SIZE);
+ ASSERT(rbio->stripe_npages == stripe_nsectors);
+ bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+
+ return rbio;
+}
+
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+ struct page *page, u64 logical)
+{
+ int stripe_offset;
+ int index;
+
+ ASSERT(logical >= rbio->bbio->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
+ rbio->stripe_len * rbio->nr_data);
+ stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
+ index = stripe_offset >> PAGE_CACHE_SHIFT;
+ rbio->bio_pages[index] = page;
+}
+
+/*
+ * We just scrub the parity that we have correct data on the same horizontal,
+ * so we needn't allocate all pages for all the stripes.
+ */
+static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
+{
+ int i;
+ int bit;
+ int index;
+ struct page *page;
+
+ for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
+ for (i = 0; i < rbio->real_stripes; i++) {
+ index = i * rbio->stripe_npages + bit;
+ if (rbio->stripe_pages[index])
+ continue;
+
+ page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[index] = page;
+ ClearPageUptodate(page);
+ }
+ }
+ return 0;
+}
+
+/*
+ * end io function used by finish_rmw. When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_parity_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
+ return;
+
+ err = 0;
+
+ if (atomic_read(&rbio->error))
+ err = -EIO;
+
+ rbio_orig_end_io(rbio, err, 0);
+}
+
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+ int need_check)
+{
+ struct btrfs_bio *bbio = rbio->bbio;
+ void *pointers[rbio->real_stripes];
+ DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
+ int nr_data = rbio->nr_data;
+ int stripe;
+ int pagenr;
+ int p_stripe = -1;
+ int q_stripe = -1;
+ struct page *p_page = NULL;
+ struct page *q_page = NULL;
+ struct bio_list bio_list;
+ struct bio *bio;
+ int is_replace = 0;
+ int ret;
+
+ bio_list_init(&bio_list);
+
+ if (rbio->real_stripes - rbio->nr_data == 1) {
+ p_stripe = rbio->real_stripes - 1;
+ } else if (rbio->real_stripes - rbio->nr_data == 2) {
+ p_stripe = rbio->real_stripes - 2;
+ q_stripe = rbio->real_stripes - 1;
+ } else {
+ BUG();
+ }
+
+ if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+ is_replace = 1;
+ bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+ }
+
+ /*
+ * Because the higher layers(scrubber) are unlikely to
+ * use this area of the disk again soon, so don't cache
+ * it.
+ */
+ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+ if (!need_check)
+ goto writeback;
+
+ p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!p_page)
+ goto cleanup;
+ SetPageUptodate(p_page);
+
+ if (q_stripe != -1) {
+ q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!q_page) {
+ __free_page(p_page);
+ goto cleanup;
+ }
+ SetPageUptodate(q_page);
+ }
+
+ atomic_set(&rbio->error, 0);
+
+ for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+ struct page *p;
+ void *parity;
+ /* first collect one page from each data stripe */
+ for (stripe = 0; stripe < nr_data; stripe++) {
+ p = page_in_rbio(rbio, stripe, pagenr, 0);
+ pointers[stripe] = kmap(p);
+ }
+
+ /* then add the parity stripe */
+ pointers[stripe++] = kmap(p_page);
+
+ if (q_stripe != -1) {
+
+ /*
+ * raid6, add the qstripe and call the
+ * library function to fill in our p/q
+ */
+ pointers[stripe++] = kmap(q_page);
+
+ raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ pointers);
+ } else {
+ /* raid5 */
+ memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+ run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+ }
+
+ /* Check scrubbing pairty and repair it */
+ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+ parity = kmap(p);
+ if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
+ memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
+ else
+ /* Parity is right, needn't writeback */
+ bitmap_clear(rbio->dbitmap, pagenr, 1);
+ kunmap(p);
+
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+ kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ }
+
+ __free_page(p_page);
+ if (q_page)
+ __free_page(q_page);
+
+writeback:
+ /*
+ * time to start writing. Make bios for everything from the
+ * higher layers (the bio_list in our rbio) and our p/q. Ignore
+ * everything else.
+ */
+ for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+ struct page *page;
+
+ page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+ ret = rbio_add_io_page(rbio, &bio_list,
+ page, rbio->scrubp, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+
+ if (!is_replace)
+ goto submit_write;
+
+ for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
+ struct page *page;
+
+ page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+ ret = rbio_add_io_page(rbio, &bio_list, page,
+ bbio->tgtdev_map[rbio->scrubp],
+ pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+
+submit_write:
+ nr_data = bio_list_size(&bio_list);
+ if (!nr_data) {
+ /* Every parity is right */
+ rbio_orig_end_io(rbio, 0, 0);
+ return;
+ }
+
+ atomic_set(&rbio->stripes_pending, nr_data);
+
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid_write_parity_end_io;
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(WRITE, bio);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
+{
+ if (stripe >= 0 && stripe < rbio->nr_data)
+ return 1;
+ return 0;
+}
+
+/*
+ * While we're doing the parity check and repair, we could have errors
+ * in reading pages off the disk. This checks for errors and if we're
+ * not able to read the page it'll trigger parity reconstruction. The
+ * parity scrub will be finished after we've reconstructed the failed
+ * stripes
+ */
+static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+{
+ if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ goto cleanup;
+
+ if (rbio->faila >= 0 || rbio->failb >= 0) {
+ int dfail = 0, failp = -1;
+
+ if (is_data_stripe(rbio, rbio->faila))
+ dfail++;
+ else if (is_parity_stripe(rbio->faila))
+ failp = rbio->faila;
+
+ if (is_data_stripe(rbio, rbio->failb))
+ dfail++;
+ else if (is_parity_stripe(rbio->failb))
+ failp = rbio->failb;
+
+ /*
+ * Because we can not use a scrubbing parity to repair
+ * the data, so the capability of the repair is declined.
+ * (In the case of RAID5, we can not repair anything)
+ */
+ if (dfail > rbio->bbio->max_errors - 1)
+ goto cleanup;
+
+ /*
+ * If all data is good, only parity is correctly, just
+ * repair the parity.
+ */
+ if (dfail == 0) {
+ finish_parity_scrub(rbio, 0);
+ return;
+ }
+
+ /*
+ * Here means we got one corrupted data stripe and one
+ * corrupted parity on RAID6, if the corrupted parity
+ * is scrubbing parity, luckly, use the other one to repair
+ * the data, or we can not repair the data stripe.
+ */
+ if (failp != rbio->scrubp)
+ goto cleanup;
+
+ __raid_recover_end_io(rbio);
+ } else {
+ finish_parity_scrub(rbio, 1);
+ }
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * end io for the read phase of the rmw cycle. All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_parity_scrub_end_io(struct bio *bio, int err)
+{
+ struct btrfs_raid_bio *rbio = bio->bi_private;
+
+ if (err)
+ fail_bio_stripe(rbio, bio);
+ else
+ set_bio_pages_uptodate(bio);
+
+ bio_put(bio);
+
+ if (!atomic_dec_and_test(&rbio->stripes_pending))
+ return;
+
+ /*
+ * this will normally call finish_rmw to start our write
+ * but if there are any failed stripes we'll reconstruct
+ * from parity first
+ */
+ validate_rbio_for_parity_scrub(rbio);
+}
+
+static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+{
+ int bios_to_read = 0;
+ struct bio_list bio_list;
+ int ret;
+ int pagenr;
+ int stripe;
+ struct bio *bio;
+
+ ret = alloc_rbio_essential_pages(rbio);
+ if (ret)
+ goto cleanup;
+
+ bio_list_init(&bio_list);
+
+ atomic_set(&rbio->error, 0);
+ /*
+ * build a list of bios to read all the missing parts of this
+ * stripe
+ */
+ for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+ struct page *page;
+ /*
+ * we want to find all the pages missing from
+ * the rbio and read them from the disk. If
+ * page_in_rbio finds a page in the bio list
+ * we don't need to read it off the stripe.
+ */
+ page = page_in_rbio(rbio, stripe, pagenr, 1);
+ if (page)
+ continue;
+
+ page = rbio_stripe_page(rbio, stripe, pagenr);
+ /*
+ * the bio cache may have handed us an uptodate
+ * page. If so, be happy and use it
+ */
+ if (PageUptodate(page))
+ continue;
+
+ ret = rbio_add_io_page(rbio, &bio_list, page,
+ stripe, pagenr, rbio->stripe_len);
+ if (ret)
+ goto cleanup;
+ }
+ }
+
+ bios_to_read = bio_list_size(&bio_list);
+ if (!bios_to_read) {
+ /*
+ * this can happen if others have merged with
+ * us, it means there is nothing left to read.
+ * But if there are missing devices it may not be
+ * safe to do the full stripe write yet.
+ */
+ goto finish;
+ }
+
+ /*
+ * the bbio may be freed once we submit the last bio. Make sure
+ * not to touch it after that
+ */
+ atomic_set(&rbio->stripes_pending, bios_to_read);
+ while (1) {
+ bio = bio_list_pop(&bio_list);
+ if (!bio)
+ break;
+
+ bio->bi_private = rbio;
+ bio->bi_end_io = raid56_parity_scrub_end_io;
+
+ btrfs_bio_wq_end_io(rbio->fs_info, bio,
+ BTRFS_WQ_ENDIO_RAID56);
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+ submit_bio(READ, bio);
+ }
+ /* the actual write will happen once the reads are done */
+ return;
+
+cleanup:
+ rbio_orig_end_io(rbio, -EIO, 0);
+ return;
+
+finish:
+ validate_rbio_for_parity_scrub(rbio);
+}
+
+static void scrub_parity_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ raid56_parity_scrub_stripe(rbio);
+}
+
+static void async_scrub_parity(struct btrfs_raid_bio *rbio)
+{
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ scrub_parity_work, NULL, NULL);
+
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
+}
+
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+ if (!lock_stripe_add(rbio))
+ async_scrub_parity(rbio);
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 000000000..2b5d7977d
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2012 Fusion-io All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_RAID56__
+#define __BTRFS_RAID56__
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+ if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ return 1;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ return 2;
+ else
+ return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+ return map->num_stripes - nr_parity_stripes(map);
+}
+#define RAID5_P_STRIPE ((u64)-2)
+#define RAID6_Q_STRIPE ((u64)-1)
+
+#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
+ ((x) == RAID6_Q_STRIPE))
+
+struct btrfs_raid_bio;
+struct btrfs_device;
+
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ int mirror_num, int generic_io);
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 stripe_len);
+
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors);
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+ struct page *page, u64 logical);
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
+
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
+#endif
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
new file mode 100644
index 000000000..9e111e457
--- /dev/null
+++ b/fs/btrfs/rcu-string.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+struct rcu_string {
+ struct rcu_head rcu;
+ char str[0];
+};
+
+static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
+{
+ size_t len = strlen(src) + 1;
+ struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
+ (len * sizeof(char)), mask);
+ if (!ret)
+ return ret;
+ strncpy(ret->str, src, len);
+ return ret;
+}
+
+static inline void rcu_string_free(struct rcu_string *str)
+{
+ if (str)
+ kfree_rcu(str, rcu);
+}
+
+#define printk_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define printk_ratelimited_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk_ratelimited(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define rcu_str_deref(rcu_str) ({ \
+ struct rcu_string *__str = rcu_dereference(rcu_str); \
+ __str->str; \
+})
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000..0e7beea92
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,992 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "dev-replace.h"
+
+#undef DEBUG
+
+/*
+ * This is the implementation for the generic read ahead framework.
+ *
+ * To trigger a readahead, btrfs_reada_add must be called. It will start
+ * a read ahead for the given range [start, end) on tree root. The returned
+ * handle can either be used to wait on the readahead to finish
+ * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
+ *
+ * The read ahead works as follows:
+ * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
+ * reada_start_machine will then search for extents to prefetch and trigger
+ * some reads. When a read finishes for a node, all contained node/leaf
+ * pointers that lie in the given range will also be enqueued. The reads will
+ * be triggered in sequential order, thus giving a big win over a naive
+ * enumeration. It will also make use of multi-device layouts. Each disk
+ * will have its on read pointer and all disks will by utilized in parallel.
+ * Also will no two disks read both sides of a mirror simultaneously, as this
+ * would waste seeking capacity. Instead both disks will read different parts
+ * of the filesystem.
+ * Any number of readaheads can be started in parallel. The read order will be
+ * determined globally, i.e. 2 parallel readaheads will normally finish faster
+ * than the 2 started one after another.
+ */
+
+#define MAX_IN_FLIGHT 6
+
+struct reada_extctl {
+ struct list_head list;
+ struct reada_control *rc;
+ u64 generation;
+};
+
+struct reada_extent {
+ u64 logical;
+ struct btrfs_key top;
+ int err;
+ struct list_head extctl;
+ int refcnt;
+ spinlock_t lock;
+ struct reada_zone *zones[BTRFS_MAX_MIRRORS];
+ int nzones;
+ struct btrfs_device *scheduled_for;
+};
+
+struct reada_zone {
+ u64 start;
+ u64 end;
+ u64 elems;
+ struct list_head list;
+ spinlock_t lock;
+ int locked;
+ struct btrfs_device *device;
+ struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
+ * self */
+ int ndevs;
+ struct kref refcnt;
+};
+
+struct reada_machine_work {
+ struct btrfs_work work;
+ struct btrfs_fs_info *fs_info;
+};
+
+static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
+static void reada_control_release(struct kref *kref);
+static void reada_zone_release(struct kref *kref);
+static void reada_start_machine(struct btrfs_fs_info *fs_info);
+static void __reada_start_machine(struct btrfs_fs_info *fs_info);
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+ struct btrfs_key *top, int level, u64 generation);
+
+/* recurses */
+/* in case of err, eb might be NULL */
+static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err)
+{
+ int level = 0;
+ int nritems;
+ int i;
+ u64 bytenr;
+ u64 generation;
+ struct reada_extent *re;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head list;
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ struct btrfs_device *for_dev;
+
+ if (eb)
+ level = btrfs_header_level(eb);
+
+ /* find extent */
+ spin_lock(&fs_info->reada_lock);
+ re = radix_tree_lookup(&fs_info->reada_tree, index);
+ if (re)
+ re->refcnt++;
+ spin_unlock(&fs_info->reada_lock);
+
+ if (!re)
+ return -1;
+
+ spin_lock(&re->lock);
+ /*
+ * just take the full list from the extent. afterwards we
+ * don't need the lock anymore
+ */
+ list_replace_init(&re->extctl, &list);
+ for_dev = re->scheduled_for;
+ re->scheduled_for = NULL;
+ spin_unlock(&re->lock);
+
+ if (err == 0) {
+ nritems = level ? btrfs_header_nritems(eb) : 0;
+ generation = btrfs_header_generation(eb);
+ /*
+ * FIXME: currently we just set nritems to 0 if this is a leaf,
+ * effectively ignoring the content. In a next step we could
+ * trigger more readahead depending from the content, e.g.
+ * fetch the checksums for the extents in the leaf.
+ */
+ } else {
+ /*
+ * this is the error case, the extent buffer has not been
+ * read correctly. We won't access anything from it and
+ * just cleanup our data structures. Effectively this will
+ * cut the branch below this node from read ahead.
+ */
+ nritems = 0;
+ generation = 0;
+ }
+
+ for (i = 0; i < nritems; i++) {
+ struct reada_extctl *rec;
+ u64 n_gen;
+ struct btrfs_key key;
+ struct btrfs_key next_key;
+
+ btrfs_node_key_to_cpu(eb, &key, i);
+ if (i + 1 < nritems)
+ btrfs_node_key_to_cpu(eb, &next_key, i + 1);
+ else
+ next_key = re->top;
+ bytenr = btrfs_node_blockptr(eb, i);
+ n_gen = btrfs_node_ptr_generation(eb, i);
+
+ list_for_each_entry(rec, &list, list) {
+ struct reada_control *rc = rec->rc;
+
+ /*
+ * if the generation doesn't match, just ignore this
+ * extctl. This will probably cut off a branch from
+ * prefetch. Alternatively one could start a new (sub-)
+ * prefetch for this branch, starting again from root.
+ * FIXME: move the generation check out of this loop
+ */
+#ifdef DEBUG
+ if (rec->generation != generation) {
+ btrfs_debug(root->fs_info,
+ "generation mismatch for (%llu,%d,%llu) %llu != %llu",
+ key.objectid, key.type, key.offset,
+ rec->generation, generation);
+ }
+#endif
+ if (rec->generation == generation &&
+ btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
+ btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
+ reada_add_block(rc, bytenr, &next_key,
+ level - 1, n_gen);
+ }
+ }
+ /*
+ * free extctl records
+ */
+ while (!list_empty(&list)) {
+ struct reada_control *rc;
+ struct reada_extctl *rec;
+
+ rec = list_first_entry(&list, struct reada_extctl, list);
+ list_del(&rec->list);
+ rc = rec->rc;
+ kfree(rec);
+
+ kref_get(&rc->refcnt);
+ if (atomic_dec_and_test(&rc->elems)) {
+ kref_put(&rc->refcnt, reada_control_release);
+ wake_up(&rc->wait);
+ }
+ kref_put(&rc->refcnt, reada_control_release);
+
+ reada_extent_put(fs_info, re); /* one ref for each entry */
+ }
+ reada_extent_put(fs_info, re); /* our ref */
+ if (for_dev)
+ atomic_dec(&for_dev->reada_in_flight);
+
+ return 0;
+}
+
+/*
+ * start is passed separately in case eb in NULL, which may be the case with
+ * failed I/O
+ */
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+ u64 start, int err)
+{
+ int ret;
+
+ ret = __readahead_hook(root, eb, start, err);
+
+ reada_start_machine(root->fs_info);
+
+ return ret;
+}
+
+static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev, u64 logical,
+ struct btrfs_bio *bbio)
+{
+ int ret;
+ struct reada_zone *zone;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 start;
+ u64 end;
+ int i;
+
+ zone = NULL;
+ spin_lock(&fs_info->reada_lock);
+ ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+ logical >> PAGE_CACHE_SHIFT, 1);
+ if (ret == 1)
+ kref_get(&zone->refcnt);
+ spin_unlock(&fs_info->reada_lock);
+
+ if (ret == 1) {
+ if (logical >= zone->start && logical < zone->end)
+ return zone;
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+ if (!cache)
+ return NULL;
+
+ start = cache->key.objectid;
+ end = start + cache->key.offset - 1;
+ btrfs_put_block_group(cache);
+
+ zone = kzalloc(sizeof(*zone), GFP_NOFS);
+ if (!zone)
+ return NULL;
+
+ zone->start = start;
+ zone->end = end;
+ INIT_LIST_HEAD(&zone->list);
+ spin_lock_init(&zone->lock);
+ zone->locked = 0;
+ kref_init(&zone->refcnt);
+ zone->elems = 0;
+ zone->device = dev; /* our device always sits at index 0 */
+ for (i = 0; i < bbio->num_stripes; ++i) {
+ /* bounds have already been checked */
+ zone->devs[i] = bbio->stripes[i].dev;
+ }
+ zone->ndevs = bbio->num_stripes;
+
+ spin_lock(&fs_info->reada_lock);
+ ret = radix_tree_insert(&dev->reada_zones,
+ (unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
+ zone);
+
+ if (ret == -EEXIST) {
+ kfree(zone);
+ ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+ logical >> PAGE_CACHE_SHIFT, 1);
+ if (ret == 1)
+ kref_get(&zone->refcnt);
+ }
+ spin_unlock(&fs_info->reada_lock);
+
+ return zone;
+}
+
+static struct reada_extent *reada_find_extent(struct btrfs_root *root,
+ u64 logical,
+ struct btrfs_key *top, int level)
+{
+ int ret;
+ struct reada_extent *re = NULL;
+ struct reada_extent *re_exist = NULL;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_bio *bbio = NULL;
+ struct btrfs_device *dev;
+ struct btrfs_device *prev_dev;
+ u32 blocksize;
+ u64 length;
+ int nzones = 0;
+ int i;
+ unsigned long index = logical >> PAGE_CACHE_SHIFT;
+ int dev_replace_is_ongoing;
+
+ spin_lock(&fs_info->reada_lock);
+ re = radix_tree_lookup(&fs_info->reada_tree, index);
+ if (re)
+ re->refcnt++;
+ spin_unlock(&fs_info->reada_lock);
+
+ if (re)
+ return re;
+
+ re = kzalloc(sizeof(*re), GFP_NOFS);
+ if (!re)
+ return NULL;
+
+ blocksize = root->nodesize;
+ re->logical = logical;
+ re->top = *top;
+ INIT_LIST_HEAD(&re->extctl);
+ spin_lock_init(&re->lock);
+ re->refcnt = 1;
+
+ /*
+ * map block
+ */
+ length = blocksize;
+ ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+ &bbio, 0);
+ if (ret || !bbio || length < blocksize)
+ goto error;
+
+ if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
+ btrfs_err(root->fs_info,
+ "readahead: more than %d copies not supported",
+ BTRFS_MAX_MIRRORS);
+ goto error;
+ }
+
+ for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
+ struct reada_zone *zone;
+
+ dev = bbio->stripes[nzones].dev;
+ zone = reada_find_zone(fs_info, dev, logical, bbio);
+ if (!zone)
+ break;
+
+ re->zones[nzones] = zone;
+ spin_lock(&zone->lock);
+ if (!zone->elems)
+ kref_get(&zone->refcnt);
+ ++zone->elems;
+ spin_unlock(&zone->lock);
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+ re->nzones = nzones;
+ if (nzones == 0) {
+ /* not a single zone found, error and out */
+ goto error;
+ }
+
+ /* insert extent in reada_tree + all per-device trees, all or nothing */
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ spin_lock(&fs_info->reada_lock);
+ ret = radix_tree_insert(&fs_info->reada_tree, index, re);
+ if (ret == -EEXIST) {
+ re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
+ BUG_ON(!re_exist);
+ re_exist->refcnt++;
+ spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ goto error;
+ }
+ if (ret) {
+ spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ goto error;
+ }
+ prev_dev = NULL;
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
+ &fs_info->dev_replace);
+ for (i = 0; i < nzones; ++i) {
+ dev = bbio->stripes[i].dev;
+ if (dev == prev_dev) {
+ /*
+ * in case of DUP, just add the first zone. As both
+ * are on the same device, there's nothing to gain
+ * from adding both.
+ * Also, it wouldn't work, as the tree is per device
+ * and adding would fail with EEXIST
+ */
+ continue;
+ }
+ if (!dev->bdev) {
+ /*
+ * cannot read ahead on missing device, but for RAID5/6,
+ * REQ_GET_READ_MIRRORS return 1. So don't skip missing
+ * device for such case.
+ */
+ if (nzones > 1)
+ continue;
+ }
+ if (dev_replace_is_ongoing &&
+ dev == fs_info->dev_replace.tgtdev) {
+ /*
+ * as this device is selected for reading only as
+ * a last resort, skip it for read ahead.
+ */
+ continue;
+ }
+ prev_dev = dev;
+ ret = radix_tree_insert(&dev->reada_extents, index, re);
+ if (ret) {
+ while (--i >= 0) {
+ dev = bbio->stripes[i].dev;
+ BUG_ON(dev == NULL);
+ /* ignore whether the entry was inserted */
+ radix_tree_delete(&dev->reada_extents, index);
+ }
+ BUG_ON(fs_info == NULL);
+ radix_tree_delete(&fs_info->reada_tree, index);
+ spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ goto error;
+ }
+ }
+ spin_unlock(&fs_info->reada_lock);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+ btrfs_put_bbio(bbio);
+ return re;
+
+error:
+ while (nzones) {
+ struct reada_zone *zone;
+
+ --nzones;
+ zone = re->zones[nzones];
+ kref_get(&zone->refcnt);
+ spin_lock(&zone->lock);
+ --zone->elems;
+ if (zone->elems == 0) {
+ /*
+ * no fs_info->reada_lock needed, as this can't be
+ * the last ref
+ */
+ kref_put(&zone->refcnt, reada_zone_release);
+ }
+ spin_unlock(&zone->lock);
+
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+ btrfs_put_bbio(bbio);
+ kfree(re);
+ return re_exist;
+}
+
+static void reada_extent_put(struct btrfs_fs_info *fs_info,
+ struct reada_extent *re)
+{
+ int i;
+ unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
+
+ spin_lock(&fs_info->reada_lock);
+ if (--re->refcnt) {
+ spin_unlock(&fs_info->reada_lock);
+ return;
+ }
+
+ radix_tree_delete(&fs_info->reada_tree, index);
+ for (i = 0; i < re->nzones; ++i) {
+ struct reada_zone *zone = re->zones[i];
+
+ radix_tree_delete(&zone->device->reada_extents, index);
+ }
+
+ spin_unlock(&fs_info->reada_lock);
+
+ for (i = 0; i < re->nzones; ++i) {
+ struct reada_zone *zone = re->zones[i];
+
+ kref_get(&zone->refcnt);
+ spin_lock(&zone->lock);
+ --zone->elems;
+ if (zone->elems == 0) {
+ /* no fs_info->reada_lock needed, as this can't be
+ * the last ref */
+ kref_put(&zone->refcnt, reada_zone_release);
+ }
+ spin_unlock(&zone->lock);
+
+ spin_lock(&fs_info->reada_lock);
+ kref_put(&zone->refcnt, reada_zone_release);
+ spin_unlock(&fs_info->reada_lock);
+ }
+ if (re->scheduled_for)
+ atomic_dec(&re->scheduled_for->reada_in_flight);
+
+ kfree(re);
+}
+
+static void reada_zone_release(struct kref *kref)
+{
+ struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+
+ radix_tree_delete(&zone->device->reada_zones,
+ zone->end >> PAGE_CACHE_SHIFT);
+
+ kfree(zone);
+}
+
+static void reada_control_release(struct kref *kref)
+{
+ struct reada_control *rc = container_of(kref, struct reada_control,
+ refcnt);
+
+ kfree(rc);
+}
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+ struct btrfs_key *top, int level, u64 generation)
+{
+ struct btrfs_root *root = rc->root;
+ struct reada_extent *re;
+ struct reada_extctl *rec;
+
+ re = reada_find_extent(root, logical, top, level); /* takes one ref */
+ if (!re)
+ return -1;
+
+ rec = kzalloc(sizeof(*rec), GFP_NOFS);
+ if (!rec) {
+ reada_extent_put(root->fs_info, re);
+ return -1;
+ }
+
+ rec->rc = rc;
+ rec->generation = generation;
+ atomic_inc(&rc->elems);
+
+ spin_lock(&re->lock);
+ list_add_tail(&rec->list, &re->extctl);
+ spin_unlock(&re->lock);
+
+ /* leave the ref on the extent */
+
+ return 0;
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
+{
+ int i;
+ unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < zone->ndevs; ++i) {
+ struct reada_zone *peer;
+ peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
+ if (peer && peer->device != zone->device)
+ peer->locked = lock;
+ }
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static int reada_pick_zone(struct btrfs_device *dev)
+{
+ struct reada_zone *top_zone = NULL;
+ struct reada_zone *top_locked_zone = NULL;
+ u64 top_elems = 0;
+ u64 top_locked_elems = 0;
+ unsigned long index = 0;
+ int ret;
+
+ if (dev->reada_curr_zone) {
+ reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
+ kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
+ dev->reada_curr_zone = NULL;
+ }
+ /* pick the zone with the most elements */
+ while (1) {
+ struct reada_zone *zone;
+
+ ret = radix_tree_gang_lookup(&dev->reada_zones,
+ (void **)&zone, index, 1);
+ if (ret == 0)
+ break;
+ index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ if (zone->locked) {
+ if (zone->elems > top_locked_elems) {
+ top_locked_elems = zone->elems;
+ top_locked_zone = zone;
+ }
+ } else {
+ if (zone->elems > top_elems) {
+ top_elems = zone->elems;
+ top_zone = zone;
+ }
+ }
+ }
+ if (top_zone)
+ dev->reada_curr_zone = top_zone;
+ else if (top_locked_zone)
+ dev->reada_curr_zone = top_locked_zone;
+ else
+ return 0;
+
+ dev->reada_next = dev->reada_curr_zone->start;
+ kref_get(&dev->reada_curr_zone->refcnt);
+ reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
+
+ return 1;
+}
+
+static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev)
+{
+ struct reada_extent *re = NULL;
+ int mirror_num = 0;
+ struct extent_buffer *eb = NULL;
+ u64 logical;
+ int ret;
+ int i;
+ int need_kick = 0;
+
+ spin_lock(&fs_info->reada_lock);
+ if (dev->reada_curr_zone == NULL) {
+ ret = reada_pick_zone(dev);
+ if (!ret) {
+ spin_unlock(&fs_info->reada_lock);
+ return 0;
+ }
+ }
+ /*
+ * FIXME currently we issue the reads one extent at a time. If we have
+ * a contiguous block of extents, we could also coagulate them or use
+ * plugging to speed things up
+ */
+ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+ dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+ if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
+ ret = reada_pick_zone(dev);
+ if (!ret) {
+ spin_unlock(&fs_info->reada_lock);
+ return 0;
+ }
+ re = NULL;
+ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+ dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+ }
+ if (ret == 0) {
+ spin_unlock(&fs_info->reada_lock);
+ return 0;
+ }
+ dev->reada_next = re->logical + fs_info->tree_root->nodesize;
+ re->refcnt++;
+
+ spin_unlock(&fs_info->reada_lock);
+
+ /*
+ * find mirror num
+ */
+ for (i = 0; i < re->nzones; ++i) {
+ if (re->zones[i]->device == dev) {
+ mirror_num = i + 1;
+ break;
+ }
+ }
+ logical = re->logical;
+
+ spin_lock(&re->lock);
+ if (re->scheduled_for == NULL) {
+ re->scheduled_for = dev;
+ need_kick = 1;
+ }
+ spin_unlock(&re->lock);
+
+ reada_extent_put(fs_info, re);
+
+ if (!need_kick)
+ return 0;
+
+ atomic_inc(&dev->reada_in_flight);
+ ret = reada_tree_block_flagged(fs_info->extent_root, logical,
+ mirror_num, &eb);
+ if (ret)
+ __readahead_hook(fs_info->extent_root, NULL, logical, ret);
+ else if (eb)
+ __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
+
+ if (eb)
+ free_extent_buffer(eb);
+
+ return 1;
+
+}
+
+static void reada_start_machine_worker(struct btrfs_work *work)
+{
+ struct reada_machine_work *rmw;
+ struct btrfs_fs_info *fs_info;
+ int old_ioprio;
+
+ rmw = container_of(work, struct reada_machine_work, work);
+ fs_info = rmw->fs_info;
+
+ kfree(rmw);
+
+ old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+ task_nice_ioprio(current));
+ set_task_ioprio(current, BTRFS_IOPRIO_READA);
+ __reada_start_machine(fs_info);
+ set_task_ioprio(current, old_ioprio);
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u64 enqueued;
+ u64 total = 0;
+ int i;
+
+ do {
+ enqueued = 0;
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (atomic_read(&device->reada_in_flight) <
+ MAX_IN_FLIGHT)
+ enqueued += reada_start_machine_dev(fs_info,
+ device);
+ }
+ total += enqueued;
+ } while (enqueued && total < 10000);
+
+ if (enqueued == 0)
+ return;
+
+ /*
+ * If everything is already in the cache, this is effectively single
+ * threaded. To a) not hold the caller for too long and b) to utilize
+ * more cores, we broke the loop above after 10000 iterations and now
+ * enqueue to workers to finish it. This will distribute the load to
+ * the cores.
+ */
+ for (i = 0; i < 2; ++i)
+ reada_start_machine(fs_info);
+}
+
+static void reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct reada_machine_work *rmw;
+
+ rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
+ if (!rmw) {
+ /* FIXME we cannot handle this properly right now */
+ BUG();
+ }
+ btrfs_init_work(&rmw->work, btrfs_readahead_helper,
+ reada_start_machine_worker, NULL, NULL);
+ rmw->fs_info = fs_info;
+
+ btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
+}
+
+#ifdef DEBUG
+static void dump_devs(struct btrfs_fs_info *fs_info, int all)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ unsigned long index;
+ int ret;
+ int i;
+ int j;
+ int cnt;
+
+ spin_lock(&fs_info->reada_lock);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
+ atomic_read(&device->reada_in_flight));
+ index = 0;
+ while (1) {
+ struct reada_zone *zone;
+ ret = radix_tree_gang_lookup(&device->reada_zones,
+ (void **)&zone, index, 1);
+ if (ret == 0)
+ break;
+ printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
+ "%d devs", zone->start, zone->end, zone->elems,
+ zone->locked);
+ for (j = 0; j < zone->ndevs; ++j) {
+ printk(KERN_CONT " %lld",
+ zone->devs[j]->devid);
+ }
+ if (device->reada_curr_zone == zone)
+ printk(KERN_CONT " curr off %llu",
+ device->reada_next - zone->start);
+ printk(KERN_CONT "\n");
+ index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ }
+ cnt = 0;
+ index = 0;
+ while (all) {
+ struct reada_extent *re = NULL;
+
+ ret = radix_tree_gang_lookup(&device->reada_extents,
+ (void **)&re, index, 1);
+ if (ret == 0)
+ break;
+ printk(KERN_DEBUG
+ " re: logical %llu size %u empty %d for %lld",
+ re->logical, fs_info->tree_root->nodesize,
+ list_empty(&re->extctl), re->scheduled_for ?
+ re->scheduled_for->devid : -1);
+
+ for (i = 0; i < re->nzones; ++i) {
+ printk(KERN_CONT " zone %llu-%llu devs",
+ re->zones[i]->start,
+ re->zones[i]->end);
+ for (j = 0; j < re->zones[i]->ndevs; ++j) {
+ printk(KERN_CONT " %lld",
+ re->zones[i]->devs[j]->devid);
+ }
+ }
+ printk(KERN_CONT "\n");
+ index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ if (++cnt > 15)
+ break;
+ }
+ }
+
+ index = 0;
+ cnt = 0;
+ while (all) {
+ struct reada_extent *re = NULL;
+
+ ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
+ index, 1);
+ if (ret == 0)
+ break;
+ if (!re->scheduled_for) {
+ index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ continue;
+ }
+ printk(KERN_DEBUG
+ "re: logical %llu size %u list empty %d for %lld",
+ re->logical, fs_info->tree_root->nodesize,
+ list_empty(&re->extctl),
+ re->scheduled_for ? re->scheduled_for->devid : -1);
+ for (i = 0; i < re->nzones; ++i) {
+ printk(KERN_CONT " zone %llu-%llu devs",
+ re->zones[i]->start,
+ re->zones[i]->end);
+ for (i = 0; i < re->nzones; ++i) {
+ printk(KERN_CONT " zone %llu-%llu devs",
+ re->zones[i]->start,
+ re->zones[i]->end);
+ for (j = 0; j < re->zones[i]->ndevs; ++j) {
+ printk(KERN_CONT " %lld",
+ re->zones[i]->devs[j]->devid);
+ }
+ }
+ }
+ printk(KERN_CONT "\n");
+ index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ }
+ spin_unlock(&fs_info->reada_lock);
+}
+#endif
+
+/*
+ * interface
+ */
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+ struct btrfs_key *key_start, struct btrfs_key *key_end)
+{
+ struct reada_control *rc;
+ u64 start;
+ u64 generation;
+ int level;
+ struct extent_buffer *node;
+ static struct btrfs_key max_key = {
+ .objectid = (u64)-1,
+ .type = (u8)-1,
+ .offset = (u64)-1
+ };
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc)
+ return ERR_PTR(-ENOMEM);
+
+ rc->root = root;
+ rc->key_start = *key_start;
+ rc->key_end = *key_end;
+ atomic_set(&rc->elems, 0);
+ init_waitqueue_head(&rc->wait);
+ kref_init(&rc->refcnt);
+ kref_get(&rc->refcnt); /* one ref for having elements */
+
+ node = btrfs_root_node(root);
+ start = node->start;
+ level = btrfs_header_level(node);
+ generation = btrfs_header_generation(node);
+ free_extent_buffer(node);
+
+ if (reada_add_block(rc, start, &max_key, level, generation)) {
+ kfree(rc);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ reada_start_machine(root->fs_info);
+
+ return rc;
+}
+
+#ifdef DEBUG
+int btrfs_reada_wait(void *handle)
+{
+ struct reada_control *rc = handle;
+
+ while (atomic_read(&rc->elems)) {
+ wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
+ 5 * HZ);
+ dump_devs(rc->root->fs_info,
+ atomic_read(&rc->elems) < 10 ? 1 : 0);
+ }
+
+ dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
+
+ kref_put(&rc->refcnt, reada_control_release);
+
+ return 0;
+}
+#else
+int btrfs_reada_wait(void *handle)
+{
+ struct reada_control *rc = handle;
+
+ while (atomic_read(&rc->elems)) {
+ wait_event(rc->wait, atomic_read(&rc->elems) == 0);
+ }
+
+ kref_put(&rc->refcnt, reada_control_release);
+
+ return 0;
+}
+#endif
+
+void btrfs_reada_detach(void *handle)
+{
+ struct reada_control *rc = handle;
+
+ kref_put(&rc->refcnt, reada_control_release);
+}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 000000000..74b24b01d
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,4658 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "btrfs_inode.h"
+#include "async-thread.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
+
+/*
+ * backref_node, mapping_node and tree_block start with this
+ */
+struct tree_entry {
+ struct rb_node rb_node;
+ u64 bytenr;
+};
+
+/*
+ * present a tree block in the backref cache
+ */
+struct backref_node {
+ struct rb_node rb_node;
+ u64 bytenr;
+
+ u64 new_bytenr;
+ /* objectid of tree block owner, can be not uptodate */
+ u64 owner;
+ /* link to pending, changed or detached list */
+ struct list_head list;
+ /* list of upper level blocks reference this block */
+ struct list_head upper;
+ /* list of child blocks in the cache */
+ struct list_head lower;
+ /* NULL if this node is not tree root */
+ struct btrfs_root *root;
+ /* extent buffer got by COW the block */
+ struct extent_buffer *eb;
+ /* level of tree block */
+ unsigned int level:8;
+ /* is the block in non-reference counted tree */
+ unsigned int cowonly:1;
+ /* 1 if no child node in the cache */
+ unsigned int lowest:1;
+ /* is the extent buffer locked */
+ unsigned int locked:1;
+ /* has the block been processed */
+ unsigned int processed:1;
+ /* have backrefs of this block been checked */
+ unsigned int checked:1;
+ /*
+ * 1 if corresponding block has been cowed but some upper
+ * level block pointers may not point to the new location
+ */
+ unsigned int pending:1;
+ /*
+ * 1 if the backref node isn't connected to any other
+ * backref node.
+ */
+ unsigned int detached:1;
+};
+
+/*
+ * present a block pointer in the backref cache
+ */
+struct backref_edge {
+ struct list_head list[2];
+ struct backref_node *node[2];
+};
+
+#define LOWER 0
+#define UPPER 1
+#define RELOCATION_RESERVED_NODES 256
+
+struct backref_cache {
+ /* red black tree of all backref nodes in the cache */
+ struct rb_root rb_root;
+ /* for passing backref nodes to btrfs_reloc_cow_block */
+ struct backref_node *path[BTRFS_MAX_LEVEL];
+ /*
+ * list of blocks that have been cowed but some block
+ * pointers in upper level blocks may not reflect the
+ * new location
+ */
+ struct list_head pending[BTRFS_MAX_LEVEL];
+ /* list of backref nodes with no child node */
+ struct list_head leaves;
+ /* list of blocks that have been cowed in current transaction */
+ struct list_head changed;
+ /* list of detached backref node. */
+ struct list_head detached;
+
+ u64 last_trans;
+
+ int nr_nodes;
+ int nr_edges;
+};
+
+/*
+ * map address of tree root to tree
+ */
+struct mapping_node {
+ struct rb_node rb_node;
+ u64 bytenr;
+ void *data;
+};
+
+struct mapping_tree {
+ struct rb_root rb_root;
+ spinlock_t lock;
+};
+
+/*
+ * present a tree block to process
+ */
+struct tree_block {
+ struct rb_node rb_node;
+ u64 bytenr;
+ struct btrfs_key key;
+ unsigned int level:8;
+ unsigned int key_ready:1;
+};
+
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+ u64 start;
+ u64 end;
+ u64 boundary[MAX_EXTENTS];
+ unsigned int nr;
+};
+
+struct reloc_control {
+ /* block group to relocate */
+ struct btrfs_block_group_cache *block_group;
+ /* extent tree */
+ struct btrfs_root *extent_root;
+ /* inode for moving data */
+ struct inode *data_inode;
+
+ struct btrfs_block_rsv *block_rsv;
+
+ struct backref_cache backref_cache;
+
+ struct file_extent_cluster cluster;
+ /* tree blocks have been processed */
+ struct extent_io_tree processed_blocks;
+ /* map start of tree root to corresponding reloc tree */
+ struct mapping_tree reloc_root_tree;
+ /* list of reloc trees */
+ struct list_head reloc_roots;
+ /* size of metadata reservation for merging reloc trees */
+ u64 merging_rsv_size;
+ /* size of relocated tree nodes */
+ u64 nodes_relocated;
+ /* reserved size for block group relocation*/
+ u64 reserved_bytes;
+
+ u64 search_start;
+ u64 extents_found;
+
+ unsigned int stage:8;
+ unsigned int create_reloc_tree:1;
+ unsigned int merge_reloc_tree:1;
+ unsigned int found_file_extent:1;
+};
+
+/* stages of data relocation */
+#define MOVE_DATA_EXTENTS 0
+#define UPDATE_DATA_PTRS 1
+
+static void remove_backref_node(struct backref_cache *cache,
+ struct backref_node *node);
+static void __mark_block_processed(struct reloc_control *rc,
+ struct backref_node *node);
+
+static void mapping_tree_init(struct mapping_tree *tree)
+{
+ tree->rb_root = RB_ROOT;
+ spin_lock_init(&tree->lock);
+}
+
+static void backref_cache_init(struct backref_cache *cache)
+{
+ int i;
+ cache->rb_root = RB_ROOT;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&cache->pending[i]);
+ INIT_LIST_HEAD(&cache->changed);
+ INIT_LIST_HEAD(&cache->detached);
+ INIT_LIST_HEAD(&cache->leaves);
+}
+
+static void backref_cache_cleanup(struct backref_cache *cache)
+{
+ struct backref_node *node;
+ int i;
+
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct backref_node, list);
+ remove_backref_node(cache, node);
+ }
+
+ while (!list_empty(&cache->leaves)) {
+ node = list_entry(cache->leaves.next,
+ struct backref_node, lower);
+ remove_backref_node(cache, node);
+ }
+
+ cache->last_trans = 0;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ BUG_ON(!list_empty(&cache->pending[i]));
+ BUG_ON(!list_empty(&cache->changed));
+ BUG_ON(!list_empty(&cache->detached));
+ BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+ BUG_ON(cache->nr_nodes);
+ BUG_ON(cache->nr_edges);
+}
+
+static struct backref_node *alloc_backref_node(struct backref_cache *cache)
+{
+ struct backref_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_NOFS);
+ if (node) {
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->upper);
+ INIT_LIST_HEAD(&node->lower);
+ RB_CLEAR_NODE(&node->rb_node);
+ cache->nr_nodes++;
+ }
+ return node;
+}
+
+static void free_backref_node(struct backref_cache *cache,
+ struct backref_node *node)
+{
+ if (node) {
+ cache->nr_nodes--;
+ kfree(node);
+ }
+}
+
+static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
+{
+ struct backref_edge *edge;
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (edge)
+ cache->nr_edges++;
+ return edge;
+}
+
+static void free_backref_edge(struct backref_cache *cache,
+ struct backref_edge *edge)
+{
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct tree_entry *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *n = root->rb_node;
+ struct tree_entry *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct tree_entry, rb_node);
+
+ if (bytenr < entry->bytenr)
+ n = n->rb_left;
+ else if (bytenr > entry->bytenr)
+ n = n->rb_right;
+ else
+ return n;
+ }
+ return NULL;
+}
+
+static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
+{
+
+ struct btrfs_fs_info *fs_info = NULL;
+ struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ if (bnode->root)
+ fs_info = bnode->root->fs_info;
+ btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
+ "found at offset %llu", bytenr);
+}
+
+/*
+ * walk up backref nodes until reach node presents tree root
+ */
+static struct backref_node *walk_up_backref(struct backref_node *node,
+ struct backref_edge *edges[],
+ int *index)
+{
+ struct backref_edge *edge;
+ int idx = *index;
+
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next,
+ struct backref_edge, list[LOWER]);
+ edges[idx++] = edge;
+ node = edge->node[UPPER];
+ }
+ BUG_ON(node->detached);
+ *index = idx;
+ return node;
+}
+
+/*
+ * walk down backref nodes to find start of next reference path
+ */
+static struct backref_node *walk_down_backref(struct backref_edge *edges[],
+ int *index)
+{
+ struct backref_edge *edge;
+ struct backref_node *lower;
+ int idx = *index;
+
+ while (idx > 0) {
+ edge = edges[idx - 1];
+ lower = edge->node[LOWER];
+ if (list_is_last(&edge->list[LOWER], &lower->upper)) {
+ idx--;
+ continue;
+ }
+ edge = list_entry(edge->list[LOWER].next,
+ struct backref_edge, list[LOWER]);
+ edges[idx - 1] = edge;
+ *index = idx;
+ return edge->node[UPPER];
+ }
+ *index = 0;
+ return NULL;
+}
+
+static void unlock_node_buffer(struct backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
+static void drop_node_buffer(struct backref_node *node)
+{
+ if (node->eb) {
+ unlock_node_buffer(node);
+ free_extent_buffer(node->eb);
+ node->eb = NULL;
+ }
+}
+
+static void drop_backref_node(struct backref_cache *tree,
+ struct backref_node *node)
+{
+ BUG_ON(!list_empty(&node->upper));
+
+ drop_node_buffer(node);
+ list_del(&node->list);
+ list_del(&node->lower);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ free_backref_node(tree, node);
+}
+
+/*
+ * remove a backref node from the backref cache
+ */
+static void remove_backref_node(struct backref_cache *cache,
+ struct backref_node *node)
+{
+ struct backref_node *upper;
+ struct backref_edge *edge;
+
+ if (!node)
+ return;
+
+ BUG_ON(!node->lowest && !node->detached);
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next, struct backref_edge,
+ list[LOWER]);
+ upper = edge->node[UPPER];
+ list_del(&edge->list[LOWER]);
+ list_del(&edge->list[UPPER]);
+ free_backref_edge(cache, edge);
+
+ if (RB_EMPTY_NODE(&upper->rb_node)) {
+ BUG_ON(!list_empty(&node->upper));
+ drop_backref_node(cache, node);
+ node = upper;
+ node->lowest = 1;
+ continue;
+ }
+ /*
+ * add the node to leaf node list if no other
+ * child block cached.
+ */
+ if (list_empty(&upper->lower)) {
+ list_add_tail(&upper->lower, &cache->leaves);
+ upper->lowest = 1;
+ }
+ }
+
+ drop_backref_node(cache, node);
+}
+
+static void update_backref_node(struct backref_cache *cache,
+ struct backref_node *node, u64 bytenr)
+{
+ struct rb_node *rb_node;
+ rb_erase(&node->rb_node, &cache->rb_root);
+ node->bytenr = bytenr;
+ rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, bytenr);
+}
+
+/*
+ * update backref cache after a transaction commit
+ */
+static int update_backref_cache(struct btrfs_trans_handle *trans,
+ struct backref_cache *cache)
+{
+ struct backref_node *node;
+ int level = 0;
+
+ if (cache->last_trans == 0) {
+ cache->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (cache->last_trans == trans->transid)
+ return 0;
+
+ /*
+ * detached nodes are used to avoid unnecessary backref
+ * lookup. transaction commit changes the extent tree.
+ * so the detached nodes are no longer useful.
+ */
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct backref_node, list);
+ remove_backref_node(cache, node);
+ }
+
+ while (!list_empty(&cache->changed)) {
+ node = list_entry(cache->changed.next,
+ struct backref_node, list);
+ list_del_init(&node->list);
+ BUG_ON(node->pending);
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+
+ /*
+ * some nodes can be left in the pending list if there were
+ * errors during processing the pending nodes.
+ */
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ list_for_each_entry(node, &cache->pending[level], list) {
+ BUG_ON(!node->pending);
+ if (node->bytenr == node->new_bytenr)
+ continue;
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+ }
+
+ cache->last_trans = 0;
+ return 1;
+}
+
+
+static int should_ignore_root(struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ return 0;
+
+ reloc_root = root->reloc_root;
+ if (!reloc_root)
+ return 0;
+
+ if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
+ root->fs_info->running_transaction->transid - 1)
+ return 0;
+ /*
+ * if there is reloc tree and it was created in previous
+ * transaction backref lookup can find the reloc tree,
+ * so backref node for the fs tree root is useless for
+ * relocation.
+ */
+ return 1;
+}
+/*
+ * find reloc tree by address of tree root
+ */
+static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
+ u64 bytenr)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node;
+ struct btrfs_root *root = NULL;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ root = (struct btrfs_root *)node->data;
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+ return root;
+}
+
+static int is_cowonly_root(u64 root_objectid)
+{
+ if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+ root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+ root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+ root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+ root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
+ root_objectid == BTRFS_UUID_TREE_OBJECTID ||
+ root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_objectid)
+{
+ struct btrfs_key key;
+
+ key.objectid = root_objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ if (is_cowonly_root(root_objectid))
+ key.offset = 0;
+ else
+ key.offset = (u64)-1;
+
+ return btrfs_get_fs_root(fs_info, &key, false);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static noinline_for_stack
+struct btrfs_root *find_tree_root(struct reloc_control *rc,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_ref_v0 *ref0)
+{
+ struct btrfs_root *root;
+ u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
+ u64 generation = btrfs_ref_generation_v0(leaf, ref0);
+
+ BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
+
+ root = read_fs_root(rc->extent_root->fs_info, root_objectid);
+ BUG_ON(IS_ERR(root));
+
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ generation != btrfs_root_generation(&root->root_item))
+ return NULL;
+
+ return root;
+}
+#endif
+
+static noinline_for_stack
+int find_inline_backref(struct extent_buffer *leaf, int slot,
+ unsigned long *ptr, unsigned long *end)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_item *ei;
+ struct btrfs_tree_block_info *bi;
+ u32 item_size;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (item_size < sizeof(*ei)) {
+ WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+ return 1;
+ }
+#endif
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ WARN_ON(!(btrfs_extent_flags(leaf, ei) &
+ BTRFS_EXTENT_FLAG_TREE_BLOCK));
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ item_size <= sizeof(*ei) + sizeof(*bi)) {
+ WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ return 1;
+ }
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ item_size <= sizeof(*ei)) {
+ WARN_ON(item_size < sizeof(*ei));
+ return 1;
+ }
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ *ptr = (unsigned long)(bi + 1);
+ } else {
+ *ptr = (unsigned long)(ei + 1);
+ }
+ *end = (unsigned long)ei + item_size;
+ return 0;
+}
+
+/*
+ * build backref tree for a given tree block. root of the backref tree
+ * corresponds the tree block, leaves of the backref tree correspond
+ * roots of b-trees that reference the tree block.
+ *
+ * the basic idea of this function is check backrefs of a given block
+ * to find upper level blocks that refernece the block, and then check
+ * bakcrefs of these upper level blocks recursively. the recursion stop
+ * when tree root is reached or backrefs for the block is cached.
+ *
+ * NOTE: if we find backrefs for a block are cached, we know backrefs
+ * for all upper level blocks that directly/indirectly reference the
+ * block are also cached.
+ */
+static noinline_for_stack
+struct backref_node *build_backref_tree(struct reloc_control *rc,
+ struct btrfs_key *node_key,
+ int level, u64 bytenr)
+{
+ struct backref_cache *cache = &rc->backref_cache;
+ struct btrfs_path *path1;
+ struct btrfs_path *path2;
+ struct extent_buffer *eb;
+ struct btrfs_root *root;
+ struct backref_node *cur;
+ struct backref_node *upper;
+ struct backref_node *lower;
+ struct backref_node *node = NULL;
+ struct backref_node *exist = NULL;
+ struct backref_edge *edge;
+ struct rb_node *rb_node;
+ struct btrfs_key key;
+ unsigned long end;
+ unsigned long ptr;
+ LIST_HEAD(list);
+ LIST_HEAD(useless);
+ int cowonly;
+ int ret;
+ int err = 0;
+ bool need_check = true;
+
+ path1 = btrfs_alloc_path();
+ path2 = btrfs_alloc_path();
+ if (!path1 || !path2) {
+ err = -ENOMEM;
+ goto out;
+ }
+ path1->reada = 1;
+ path2->reada = 2;
+
+ node = alloc_backref_node(cache);
+ if (!node) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ node->bytenr = bytenr;
+ node->level = level;
+ node->lowest = 1;
+ cur = node;
+again:
+ end = 0;
+ ptr = 0;
+ key.objectid = cur->bytenr;
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ path1->search_commit_root = 1;
+ path1->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
+ 0, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ ASSERT(ret);
+ ASSERT(path1->slots[0]);
+
+ path1->slots[0]--;
+
+ WARN_ON(cur->checked);
+ if (!list_empty(&cur->upper)) {
+ /*
+ * the backref was added previously when processing
+ * backref of type BTRFS_TREE_BLOCK_REF_KEY
+ */
+ ASSERT(list_is_singular(&cur->upper));
+ edge = list_entry(cur->upper.next, struct backref_edge,
+ list[LOWER]);
+ ASSERT(list_empty(&edge->list[UPPER]));
+ exist = edge->node[UPPER];
+ /*
+ * add the upper level block to pending list if we need
+ * check its backrefs
+ */
+ if (!exist->checked)
+ list_add_tail(&edge->list[UPPER], &list);
+ } else {
+ exist = NULL;
+ }
+
+ while (1) {
+ cond_resched();
+ eb = path1->nodes[0];
+
+ if (ptr >= end) {
+ if (path1->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(rc->extent_root, path1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0)
+ break;
+ eb = path1->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
+ if (key.objectid != cur->bytenr) {
+ WARN_ON(exist);
+ break;
+ }
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
+ ret = find_inline_backref(eb, path1->slots[0],
+ &ptr, &end);
+ if (ret)
+ goto next;
+ }
+ }
+
+ if (ptr < end) {
+ /* update key for inline back ref */
+ struct btrfs_extent_inline_ref *iref;
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ key.type = btrfs_extent_inline_ref_type(eb, iref);
+ key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+ WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
+ key.type != BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+
+ if (exist &&
+ ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
+ exist->owner == key.offset) ||
+ (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
+ exist->bytenr == key.offset))) {
+ exist = NULL;
+ goto next;
+ }
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
+ key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ struct btrfs_extent_ref_v0 *ref0;
+ ref0 = btrfs_item_ptr(eb, path1->slots[0],
+ struct btrfs_extent_ref_v0);
+ if (key.objectid == key.offset) {
+ root = find_tree_root(rc, eb, ref0);
+ if (root && !should_ignore_root(root))
+ cur->root = root;
+ else
+ list_add(&cur->list, &useless);
+ break;
+ }
+ if (is_cowonly_root(btrfs_ref_root_v0(eb,
+ ref0)))
+ cur->cowonly = 1;
+ }
+#else
+ ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
+#endif
+ if (key.objectid == key.offset) {
+ /*
+ * only root blocks of reloc trees use
+ * backref of this type.
+ */
+ root = find_reloc_root(rc, cur->bytenr);
+ ASSERT(root);
+ cur->root = root;
+ break;
+ }
+
+ edge = alloc_backref_edge(cache);
+ if (!edge) {
+ err = -ENOMEM;
+ goto out;
+ }
+ rb_node = tree_search(&cache->rb_root, key.offset);
+ if (!rb_node) {
+ upper = alloc_backref_node(cache);
+ if (!upper) {
+ free_backref_edge(cache, edge);
+ err = -ENOMEM;
+ goto out;
+ }
+ upper->bytenr = key.offset;
+ upper->level = cur->level + 1;
+ /*
+ * backrefs for the upper level block isn't
+ * cached, add the block to pending list
+ */
+ list_add_tail(&edge->list[UPPER], &list);
+ } else {
+ upper = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ ASSERT(upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ list_add_tail(&edge->list[LOWER], &cur->upper);
+ edge->node[LOWER] = cur;
+ edge->node[UPPER] = upper;
+
+ goto next;
+ } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
+ goto next;
+ }
+
+ /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+ root = read_fs_root(rc->extent_root->fs_info, key.offset);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
+
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ cur->cowonly = 1;
+
+ if (btrfs_root_level(&root->root_item) == cur->level) {
+ /* tree root */
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
+ cur->bytenr);
+ if (should_ignore_root(root))
+ list_add(&cur->list, &useless);
+ else
+ cur->root = root;
+ break;
+ }
+
+ level = cur->level + 1;
+
+ /*
+ * searching the tree to find upper level blocks
+ * reference the block.
+ */
+ path2->search_commit_root = 1;
+ path2->skip_locking = 1;
+ path2->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
+ path2->lowest_level = 0;
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0 && path2->slots[level] > 0)
+ path2->slots[level]--;
+
+ eb = path2->nodes[level];
+ WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
+ cur->bytenr);
+
+ lower = cur;
+ need_check = true;
+ for (; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path2->nodes[level]) {
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
+ lower->bytenr);
+ if (should_ignore_root(root))
+ list_add(&lower->list, &useless);
+ else
+ lower->root = root;
+ break;
+ }
+
+ edge = alloc_backref_edge(cache);
+ if (!edge) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ eb = path2->nodes[level];
+ rb_node = tree_search(&cache->rb_root, eb->start);
+ if (!rb_node) {
+ upper = alloc_backref_node(cache);
+ if (!upper) {
+ free_backref_edge(cache, edge);
+ err = -ENOMEM;
+ goto out;
+ }
+ upper->bytenr = eb->start;
+ upper->owner = btrfs_header_owner(eb);
+ upper->level = lower->level + 1;
+ if (!test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
+ upper->cowonly = 1;
+
+ /*
+ * if we know the block isn't shared
+ * we can void checking its backrefs.
+ */
+ if (btrfs_block_can_be_shared(root, eb))
+ upper->checked = 0;
+ else
+ upper->checked = 1;
+
+ /*
+ * add the block to pending list if we
+ * need check its backrefs, we only do this once
+ * while walking up a tree as we will catch
+ * anything else later on.
+ */
+ if (!upper->checked && need_check) {
+ need_check = false;
+ list_add_tail(&edge->list[UPPER],
+ &list);
+ } else {
+ if (upper->checked)
+ need_check = true;
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ } else {
+ upper = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ ASSERT(upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ if (!upper->owner)
+ upper->owner = btrfs_header_owner(eb);
+ }
+ list_add_tail(&edge->list[LOWER], &lower->upper);
+ edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
+
+ if (rb_node)
+ break;
+ lower = upper;
+ upper = NULL;
+ }
+ btrfs_release_path(path2);
+next:
+ if (ptr < end) {
+ ptr += btrfs_extent_inline_ref_size(key.type);
+ if (ptr >= end) {
+ WARN_ON(ptr > end);
+ ptr = 0;
+ end = 0;
+ }
+ }
+ if (ptr >= end)
+ path1->slots[0]++;
+ }
+ btrfs_release_path(path1);
+
+ cur->checked = 1;
+ WARN_ON(exist);
+
+ /* the pending list isn't empty, take the first block to process */
+ if (!list_empty(&list)) {
+ edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+ list_del_init(&edge->list[UPPER]);
+ cur = edge->node[UPPER];
+ goto again;
+ }
+
+ /*
+ * everything goes well, connect backref nodes and insert backref nodes
+ * into the cache.
+ */
+ ASSERT(node->checked);
+ cowonly = node->cowonly;
+ if (!cowonly) {
+ rb_node = tree_insert(&cache->rb_root, node->bytenr,
+ &node->rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, node->bytenr);
+ list_add_tail(&node->lower, &cache->leaves);
+ }
+
+ list_for_each_entry(edge, &node->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+
+ while (!list_empty(&list)) {
+ edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+ list_del_init(&edge->list[UPPER]);
+ upper = edge->node[UPPER];
+ if (upper->detached) {
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ free_backref_edge(cache, edge);
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, &useless);
+ continue;
+ }
+
+ if (!RB_EMPTY_NODE(&upper->rb_node)) {
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+ continue;
+ }
+
+ if (!upper->checked) {
+ /*
+ * Still want to blow up for developers since this is a
+ * logic bug.
+ */
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+ if (cowonly != upper->cowonly) {
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!cowonly) {
+ rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST,
+ upper->bytenr);
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+ }
+ /*
+ * process useless backref nodes. backref nodes for tree leaves
+ * are deleted from the cache. backref nodes for upper level
+ * tree blocks are left in the cache to avoid unnecessary backref
+ * lookup.
+ */
+ while (!list_empty(&useless)) {
+ upper = list_entry(useless.next, struct backref_node, list);
+ list_del_init(&upper->list);
+ ASSERT(list_empty(&upper->upper));
+ if (upper == node)
+ node = NULL;
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+ while (!list_empty(&upper->lower)) {
+ edge = list_entry(upper->lower.next,
+ struct backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ free_backref_edge(cache, edge);
+
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, &useless);
+ }
+ __mark_block_processed(rc, upper);
+ if (upper->level > 0) {
+ list_add(&upper->list, &cache->detached);
+ upper->detached = 1;
+ } else {
+ rb_erase(&upper->rb_node, &cache->rb_root);
+ free_backref_node(cache, upper);
+ }
+ }
+out:
+ btrfs_free_path(path1);
+ btrfs_free_path(path2);
+ if (err) {
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, list);
+ list_del_init(&lower->list);
+ }
+ while (!list_empty(&list)) {
+ edge = list_first_entry(&list, struct backref_edge,
+ list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ upper = edge->node[UPPER];
+ free_backref_edge(cache, edge);
+
+ /*
+ * Lower is no longer linked to any upper backref nodes
+ * and isn't in the cache, we can free it ourselves.
+ */
+ if (list_empty(&lower->upper) &&
+ RB_EMPTY_NODE(&lower->rb_node))
+ list_add(&lower->list, &useless);
+
+ if (!RB_EMPTY_NODE(&upper->rb_node))
+ continue;
+
+ /* Add this guy's upper edges to the list to proces */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+ if (list_empty(&upper->upper))
+ list_add(&upper->list, &useless);
+ }
+
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, list);
+ list_del_init(&lower->list);
+ free_backref_node(cache, lower);
+ }
+ return ERR_PTR(err);
+ }
+ ASSERT(!node || !node->detached);
+ return node;
+}
+
+/*
+ * helper to add backref node for the newly created snapshot.
+ * the backref node is created by cloning backref node that
+ * corresponds to root of source tree
+ */
+static int clone_backref_node(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *src,
+ struct btrfs_root *dest)
+{
+ struct btrfs_root *reloc_root = src->reloc_root;
+ struct backref_cache *cache = &rc->backref_cache;
+ struct backref_node *node = NULL;
+ struct backref_node *new_node;
+ struct backref_edge *edge;
+ struct backref_edge *new_edge;
+ struct rb_node *rb_node;
+
+ if (cache->last_trans > 0)
+ update_backref_cache(trans, cache);
+
+ rb_node = tree_search(&cache->rb_root, src->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct backref_node, rb_node);
+ if (node->detached)
+ node = NULL;
+ else
+ BUG_ON(node->new_bytenr != reloc_root->node->start);
+ }
+
+ if (!node) {
+ rb_node = tree_search(&cache->rb_root,
+ reloc_root->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ BUG_ON(node->detached);
+ }
+ }
+
+ if (!node)
+ return 0;
+
+ new_node = alloc_backref_node(cache);
+ if (!new_node)
+ return -ENOMEM;
+
+ new_node->bytenr = dest->node->start;
+ new_node->level = node->level;
+ new_node->lowest = node->lowest;
+ new_node->checked = 1;
+ new_node->root = dest;
+
+ if (!node->lowest) {
+ list_for_each_entry(edge, &node->lower, list[UPPER]) {
+ new_edge = alloc_backref_edge(cache);
+ if (!new_edge)
+ goto fail;
+
+ new_edge->node[UPPER] = new_node;
+ new_edge->node[LOWER] = edge->node[LOWER];
+ list_add_tail(&new_edge->list[UPPER],
+ &new_node->lower);
+ }
+ } else {
+ list_add_tail(&new_node->lower, &cache->leaves);
+ }
+
+ rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
+ &new_node->rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
+
+ if (!new_node->lowest) {
+ list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
+ list_add_tail(&new_edge->list[LOWER],
+ &new_edge->node[LOWER]->upper);
+ }
+ }
+ return 0;
+fail:
+ while (!list_empty(&new_node->lower)) {
+ new_edge = list_entry(new_node->lower.next,
+ struct backref_edge, list[UPPER]);
+ list_del(&new_edge->list[UPPER]);
+ free_backref_edge(cache, new_edge);
+ }
+ free_backref_node(cache, new_node);
+ return -ENOMEM;
+}
+
+/*
+ * helper to add 'address of tree root -> reloc tree' mapping
+ */
+static int __must_check __add_reloc_root(struct btrfs_root *root)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ node = kmalloc(sizeof(*node), GFP_NOFS);
+ if (!node)
+ return -ENOMEM;
+
+ node->bytenr = root->node->start;
+ node->data = root;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ if (rb_node) {
+ btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
+ "for start=%llu while inserting into relocation "
+ "tree", node->bytenr);
+ kfree(node);
+ return -EEXIST;
+ }
+
+ list_add_tail(&root->root_list, &rc->reloc_roots);
+ return 0;
+}
+
+/*
+ * helper to delete the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static void __del_reloc_root(struct btrfs_root *root)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node = NULL;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+ root->node->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+
+ if (!node)
+ return;
+ BUG_ON((struct btrfs_root *)node->data != root);
+
+ spin_lock(&root->fs_info->trans_lock);
+ list_del_init(&root->root_list);
+ spin_unlock(&root->fs_info->trans_lock);
+ kfree(node);
+}
+
+/*
+ * helper to update the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node = NULL;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+ root->node->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ }
+ spin_unlock(&rc->reloc_root_tree.lock);
+
+ if (!node)
+ return 0;
+ BUG_ON((struct btrfs_root *)node->data != root);
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ node->bytenr = new_bytenr;
+ rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, node->bytenr);
+ return 0;
+}
+
+static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_root *reloc_root;
+ struct extent_buffer *eb;
+ struct btrfs_root_item *root_item;
+ struct btrfs_key root_key;
+ u64 last_snap = 0;
+ int ret;
+
+ root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+ BUG_ON(!root_item);
+
+ root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = objectid;
+
+ if (root->root_key.objectid == objectid) {
+ /* called by btrfs_init_reloc_root */
+ ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+
+ last_snap = btrfs_root_last_snapshot(&root->root_item);
+ btrfs_set_root_last_snapshot(&root->root_item,
+ trans->transid - 1);
+ } else {
+ /*
+ * called by btrfs_reloc_post_snapshot_hook.
+ * the source tree is a reloc tree, all tree blocks
+ * modified after it was created have RELOC flag
+ * set in their headers. so it's OK to not update
+ * the 'last_snapshot'.
+ */
+ ret = btrfs_copy_root(trans, root, root->node, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+ }
+
+ memcpy(root_item, &root->root_item, sizeof(*root_item));
+ btrfs_set_root_bytenr(root_item, eb->start);
+ btrfs_set_root_level(root_item, btrfs_header_level(eb));
+ btrfs_set_root_generation(root_item, trans->transid);
+
+ if (root->root_key.objectid == objectid) {
+ btrfs_set_root_refs(root_item, 0);
+ memset(&root_item->drop_progress, 0,
+ sizeof(struct btrfs_disk_key));
+ root_item->drop_level = 0;
+ /*
+ * abuse rtransid, it is safe because it is impossible to
+ * receive data into a relocation tree.
+ */
+ btrfs_set_root_rtransid(root_item, last_snap);
+ btrfs_set_root_otransid(root_item, trans->transid);
+ }
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+ &root_key, root_item);
+ BUG_ON(ret);
+ kfree(root_item);
+
+ reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
+ BUG_ON(IS_ERR(reloc_root));
+ reloc_root->last_trans = trans->transid;
+ return reloc_root;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+ struct btrfs_block_rsv *rsv;
+ int clear_rsv = 0;
+ int ret;
+
+ if (root->reloc_root) {
+ reloc_root = root->reloc_root;
+ reloc_root->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (!rc || !rc->create_reloc_tree ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!trans->reloc_reserved) {
+ rsv = trans->block_rsv;
+ trans->block_rsv = rc->block_rsv;
+ clear_rsv = 1;
+ }
+ reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+ if (clear_rsv)
+ trans->block_rsv = rsv;
+
+ ret = __add_reloc_root(reloc_root);
+ BUG_ON(ret < 0);
+ root->reloc_root = reloc_root;
+ return 0;
+}
+
+/*
+ * update root item of reloc tree
+ */
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct btrfs_root_item *root_item;
+ int ret;
+
+ if (!root->reloc_root)
+ goto out;
+
+ reloc_root = root->reloc_root;
+ root_item = &reloc_root->root_item;
+
+ if (root->fs_info->reloc_ctl->merge_reloc_tree &&
+ btrfs_root_refs(root_item) == 0) {
+ root->reloc_root = NULL;
+ __del_reloc_root(reloc_root);
+ }
+
+ if (reloc_root->commit_root != reloc_root->node) {
+ btrfs_set_root_node(root_item, reloc_root->node);
+ free_extent_buffer(reloc_root->commit_root);
+ reloc_root->commit_root = btrfs_root_node(reloc_root);
+ }
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &reloc_root->root_key, root_item);
+ BUG_ON(ret);
+
+out:
+ return 0;
+}
+
+/*
+ * helper to find first cached inode with inode number >= objectid
+ * in a subvolume
+ */
+static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < btrfs_ino(&entry->vfs_inode))
+ node = node->rb_left;
+ else if (objectid > btrfs_ino(&entry->vfs_inode))
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= btrfs_ino(&entry->vfs_inode)) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ return inode;
+ }
+
+ objectid = btrfs_ino(&entry->vfs_inode) + 1;
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+ return NULL;
+}
+
+static int in_block_group(u64 bytenr,
+ struct btrfs_block_group_cache *block_group)
+{
+ if (bytenr >= block_group->key.objectid &&
+ bytenr < block_group->key.objectid + block_group->key.offset)
+ return 1;
+ return 0;
+}
+
+/*
+ * get new location of data
+ */
+static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
+ u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
+ bytenr, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
+ btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi));
+
+ if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * update file extent items in the tree leaf to point to
+ * the new locations.
+ */
+static noinline_for_stack
+int replace_file_extents(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ struct inode *inode = NULL;
+ u64 parent;
+ u64 bytenr;
+ u64 new_bytenr = 0;
+ u64 num_bytes;
+ u64 end;
+ u32 nritems;
+ u32 i;
+ int ret = 0;
+ int first = 1;
+ int dirty = 0;
+
+ if (rc->stage != UPDATE_DATA_PTRS)
+ return 0;
+
+ /* reloc trees always use full backref */
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ parent = leaf->start;
+ else
+ parent = 0;
+
+ nritems = btrfs_header_nritems(leaf);
+ for (i = 0; i < nritems; i++) {
+ cond_resched();
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ if (bytenr == 0)
+ continue;
+ if (!in_block_group(bytenr, rc->block_group))
+ continue;
+
+ /*
+ * if we are modifying block in fs tree, wait for readpage
+ * to complete and drop the extent cache
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (first) {
+ inode = find_next_inode(root, key.objectid);
+ first = 0;
+ } else if (inode && btrfs_ino(inode) < key.objectid) {
+ btrfs_add_delayed_iput(inode);
+ inode = find_next_inode(root, key.objectid);
+ }
+ if (inode && btrfs_ino(inode) == key.objectid) {
+ end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ WARN_ON(!IS_ALIGNED(key.offset,
+ root->sectorsize));
+ WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+ end--;
+ ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+ key.offset, end);
+ if (!ret)
+ continue;
+
+ btrfs_drop_extent_cache(inode, key.offset, end,
+ 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ key.offset, end);
+ }
+ }
+
+ ret = get_new_location(rc->data_inode, &new_bytenr,
+ bytenr, num_bytes);
+ if (ret) {
+ /*
+ * Don't have to abort since we've not changed anything
+ * in the file extent yet.
+ */
+ break;
+ }
+
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
+ dirty = 1;
+
+ key.offset -= btrfs_file_extent_offset(leaf, fi);
+ ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
+ num_bytes, parent,
+ btrfs_header_owner(leaf),
+ key.objectid, key.offset, 1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ break;
+ }
+
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ parent, btrfs_header_owner(leaf),
+ key.objectid, key.offset, 1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ break;
+ }
+ }
+ if (dirty)
+ btrfs_mark_buffer_dirty(leaf);
+ if (inode)
+ btrfs_add_delayed_iput(inode);
+ return ret;
+}
+
+static noinline_for_stack
+int memcmp_node_keys(struct extent_buffer *eb, int slot,
+ struct btrfs_path *path, int level)
+{
+ struct btrfs_disk_key key1;
+ struct btrfs_disk_key key2;
+ btrfs_node_key(eb, &key1, slot);
+ btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
+ return memcmp(&key1, &key2, sizeof(key1));
+}
+
+/*
+ * try to replace tree blocks in fs tree with the new blocks
+ * in reloc tree. tree blocks haven't been modified since the
+ * reloc tree was create can be replaced.
+ *
+ * if a block was replaced, level of the block + 1 is returned.
+ * if no block got replaced, 0 is returned. if there are other
+ * errors, a negative error number is returned.
+ */
+static noinline_for_stack
+int replace_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *dest, struct btrfs_root *src,
+ struct btrfs_path *path, struct btrfs_key *next_key,
+ int lowest_level, int max_level)
+{
+ struct extent_buffer *eb;
+ struct extent_buffer *parent;
+ struct btrfs_key key;
+ u64 old_bytenr;
+ u64 new_bytenr;
+ u64 old_ptr_gen;
+ u64 new_ptr_gen;
+ u64 last_snapshot;
+ u32 blocksize;
+ int cow = 0;
+ int level;
+ int ret;
+ int slot;
+
+ BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+
+ last_snapshot = btrfs_root_last_snapshot(&src->root_item);
+again:
+ slot = path->slots[lowest_level];
+ btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
+
+ eb = btrfs_lock_root_node(dest);
+ btrfs_set_lock_blocking(eb);
+ level = btrfs_header_level(eb);
+
+ if (level < lowest_level) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ return 0;
+ }
+
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ BUG_ON(ret);
+ }
+ btrfs_set_lock_blocking(eb);
+
+ if (next_key) {
+ next_key->objectid = (u64)-1;
+ next_key->type = (u8)-1;
+ next_key->offset = (u64)-1;
+ }
+
+ parent = eb;
+ while (1) {
+ level = btrfs_header_level(parent);
+ BUG_ON(level < lowest_level);
+
+ ret = btrfs_bin_search(parent, &key, level, &slot);
+ if (ret && slot > 0)
+ slot--;
+
+ if (next_key && slot + 1 < btrfs_header_nritems(parent))
+ btrfs_node_key_to_cpu(parent, next_key, slot + 1);
+
+ old_bytenr = btrfs_node_blockptr(parent, slot);
+ blocksize = dest->nodesize;
+ old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
+
+ if (level <= max_level) {
+ eb = path->nodes[level];
+ new_bytenr = btrfs_node_blockptr(eb,
+ path->slots[level]);
+ new_ptr_gen = btrfs_node_ptr_generation(eb,
+ path->slots[level]);
+ } else {
+ new_bytenr = 0;
+ new_ptr_gen = 0;
+ }
+
+ if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
+ ret = level;
+ break;
+ }
+
+ if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
+ memcmp_node_keys(parent, slot, path, level)) {
+ if (level <= lowest_level) {
+ ret = 0;
+ break;
+ }
+
+ eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ ret = (!eb) ? -ENOMEM : -EIO;
+ free_extent_buffer(eb);
+ break;
+ }
+ btrfs_tree_lock(eb);
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, parent,
+ slot, &eb);
+ BUG_ON(ret);
+ }
+ btrfs_set_lock_blocking(eb);
+
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+
+ parent = eb;
+ continue;
+ }
+
+ if (!cow) {
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ cow = 1;
+ goto again;
+ }
+
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ btrfs_release_path(path);
+
+ path->lowest_level = level;
+ ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
+ path->lowest_level = 0;
+ BUG_ON(ret);
+
+ /*
+ * swap blocks in fs tree and reloc tree.
+ */
+ btrfs_set_node_blockptr(parent, slot, new_bytenr);
+ btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
+ btrfs_mark_buffer_dirty(parent);
+
+ btrfs_set_node_blockptr(path->nodes[level],
+ path->slots[level], old_bytenr);
+ btrfs_set_node_ptr_generation(path->nodes[level],
+ path->slots[level], old_ptr_gen);
+ btrfs_mark_buffer_dirty(path->nodes[level]);
+
+ ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
+ path->nodes[level]->start,
+ src->root_key.objectid, level - 1, 0,
+ 1);
+ BUG_ON(ret);
+ ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
+ 0, dest->root_key.objectid, level - 1,
+ 0, 1);
+ BUG_ON(ret);
+
+ ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
+ path->nodes[level]->start,
+ src->root_key.objectid, level - 1, 0,
+ 1);
+ BUG_ON(ret);
+
+ ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
+ 0, dest->root_key.objectid, level - 1,
+ 0, 1);
+ BUG_ON(ret);
+
+ btrfs_unlock_up_safe(path, 0);
+
+ ret = level;
+ break;
+ }
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ return ret;
+}
+
+/*
+ * helper to find next relocated block in reloc tree
+ */
+static noinline_for_stack
+int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+ int *level)
+{
+ struct extent_buffer *eb;
+ int i;
+ u64 last_snapshot;
+ u32 nritems;
+
+ last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+ for (i = 0; i < *level; i++) {
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+
+ for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+ eb = path->nodes[i];
+ nritems = btrfs_header_nritems(eb);
+ while (path->slots[i] + 1 < nritems) {
+ path->slots[i]++;
+ if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
+ last_snapshot)
+ continue;
+
+ *level = i;
+ return 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+ return 1;
+}
+
+/*
+ * walk down reloc tree to find relocated block of lowest level
+ */
+static noinline_for_stack
+int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+ int *level)
+{
+ struct extent_buffer *eb = NULL;
+ int i;
+ u64 bytenr;
+ u64 ptr_gen = 0;
+ u64 last_snapshot;
+ u32 nritems;
+
+ last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+ for (i = *level; i > 0; i--) {
+ eb = path->nodes[i];
+ nritems = btrfs_header_nritems(eb);
+ while (path->slots[i] < nritems) {
+ ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
+ if (ptr_gen > last_snapshot)
+ break;
+ path->slots[i]++;
+ }
+ if (path->slots[i] >= nritems) {
+ if (i == *level)
+ break;
+ *level = i + 1;
+ return 0;
+ }
+ if (i == 1) {
+ *level = i;
+ return 0;
+ }
+
+ bytenr = btrfs_node_blockptr(eb, path->slots[i]);
+ eb = read_tree_block(root, bytenr, ptr_gen);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
+ BUG_ON(btrfs_header_level(eb) != i - 1);
+ path->nodes[i - 1] = eb;
+ path->slots[i - 1] = 0;
+ }
+ return 1;
+}
+
+/*
+ * invalidate extent cache for file extents whose key in range of
+ * [min_key, max_key)
+ */
+static int invalidate_extent_cache(struct btrfs_root *root,
+ struct btrfs_key *min_key,
+ struct btrfs_key *max_key)
+{
+ struct inode *inode = NULL;
+ u64 objectid;
+ u64 start, end;
+ u64 ino;
+
+ objectid = min_key->objectid;
+ while (1) {
+ cond_resched();
+ iput(inode);
+
+ if (objectid > max_key->objectid)
+ break;
+
+ inode = find_next_inode(root, objectid);
+ if (!inode)
+ break;
+ ino = btrfs_ino(inode);
+
+ if (ino > max_key->objectid) {
+ iput(inode);
+ break;
+ }
+
+ objectid = ino + 1;
+ if (!S_ISREG(inode->i_mode))
+ continue;
+
+ if (unlikely(min_key->objectid == ino)) {
+ if (min_key->type > BTRFS_EXTENT_DATA_KEY)
+ continue;
+ if (min_key->type < BTRFS_EXTENT_DATA_KEY)
+ start = 0;
+ else {
+ start = min_key->offset;
+ WARN_ON(!IS_ALIGNED(start, root->sectorsize));
+ }
+ } else {
+ start = 0;
+ }
+
+ if (unlikely(max_key->objectid == ino)) {
+ if (max_key->type < BTRFS_EXTENT_DATA_KEY)
+ continue;
+ if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
+ end = (u64)-1;
+ } else {
+ if (max_key->offset == 0)
+ continue;
+ end = max_key->offset;
+ WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+ end--;
+ }
+ } else {
+ end = (u64)-1;
+ }
+
+ /* the lock_extent waits for readpage to complete */
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ btrfs_drop_extent_cache(inode, start, end, 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ }
+ return 0;
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key)
+
+{
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level])
+ break;
+ if (path->slots[level] + 1 <
+ btrfs_header_nritems(path->nodes[level])) {
+ btrfs_node_key_to_cpu(path->nodes[level], key,
+ path->slots[level] + 1);
+ return 0;
+ }
+ level++;
+ }
+ return 1;
+}
+
+/*
+ * merge the relocated tree blocks in reloc tree with corresponding
+ * fs tree.
+ */
+static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
+ struct btrfs_root *root)
+{
+ LIST_HEAD(inode_list);
+ struct btrfs_key key;
+ struct btrfs_key next_key;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root_item *root_item;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int level;
+ int max_level;
+ int replaced = 0;
+ int ret;
+ int err = 0;
+ u32 min_reserved;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 1;
+
+ reloc_root = root->reloc_root;
+ root_item = &reloc_root->root_item;
+
+ if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+ level = btrfs_root_level(root_item);
+ extent_buffer_get(reloc_root->node);
+ path->nodes[level] = reloc_root->node;
+ path->slots[level] = 0;
+ } else {
+ btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+
+ level = root_item->drop_level;
+ BUG_ON(level == 0);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ btrfs_node_key_to_cpu(path->nodes[level], &next_key,
+ path->slots[level]);
+ WARN_ON(memcmp(&key, &next_key, sizeof(key)));
+
+ btrfs_unlock_up_safe(path, 0);
+ }
+
+ min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ memset(&next_key, 0, sizeof(next_key));
+
+ while (1) {
+ ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+ trans->block_rsv = rc->block_rsv;
+
+ replaced = 0;
+ max_level = level;
+
+ ret = walk_down_reloc_tree(reloc_root, path, &level);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0)
+ break;
+
+ if (!find_next_key(path, level, &key) &&
+ btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
+ ret = 0;
+ } else {
+ ret = replace_path(trans, root, reloc_root, path,
+ &next_key, level, max_level);
+ }
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ if (ret > 0) {
+ level = ret;
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ replaced = 1;
+ }
+
+ ret = walk_up_reloc_tree(reloc_root, path, &level);
+ if (ret > 0)
+ break;
+
+ BUG_ON(level == 0);
+ /*
+ * save the merging progress in the drop_progress.
+ * this is OK since root refs == 1 in this case.
+ */
+ btrfs_node_key(path->nodes[level], &root_item->drop_progress,
+ path->slots[level]);
+ root_item->drop_level = level;
+
+ btrfs_end_transaction_throttle(trans, root);
+ trans = NULL;
+
+ btrfs_btree_balance_dirty(root);
+
+ if (replaced && rc->stage == UPDATE_DATA_PTRS)
+ invalidate_extent_cache(root, &key, &next_key);
+ }
+
+ /*
+ * handle the case only one block in the fs tree need to be
+ * relocated and the block is tree root.
+ */
+ leaf = btrfs_lock_root_node(root);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(leaf);
+ if (ret < 0)
+ err = ret;
+out:
+ btrfs_free_path(path);
+
+ if (err == 0) {
+ memset(&root_item->drop_progress, 0,
+ sizeof(root_item->drop_progress));
+ root_item->drop_level = 0;
+ btrfs_set_root_refs(root_item, 0);
+ btrfs_update_reloc_root(trans, root);
+ }
+
+ if (trans)
+ btrfs_end_transaction_throttle(trans, root);
+
+ btrfs_btree_balance_dirty(root);
+
+ if (replaced && rc->stage == UPDATE_DATA_PTRS)
+ invalidate_extent_cache(root, &key, &next_key);
+
+ return err;
+}
+
+static noinline_for_stack
+int prepare_to_merge(struct reloc_control *rc, int err)
+{
+ struct btrfs_root *root = rc->extent_root;
+ struct btrfs_root *reloc_root;
+ struct btrfs_trans_handle *trans;
+ LIST_HEAD(reloc_roots);
+ u64 num_bytes = 0;
+ int ret;
+
+ mutex_lock(&root->fs_info->reloc_mutex);
+ rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ rc->merging_rsv_size += rc->nodes_relocated * 2;
+ mutex_unlock(&root->fs_info->reloc_mutex);
+
+again:
+ if (!err) {
+ num_bytes = rc->merging_rsv_size;
+ ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret)
+ err = ret;
+ }
+
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans)) {
+ if (!err)
+ btrfs_block_rsv_release(rc->extent_root,
+ rc->block_rsv, num_bytes);
+ return PTR_ERR(trans);
+ }
+
+ if (!err) {
+ if (num_bytes != rc->merging_rsv_size) {
+ btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_block_rsv_release(rc->extent_root,
+ rc->block_rsv, num_bytes);
+ goto again;
+ }
+ }
+
+ rc->merge_reloc_tree = 1;
+
+ while (!list_empty(&rc->reloc_roots)) {
+ reloc_root = list_entry(rc->reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del_init(&reloc_root->root_list);
+
+ root = read_fs_root(reloc_root->fs_info,
+ reloc_root->root_key.offset);
+ BUG_ON(IS_ERR(root));
+ BUG_ON(root->reloc_root != reloc_root);
+
+ /*
+ * set reference count to 1, so btrfs_recover_relocation
+ * knows it should resumes merging
+ */
+ if (!err)
+ btrfs_set_root_refs(&reloc_root->root_item, 1);
+ btrfs_update_reloc_root(trans, root);
+
+ list_add(&reloc_root->root_list, &reloc_roots);
+ }
+
+ list_splice(&reloc_roots, &rc->reloc_roots);
+
+ if (!err)
+ btrfs_commit_transaction(trans, rc->extent_root);
+ else
+ btrfs_end_transaction(trans, rc->extent_root);
+ return err;
+}
+
+static noinline_for_stack
+void free_reloc_roots(struct list_head *list)
+{
+ struct btrfs_root *reloc_root;
+
+ while (!list_empty(list)) {
+ reloc_root = list_entry(list->next, struct btrfs_root,
+ root_list);
+ __del_reloc_root(reloc_root);
+ }
+}
+
+static noinline_for_stack
+void merge_reloc_roots(struct reloc_control *rc)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *reloc_root;
+ u64 last_snap;
+ u64 otransid;
+ u64 objectid;
+ LIST_HEAD(reloc_roots);
+ int found = 0;
+ int ret = 0;
+again:
+ root = rc->extent_root;
+
+ /*
+ * this serializes us with btrfs_record_root_in_transaction,
+ * we have to make sure nobody is in the middle of
+ * adding their roots to the list while we are
+ * doing this splice
+ */
+ mutex_lock(&root->fs_info->reloc_mutex);
+ list_splice_init(&rc->reloc_roots, &reloc_roots);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+
+ while (!list_empty(&reloc_roots)) {
+ found = 1;
+ reloc_root = list_entry(reloc_roots.next,
+ struct btrfs_root, root_list);
+
+ if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+ root = read_fs_root(reloc_root->fs_info,
+ reloc_root->root_key.offset);
+ BUG_ON(IS_ERR(root));
+ BUG_ON(root->reloc_root != reloc_root);
+
+ ret = merge_reloc_root(rc, root);
+ if (ret) {
+ if (list_empty(&reloc_root->root_list))
+ list_add_tail(&reloc_root->root_list,
+ &reloc_roots);
+ goto out;
+ }
+ } else {
+ list_del_init(&reloc_root->root_list);
+ }
+
+ /*
+ * we keep the old last snapshod transid in rtranid when we
+ * created the relocation tree.
+ */
+ last_snap = btrfs_root_rtransid(&reloc_root->root_item);
+ otransid = btrfs_root_otransid(&reloc_root->root_item);
+ objectid = reloc_root->root_key.offset;
+
+ ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
+ if (ret < 0) {
+ if (list_empty(&reloc_root->root_list))
+ list_add_tail(&reloc_root->root_list,
+ &reloc_roots);
+ goto out;
+ }
+ }
+
+ if (found) {
+ found = 0;
+ goto again;
+ }
+out:
+ if (ret) {
+ btrfs_std_error(root->fs_info, ret);
+ if (!list_empty(&reloc_roots))
+ free_reloc_roots(&reloc_roots);
+
+ /* new reloc root may be added */
+ mutex_lock(&root->fs_info->reloc_mutex);
+ list_splice_init(&rc->reloc_roots, &reloc_roots);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ if (!list_empty(&reloc_roots))
+ free_reloc_roots(&reloc_roots);
+ }
+
+ BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+}
+
+static void free_block_list(struct rb_root *blocks)
+{
+ struct tree_block *block;
+ struct rb_node *rb_node;
+ while ((rb_node = rb_first(blocks))) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+ rb_erase(rb_node, blocks);
+ kfree(block);
+ }
+}
+
+static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *reloc_root)
+{
+ struct btrfs_root *root;
+
+ if (reloc_root->last_trans == trans->transid)
+ return 0;
+
+ root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
+ BUG_ON(IS_ERR(root));
+ BUG_ON(root->reloc_root != reloc_root);
+
+ return btrfs_record_root_in_trans(trans, root);
+}
+
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node,
+ struct backref_edge *edges[])
+{
+ struct backref_node *next;
+ struct btrfs_root *root;
+ int index = 0;
+
+ next = node;
+ while (1) {
+ cond_resched();
+ next = walk_up_backref(next, edges, &index);
+ root = next->root;
+ BUG_ON(!root);
+ BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ record_reloc_root_in_trans(trans, root);
+ break;
+ }
+
+ btrfs_record_root_in_trans(trans, root);
+ root = root->reloc_root;
+
+ if (next->new_bytenr != root->node->start) {
+ BUG_ON(next->new_bytenr);
+ BUG_ON(!list_empty(&next->list));
+ next->new_bytenr = root->node->start;
+ next->root = root;
+ list_add_tail(&next->list,
+ &rc->backref_cache.changed);
+ __mark_block_processed(rc, next);
+ break;
+ }
+
+ WARN_ON(1);
+ root = NULL;
+ next = walk_down_backref(edges, &index);
+ if (!next || next->level <= node->level)
+ break;
+ }
+ if (!root)
+ return NULL;
+
+ next = node;
+ /* setup backref node path for btrfs_reloc_cow_block */
+ while (1) {
+ rc->backref_cache.path[next->level] = next;
+ if (--index < 0)
+ break;
+ next = edges[index]->node[UPPER];
+ }
+ return root;
+}
+
+/*
+ * select a tree root for relocation. return NULL if the block
+ * is reference counted. we should use do_relocation() in this
+ * case. return a tree root pointer if the block isn't reference
+ * counted. return -ENOENT if the block is root of reloc tree.
+ */
+static noinline_for_stack
+struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
+ struct backref_node *node)
+{
+ struct backref_node *next;
+ struct btrfs_root *root;
+ struct btrfs_root *fs_root = NULL;
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ int index = 0;
+
+ next = node;
+ while (1) {
+ cond_resched();
+ next = walk_up_backref(next, edges, &index);
+ root = next->root;
+ BUG_ON(!root);
+
+ /* no other choice for non-references counted tree */
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+ fs_root = root;
+
+ if (next != node)
+ return NULL;
+
+ next = walk_down_backref(edges, &index);
+ if (!next || next->level <= node->level)
+ break;
+ }
+
+ if (!fs_root)
+ return ERR_PTR(-ENOENT);
+ return fs_root;
+}
+
+static noinline_for_stack
+u64 calcu_metadata_size(struct reloc_control *rc,
+ struct backref_node *node, int reserve)
+{
+ struct backref_node *next = node;
+ struct backref_edge *edge;
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ u64 num_bytes = 0;
+ int index = 0;
+
+ BUG_ON(reserve && node->processed);
+
+ while (next) {
+ cond_resched();
+ while (1) {
+ if (next->processed && (reserve || next != node))
+ break;
+
+ num_bytes += rc->extent_root->nodesize;
+
+ if (list_empty(&next->upper))
+ break;
+
+ edge = list_entry(next->upper.next,
+ struct backref_edge, list[LOWER]);
+ edges[index++] = edge;
+ next = edge->node[UPPER];
+ }
+ next = walk_down_backref(edges, &index);
+ }
+ return num_bytes;
+}
+
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node)
+{
+ struct btrfs_root *root = rc->extent_root;
+ u64 num_bytes;
+ int ret;
+ u64 tmp;
+
+ num_bytes = calcu_metadata_size(rc, node, 1) * 2;
+
+ trans->block_rsv = rc->block_rsv;
+ rc->reserved_bytes += num_bytes;
+ ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret) {
+ if (ret == -EAGAIN) {
+ tmp = rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+ while (tmp <= rc->reserved_bytes)
+ tmp <<= 1;
+ /*
+ * only one thread can access block_rsv at this point,
+ * so we don't need hold lock to protect block_rsv.
+ * we expand more reservation size here to allow enough
+ * space for relocation and we will return eailer in
+ * enospc case.
+ */
+ rc->block_rsv->size = tmp + rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+ }
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * relocate a block tree, and then update pointers in upper level
+ * blocks that reference the block to point to the new location.
+ *
+ * if called by link_to_upper, the block has already been relocated.
+ * in that case this function just updates pointers.
+ */
+static int do_relocation(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node,
+ struct btrfs_key *key,
+ struct btrfs_path *path, int lowest)
+{
+ struct backref_node *upper;
+ struct backref_edge *edge;
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_root *root;
+ struct extent_buffer *eb;
+ u32 blocksize;
+ u64 bytenr;
+ u64 generation;
+ int slot;
+ int ret;
+ int err = 0;
+
+ BUG_ON(lowest && node->eb);
+
+ path->lowest_level = node->level + 1;
+ rc->backref_cache.path[node->level] = node;
+ list_for_each_entry(edge, &node->upper, list[LOWER]) {
+ cond_resched();
+
+ upper = edge->node[UPPER];
+ root = select_reloc_root(trans, rc, upper, edges);
+ BUG_ON(!root);
+
+ if (upper->eb && !upper->locked) {
+ if (!lowest) {
+ ret = btrfs_bin_search(upper->eb, key,
+ upper->level, &slot);
+ BUG_ON(ret);
+ bytenr = btrfs_node_blockptr(upper->eb, slot);
+ if (node->eb->start == bytenr)
+ goto next;
+ }
+ drop_node_buffer(upper);
+ }
+
+ if (!upper->eb) {
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ BUG_ON(ret > 0);
+
+ if (!upper->eb) {
+ upper->eb = path->nodes[upper->level];
+ path->nodes[upper->level] = NULL;
+ } else {
+ BUG_ON(upper->eb != path->nodes[upper->level]);
+ }
+
+ upper->locked = 1;
+ path->locks[upper->level] = 0;
+
+ slot = path->slots[upper->level];
+ btrfs_release_path(path);
+ } else {
+ ret = btrfs_bin_search(upper->eb, key, upper->level,
+ &slot);
+ BUG_ON(ret);
+ }
+
+ bytenr = btrfs_node_blockptr(upper->eb, slot);
+ if (lowest) {
+ BUG_ON(bytenr != node->bytenr);
+ } else {
+ if (node->eb->start == bytenr)
+ goto next;
+ }
+
+ blocksize = root->nodesize;
+ generation = btrfs_node_ptr_generation(upper->eb, slot);
+ eb = read_tree_block(root, bytenr, generation);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ err = -EIO;
+ goto next;
+ }
+ btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
+
+ if (!node->eb) {
+ ret = btrfs_cow_block(trans, root, eb, upper->eb,
+ slot, &eb);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ if (ret < 0) {
+ err = ret;
+ goto next;
+ }
+ BUG_ON(node->eb != eb);
+ } else {
+ btrfs_set_node_blockptr(upper->eb, slot,
+ node->eb->start);
+ btrfs_set_node_ptr_generation(upper->eb, slot,
+ trans->transid);
+ btrfs_mark_buffer_dirty(upper->eb);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ node->eb->start, blocksize,
+ upper->eb->start,
+ btrfs_header_owner(upper->eb),
+ node->level, 0, 1);
+ BUG_ON(ret);
+
+ ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
+ BUG_ON(ret);
+ }
+next:
+ if (!upper->pending)
+ drop_node_buffer(upper);
+ else
+ unlock_node_buffer(upper);
+ if (err)
+ break;
+ }
+
+ if (!err && node->pending) {
+ drop_node_buffer(node);
+ list_move_tail(&node->list, &rc->backref_cache.changed);
+ node->pending = 0;
+ }
+
+ path->lowest_level = 0;
+ BUG_ON(err == -ENOSPC);
+ return err;
+}
+
+static int link_to_upper(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+
+ btrfs_node_key_to_cpu(node->eb, &key, 0);
+ return do_relocation(trans, rc, node, &key, path, 0);
+}
+
+static int finish_pending_nodes(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_path *path, int err)
+{
+ LIST_HEAD(list);
+ struct backref_cache *cache = &rc->backref_cache;
+ struct backref_node *node;
+ int level;
+ int ret;
+
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ while (!list_empty(&cache->pending[level])) {
+ node = list_entry(cache->pending[level].next,
+ struct backref_node, list);
+ list_move_tail(&node->list, &list);
+ BUG_ON(!node->pending);
+
+ if (!err) {
+ ret = link_to_upper(trans, rc, node, path);
+ if (ret < 0)
+ err = ret;
+ }
+ }
+ list_splice_init(&list, &cache->pending[level]);
+ }
+ return err;
+}
+
+static void mark_block_processed(struct reloc_control *rc,
+ u64 bytenr, u32 blocksize)
+{
+ set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
+ EXTENT_DIRTY, GFP_NOFS);
+}
+
+static void __mark_block_processed(struct reloc_control *rc,
+ struct backref_node *node)
+{
+ u32 blocksize;
+ if (node->level == 0 ||
+ in_block_group(node->bytenr, rc->block_group)) {
+ blocksize = rc->extent_root->nodesize;
+ mark_block_processed(rc, node->bytenr, blocksize);
+ }
+ node->processed = 1;
+}
+
+/*
+ * mark a block and all blocks directly/indirectly reference the block
+ * as processed.
+ */
+static void update_processed_blocks(struct reloc_control *rc,
+ struct backref_node *node)
+{
+ struct backref_node *next = node;
+ struct backref_edge *edge;
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ int index = 0;
+
+ while (next) {
+ cond_resched();
+ while (1) {
+ if (next->processed)
+ break;
+
+ __mark_block_processed(rc, next);
+
+ if (list_empty(&next->upper))
+ break;
+
+ edge = list_entry(next->upper.next,
+ struct backref_edge, list[LOWER]);
+ edges[index++] = edge;
+ next = edge->node[UPPER];
+ }
+ next = walk_down_backref(edges, &index);
+ }
+}
+
+static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
+{
+ u32 blocksize = rc->extent_root->nodesize;
+
+ if (test_range_bit(&rc->processed_blocks, bytenr,
+ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+ return 1;
+ return 0;
+}
+
+static int get_tree_block_key(struct reloc_control *rc,
+ struct tree_block *block)
+{
+ struct extent_buffer *eb;
+
+ BUG_ON(block->key_ready);
+ eb = read_tree_block(rc->extent_root, block->bytenr,
+ block->key.offset);
+ if (!eb || !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ return -EIO;
+ }
+ WARN_ON(btrfs_header_level(eb) != block->level);
+ if (block->level == 0)
+ btrfs_item_key_to_cpu(eb, &block->key, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &block->key, 0);
+ free_extent_buffer(eb);
+ block->key_ready = 1;
+ return 0;
+}
+
+/*
+ * helper function to relocate a tree block
+ */
+static int relocate_tree_block(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node,
+ struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root;
+ int ret = 0;
+
+ if (!node)
+ return 0;
+
+ BUG_ON(node->processed);
+ root = select_one_root(trans, node);
+ if (root == ERR_PTR(-ENOENT)) {
+ update_processed_blocks(rc, node);
+ goto out;
+ }
+
+ if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+ ret = reserve_metadata_space(trans, rc, node);
+ if (ret)
+ goto out;
+ }
+
+ if (root) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+ BUG_ON(node->new_bytenr);
+ BUG_ON(!list_empty(&node->list));
+ btrfs_record_root_in_trans(trans, root);
+ root = root->reloc_root;
+ node->new_bytenr = root->node->start;
+ node->root = root;
+ list_add_tail(&node->list, &rc->backref_cache.changed);
+ } else {
+ path->lowest_level = node->level;
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ btrfs_release_path(path);
+ if (ret > 0)
+ ret = 0;
+ }
+ if (!ret)
+ update_processed_blocks(rc, node);
+ } else {
+ ret = do_relocation(trans, rc, node, key, path, 1);
+ }
+out:
+ if (ret || node->level == 0 || node->cowonly)
+ remove_backref_node(&rc->backref_cache, node);
+ return ret;
+}
+
+/*
+ * relocate a list of blocks
+ */
+static noinline_for_stack
+int relocate_tree_blocks(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc, struct rb_root *blocks)
+{
+ struct backref_node *node;
+ struct btrfs_path *path;
+ struct tree_block *block;
+ struct rb_node *rb_node;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out_free_blocks;
+ }
+
+ rb_node = rb_first(blocks);
+ while (rb_node) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+ if (!block->key_ready)
+ readahead_tree_block(rc->extent_root, block->bytenr);
+ rb_node = rb_next(rb_node);
+ }
+
+ rb_node = rb_first(blocks);
+ while (rb_node) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+ if (!block->key_ready) {
+ err = get_tree_block_key(rc, block);
+ if (err)
+ goto out_free_path;
+ }
+ rb_node = rb_next(rb_node);
+ }
+
+ rb_node = rb_first(blocks);
+ while (rb_node) {
+ block = rb_entry(rb_node, struct tree_block, rb_node);
+
+ node = build_backref_tree(rc, &block->key,
+ block->level, block->bytenr);
+ if (IS_ERR(node)) {
+ err = PTR_ERR(node);
+ goto out;
+ }
+
+ ret = relocate_tree_block(trans, rc, node, &block->key,
+ path);
+ if (ret < 0) {
+ if (ret != -EAGAIN || rb_node == rb_first(blocks))
+ err = ret;
+ goto out;
+ }
+ rb_node = rb_next(rb_node);
+ }
+out:
+ err = finish_pending_nodes(trans, rc, path, err);
+
+out_free_path:
+ btrfs_free_path(path);
+out_free_blocks:
+ free_block_list(blocks);
+ return err;
+}
+
+static noinline_for_stack
+int prealloc_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ u64 alloc_hint = 0;
+ u64 start;
+ u64 end;
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 num_bytes;
+ int nr = 0;
+ int ret = 0;
+
+ BUG_ON(cluster->start != cluster->boundary[0]);
+ mutex_lock(&inode->i_mutex);
+
+ ret = btrfs_check_data_free_space(inode, cluster->end +
+ 1 - cluster->start, 0);
+ if (ret)
+ goto out;
+
+ while (nr < cluster->nr) {
+ start = cluster->boundary[nr] - offset;
+ if (nr + 1 < cluster->nr)
+ end = cluster->boundary[nr + 1] - 1 - offset;
+ else
+ end = cluster->end - offset;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ num_bytes = end + 1 - start;
+ ret = btrfs_prealloc_file_range(inode, 0, start,
+ num_bytes, num_bytes,
+ end + 1, &alloc_hint);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ if (ret)
+ break;
+ nr++;
+ }
+ btrfs_free_reserved_data_space(inode, cluster->end +
+ 1 - cluster->start);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
+static noinline_for_stack
+int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
+ u64 block_start)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret = 0;
+
+ em = alloc_extent_map();
+ if (!em)
+ return -ENOMEM;
+
+ em->start = start;
+ em->len = end + 1 - start;
+ em->block_len = em->len;
+ em->block_start = block_start;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ u64 page_start;
+ u64 page_end;
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ unsigned long index;
+ unsigned long last_index;
+ struct page *page;
+ struct file_ra_state *ra;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ int nr = 0;
+ int ret = 0;
+
+ if (!cluster->nr)
+ return 0;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+
+ ret = prealloc_file_extent_cluster(inode, cluster);
+ if (ret)
+ goto out;
+
+ file_ra_state_init(ra, inode->i_mapping);
+
+ ret = setup_extent_mapping(inode, cluster->start - offset,
+ cluster->end - offset, cluster->start);
+ if (ret)
+ goto out;
+
+ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+ while (index <= last_index) {
+ ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto out;
+
+ page = find_lock_page(inode->i_mapping, index);
+ if (!page) {
+ page_cache_sync_readahead(inode->i_mapping,
+ ra, NULL, index,
+ last_index + 1 - index);
+ page = find_or_create_page(inode->i_mapping, index,
+ mask);
+ if (!page) {
+ btrfs_delalloc_release_metadata(inode,
+ PAGE_CACHE_SIZE);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(inode->i_mapping,
+ ra, NULL, page, index,
+ last_index + 1 - index);
+ }
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ btrfs_delalloc_release_metadata(inode,
+ PAGE_CACHE_SIZE);
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
+
+ set_page_extent_mapped(page);
+
+ if (nr < cluster->nr &&
+ page_start + offset == cluster->boundary[nr]) {
+ set_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end,
+ EXTENT_BOUNDARY, GFP_NOFS);
+ nr++;
+ }
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
+ set_page_dirty(page);
+
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ page_start, page_end);
+ unlock_page(page);
+ page_cache_release(page);
+
+ index++;
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ btrfs_throttle(BTRFS_I(inode)->root);
+ }
+ WARN_ON(nr != cluster->nr);
+out:
+ kfree(ra);
+ return ret;
+}
+
+static noinline_for_stack
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+ struct file_extent_cluster *cluster)
+{
+ int ret;
+
+ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ }
+
+ if (!cluster->nr)
+ cluster->start = extent_key->objectid;
+ else
+ BUG_ON(cluster->nr >= MAX_EXTENTS);
+ cluster->end = extent_key->objectid + extent_key->offset - 1;
+ cluster->boundary[cluster->nr] = extent_key->objectid;
+ cluster->nr++;
+
+ if (cluster->nr >= MAX_EXTENTS) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ }
+ return 0;
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int get_ref_objectid_v0(struct reloc_control *rc,
+ struct btrfs_path *path,
+ struct btrfs_key *extent_key,
+ u64 *ref_objectid, int *path_change)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_ref_v0 *ref0;
+ int ret;
+ int slot;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ while (1) {
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(rc->extent_root, path);
+ if (ret < 0)
+ return ret;
+ BUG_ON(ret > 0);
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (path_change)
+ *path_change = 1;
+ }
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != extent_key->objectid)
+ return -ENOENT;
+
+ if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
+ slot++;
+ continue;
+ }
+ ref0 = btrfs_item_ptr(leaf, slot,
+ struct btrfs_extent_ref_v0);
+ *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
+ break;
+ }
+ return 0;
+}
+#endif
+
+/*
+ * helper to add a tree block to the list.
+ * the major work is getting the generation and level of the block
+ */
+static int add_tree_block(struct reloc_control *rc,
+ struct btrfs_key *extent_key,
+ struct btrfs_path *path,
+ struct rb_root *blocks)
+{
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct btrfs_tree_block_info *bi;
+ struct tree_block *block;
+ struct rb_node *rb_node;
+ u32 item_size;
+ int level = -1;
+ u64 generation;
+
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+ if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
+ item_size >= sizeof(*ei) + sizeof(*bi)) {
+ ei = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_extent_item);
+ if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
+ bi = (struct btrfs_tree_block_info *)(ei + 1);
+ level = btrfs_tree_block_level(eb, bi);
+ } else {
+ level = (int)extent_key->offset;
+ }
+ generation = btrfs_extent_generation(eb, ei);
+ } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ u64 ref_owner;
+ int ret;
+
+ BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+ ret = get_ref_objectid_v0(rc, path, extent_key,
+ &ref_owner, NULL);
+ if (ret < 0)
+ return ret;
+ BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
+ level = (int)ref_owner;
+ /* FIXME: get real generation */
+ generation = 0;
+#else
+ BUG();
+#endif
+ }
+
+ btrfs_release_path(path);
+
+ BUG_ON(level == -1);
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block)
+ return -ENOMEM;
+
+ block->bytenr = extent_key->objectid;
+ block->key.objectid = rc->extent_root->nodesize;
+ block->key.offset = generation;
+ block->level = level;
+ block->key_ready = 0;
+
+ rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, block->bytenr);
+
+ return 0;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
+ */
+static int __add_tree_block(struct reloc_control *rc,
+ u64 bytenr, u32 blocksize,
+ struct rb_root *blocks)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+ bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
+ SKINNY_METADATA);
+
+ if (tree_block_processed(bytenr, rc))
+ return 0;
+
+ if (tree_search(blocks, bytenr))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
+ key.objectid = bytenr;
+ if (skinny) {
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+ } else {
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = blocksize;
+ }
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0 && skinny) {
+ if (path->slots[0]) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == bytenr &&
+ (key.type == BTRFS_METADATA_ITEM_KEY ||
+ (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == blocksize)))
+ ret = 0;
+ }
+
+ if (ret) {
+ skinny = false;
+ btrfs_release_path(path);
+ goto again;
+ }
+ }
+ BUG_ON(ret);
+
+ ret = add_tree_block(rc, &key, path, blocks);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper to check if the block use full backrefs for pointers in it
+ */
+static int block_use_full_backref(struct reloc_control *rc,
+ struct extent_buffer *eb)
+{
+ u64 flags;
+ int ret;
+
+ if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
+ btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
+ return 1;
+
+ ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
+ eb->start, btrfs_header_level(eb), 1,
+ NULL, &flags);
+ BUG_ON(ret);
+
+ if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ ret = 1;
+ else
+ ret = 0;
+ return ret;
+}
+
+static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct inode *inode,
+ u64 ino)
+{
+ struct btrfs_key key;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (inode)
+ goto truncate;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ if (IS_ERR(inode) || is_bad_inode(inode)) {
+ if (!IS_ERR(inode))
+ iput(inode);
+ return -ENOENT;
+ }
+
+truncate:
+ ret = btrfs_check_trunc_cache_free_space(root,
+ &fs_info->global_block_rsv);
+ if (ret)
+ goto out;
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
+
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+out:
+ iput(inode);
+ return ret;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
+ * this function scans fs tree to find blocks reference the data extent
+ */
+static int find_data_references(struct reloc_control *rc,
+ struct btrfs_key *extent_key,
+ struct extent_buffer *leaf,
+ struct btrfs_extent_data_ref *ref,
+ struct rb_root *blocks)
+{
+ struct btrfs_path *path;
+ struct tree_block *block;
+ struct btrfs_root *root;
+ struct btrfs_file_extent_item *fi;
+ struct rb_node *rb_node;
+ struct btrfs_key key;
+ u64 ref_root;
+ u64 ref_objectid;
+ u64 ref_offset;
+ u32 ref_count;
+ u32 nritems;
+ int err = 0;
+ int added = 0;
+ int counted;
+ int ret;
+
+ ref_root = btrfs_extent_data_ref_root(leaf, ref);
+ ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
+ ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
+ ref_count = btrfs_extent_data_ref_count(leaf, ref);
+
+ /*
+ * This is an extent belonging to the free space cache, lets just delete
+ * it and redo the search.
+ */
+ if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
+ ret = delete_block_group_cache(rc->extent_root->fs_info,
+ rc->block_group,
+ NULL, ref_objectid);
+ if (ret != -ENOENT)
+ return ret;
+ ret = 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 1;
+
+ root = read_fs_root(rc->extent_root->fs_info, ref_root);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
+
+ key.objectid = ref_objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ if (ref_offset > ((u64)-1 << 32))
+ key.offset = 0;
+ else
+ key.offset = ref_offset;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ /*
+ * the references in tree blocks that use full backrefs
+ * are not counted in
+ */
+ if (block_use_full_backref(rc, leaf))
+ counted = 0;
+ else
+ counted = 1;
+ rb_node = tree_search(blocks, leaf->start);
+ if (rb_node) {
+ if (counted)
+ added = 1;
+ else
+ path->slots[0] = nritems;
+ }
+
+ while (ref_count > 0) {
+ while (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (WARN_ON(ret > 0))
+ goto out;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ added = 0;
+
+ if (block_use_full_backref(rc, leaf))
+ counted = 0;
+ else
+ counted = 1;
+ rb_node = tree_search(blocks, leaf->start);
+ if (rb_node) {
+ if (counted)
+ added = 1;
+ else
+ path->slots[0] = nritems;
+ }
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (WARN_ON(key.objectid != ref_objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY))
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ goto next;
+
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+ extent_key->objectid)
+ goto next;
+
+ key.offset -= btrfs_file_extent_offset(leaf, fi);
+ if (key.offset != ref_offset)
+ goto next;
+
+ if (counted)
+ ref_count--;
+ if (added)
+ goto next;
+
+ if (!tree_block_processed(leaf->start, rc)) {
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ err = -ENOMEM;
+ break;
+ }
+ block->bytenr = leaf->start;
+ btrfs_item_key_to_cpu(leaf, &block->key, 0);
+ block->level = 0;
+ block->key_ready = 1;
+ rb_node = tree_insert(blocks, block->bytenr,
+ &block->rb_node);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST,
+ block->bytenr);
+ }
+ if (counted)
+ added = 1;
+ else
+ path->slots[0] = nritems;
+next:
+ path->slots[0]++;
+
+ }
+out:
+ btrfs_free_path(path);
+ return err;
+}
+
+/*
+ * helper to find all tree blocks that reference a given data extent
+ */
+static noinline_for_stack
+int add_data_references(struct reloc_control *rc,
+ struct btrfs_key *extent_key,
+ struct btrfs_path *path,
+ struct rb_root *blocks)
+{
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_extent_data_ref *dref;
+ struct btrfs_extent_inline_ref *iref;
+ unsigned long ptr;
+ unsigned long end;
+ u32 blocksize = rc->extent_root->nodesize;
+ int ret = 0;
+ int err = 0;
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+ end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
+ ptr = end;
+ else
+#endif
+ ptr += sizeof(struct btrfs_extent_item);
+
+ while (ptr < end) {
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ key.type = btrfs_extent_inline_ref_type(eb, iref);
+ if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+ key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+ ret = __add_tree_block(rc, key.offset, blocksize,
+ blocks);
+ } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ ret = find_data_references(rc, extent_key,
+ eb, dref, blocks);
+ } else {
+ BUG();
+ }
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ ptr += btrfs_extent_inline_ref_size(key.type);
+ }
+ WARN_ON(ptr > end);
+
+ while (1) {
+ cond_resched();
+ eb = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(rc->extent_root, path);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ if (ret > 0)
+ break;
+ eb = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+ if (key.objectid != extent_key->objectid)
+ break;
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
+ key.type == BTRFS_EXTENT_REF_V0_KEY) {
+#else
+ BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+#endif
+ ret = __add_tree_block(rc, key.offset, blocksize,
+ blocks);
+ } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+ dref = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_extent_data_ref);
+ ret = find_data_references(rc, extent_key,
+ eb, dref, blocks);
+ } else {
+ ret = 0;
+ }
+ if (ret) {
+ err = ret;
+ break;
+ }
+ path->slots[0]++;
+ }
+out:
+ btrfs_release_path(path);
+ if (err)
+ free_block_list(blocks);
+ return err;
+}
+
+/*
+ * helper to find next unprocessed extent
+ */
+static noinline_for_stack
+int find_next_extent(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc, struct btrfs_path *path,
+ struct btrfs_key *extent_key)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u64 start, end, last;
+ int ret;
+
+ last = rc->block_group->key.objectid + rc->block_group->key.offset;
+ while (1) {
+ cond_resched();
+ if (rc->search_start >= last) {
+ ret = 1;
+ break;
+ }
+
+ key.objectid = rc->search_start;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ break;
+next:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(rc->extent_root, path);
+ if (ret != 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid >= last) {
+ ret = 1;
+ break;
+ }
+
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.objectid + key.offset <= rc->search_start) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ if (key.type == BTRFS_METADATA_ITEM_KEY &&
+ key.objectid + rc->extent_root->nodesize <=
+ rc->search_start) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ ret = find_first_extent_bit(&rc->processed_blocks,
+ key.objectid, &start, &end,
+ EXTENT_DIRTY, NULL);
+
+ if (ret == 0 && start <= key.objectid) {
+ btrfs_release_path(path);
+ rc->search_start = end + 1;
+ } else {
+ if (key.type == BTRFS_EXTENT_ITEM_KEY)
+ rc->search_start = key.objectid + key.offset;
+ else
+ rc->search_start = key.objectid +
+ rc->extent_root->nodesize;
+ memcpy(extent_key, &key, sizeof(key));
+ return 0;
+ }
+ }
+ btrfs_release_path(path);
+ return ret;
+}
+
+static void set_reloc_control(struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+
+ mutex_lock(&fs_info->reloc_mutex);
+ fs_info->reloc_ctl = rc;
+ mutex_unlock(&fs_info->reloc_mutex);
+}
+
+static void unset_reloc_control(struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+
+ mutex_lock(&fs_info->reloc_mutex);
+ fs_info->reloc_ctl = NULL;
+ mutex_unlock(&fs_info->reloc_mutex);
+}
+
+static int check_extent_flags(u64 flags)
+{
+ if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+ (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+ return 1;
+ if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
+ !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+ return 1;
+ if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+ (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+ return 1;
+ return 0;
+}
+
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+ struct btrfs_trans_handle *trans;
+
+ rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+ BTRFS_BLOCK_RSV_TEMP);
+ if (!rc->block_rsv)
+ return -ENOMEM;
+
+ memset(&rc->cluster, 0, sizeof(rc->cluster));
+ rc->search_start = rc->block_group->key.objectid;
+ rc->extents_found = 0;
+ rc->nodes_relocated = 0;
+ rc->merging_rsv_size = 0;
+ rc->reserved_bytes = 0;
+ rc->block_rsv->size = rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+
+ rc->create_reloc_tree = 1;
+ set_reloc_control(rc);
+
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans)) {
+ unset_reloc_control(rc);
+ /*
+ * extent tree is not a ref_cow tree and has no reloc_root to
+ * cleanup. And callers are responsible to free the above
+ * block rsv.
+ */
+ return PTR_ERR(trans);
+ }
+ btrfs_commit_transaction(trans, rc->extent_root);
+ return 0;
+}
+
+static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+{
+ struct rb_root blocks = RB_ROOT;
+ struct btrfs_key key;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ u64 flags;
+ u32 item_size;
+ int ret;
+ int err = 0;
+ int progress = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 1;
+
+ ret = prepare_to_relocate(rc);
+ if (ret) {
+ err = ret;
+ goto out_free;
+ }
+
+ while (1) {
+ rc->reserved_bytes = 0;
+ ret = btrfs_block_rsv_refill(rc->extent_root,
+ rc->block_rsv, rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret) {
+ err = ret;
+ break;
+ }
+ progress++;
+ trans = btrfs_start_transaction(rc->extent_root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+restart:
+ if (update_backref_cache(trans, &rc->backref_cache)) {
+ btrfs_end_transaction(trans, rc->extent_root);
+ continue;
+ }
+
+ ret = find_next_extent(trans, rc, path, &key);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+
+ rc->extents_found++;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ if (item_size >= sizeof(*ei)) {
+ flags = btrfs_extent_flags(path->nodes[0], ei);
+ ret = check_extent_flags(flags);
+ BUG_ON(ret);
+
+ } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ u64 ref_owner;
+ int path_change = 0;
+
+ BUG_ON(item_size !=
+ sizeof(struct btrfs_extent_item_v0));
+ ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
+ &path_change);
+ if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
+ flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+ else
+ flags = BTRFS_EXTENT_FLAG_DATA;
+
+ if (path_change) {
+ btrfs_release_path(path);
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ ret = btrfs_search_slot(NULL, rc->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ BUG_ON(ret > 0);
+ }
+#else
+ BUG();
+#endif
+ }
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ ret = add_tree_block(rc, &key, path, &blocks);
+ } else if (rc->stage == UPDATE_DATA_PTRS &&
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ ret = add_data_references(rc, &key, path, &blocks);
+ } else {
+ btrfs_release_path(path);
+ ret = 0;
+ }
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ if (!RB_EMPTY_ROOT(&blocks)) {
+ ret = relocate_tree_blocks(trans, rc, &blocks);
+ if (ret < 0) {
+ /*
+ * if we fail to relocate tree blocks, force to update
+ * backref cache when committing transaction.
+ */
+ rc->backref_cache.last_trans = trans->transid - 1;
+
+ if (ret != -EAGAIN) {
+ err = ret;
+ break;
+ }
+ rc->extents_found--;
+ rc->search_start = key.objectid;
+ }
+ }
+
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_btree_balance_dirty(rc->extent_root);
+ trans = NULL;
+
+ if (rc->stage == MOVE_DATA_EXTENTS &&
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ rc->found_file_extent = 1;
+ ret = relocate_data_extent(rc->data_inode,
+ &key, &rc->cluster);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+ }
+ }
+ if (trans && progress && err == -ENOSPC) {
+ ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
+ rc->block_group->flags);
+ if (ret == 0) {
+ err = 0;
+ progress = 0;
+ goto restart;
+ }
+ }
+
+ btrfs_release_path(path);
+ clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+ GFP_NOFS);
+
+ if (trans) {
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_btree_balance_dirty(rc->extent_root);
+ }
+
+ if (!err) {
+ ret = relocate_file_extent_cluster(rc->data_inode,
+ &rc->cluster);
+ if (ret < 0)
+ err = ret;
+ }
+
+ rc->create_reloc_tree = 0;
+ set_reloc_control(rc);
+
+ backref_cache_cleanup(&rc->backref_cache);
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
+
+ err = prepare_to_merge(rc, err);
+
+ merge_reloc_roots(rc);
+
+ rc->merge_reloc_tree = 0;
+ unset_reloc_control(rc);
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
+
+ /* get rid of pinned extents */
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans))
+ err = PTR_ERR(trans);
+ else
+ btrfs_commit_transaction(trans, rc->extent_root);
+out_free:
+ btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
+ btrfs_free_path(path);
+ return err;
+}
+
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_inode_item *item;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
+ memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+ btrfs_set_inode_generation(leaf, item, 1);
+ btrfs_set_inode_size(leaf, item, 0);
+ btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper to create inode for data relocation.
+ * the inode is in data relocation tree and its link count is 0
+ */
+static noinline_for_stack
+struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *group)
+{
+ struct inode *inode = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+ int err = 0;
+
+ root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
+ if (IS_ERR(root))
+ return ERR_CAST(root);
+
+ trans = btrfs_start_transaction(root, 6);
+ if (IS_ERR(trans))
+ return ERR_CAST(trans);
+
+ err = btrfs_find_free_objectid(root, &objectid);
+ if (err)
+ goto out;
+
+ err = __insert_orphan_inode(trans, root, objectid);
+ BUG_ON(err);
+
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
+ BTRFS_I(inode)->index_cnt = group->key.objectid;
+
+ err = btrfs_orphan_add(trans, inode);
+out:
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+ if (err) {
+ if (inode)
+ iput(inode);
+ inode = ERR_PTR(err);
+ }
+ return inode;
+}
+
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
+{
+ struct reloc_control *rc;
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc)
+ return NULL;
+
+ INIT_LIST_HEAD(&rc->reloc_roots);
+ backref_cache_init(&rc->backref_cache);
+ mapping_tree_init(&rc->reloc_root_tree);
+ extent_io_tree_init(&rc->processed_blocks,
+ fs_info->btree_inode->i_mapping);
+ return rc;
+}
+
+/*
+ * function to relocate all extents in a block group.
+ */
+int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
+{
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ struct reloc_control *rc;
+ struct inode *inode;
+ struct btrfs_path *path;
+ int ret;
+ int rw = 0;
+ int err = 0;
+
+ rc = alloc_reloc_control(fs_info);
+ if (!rc)
+ return -ENOMEM;
+
+ rc->extent_root = extent_root;
+
+ rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
+ BUG_ON(!rc->block_group);
+
+ if (!rc->block_group->ro) {
+ ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ rw = 1;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
+ path);
+ btrfs_free_path(path);
+
+ if (!IS_ERR(inode))
+ ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
+ else
+ ret = PTR_ERR(inode);
+
+ if (ret && ret != -ENOENT) {
+ err = ret;
+ goto out;
+ }
+
+ rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+ if (IS_ERR(rc->data_inode)) {
+ err = PTR_ERR(rc->data_inode);
+ rc->data_inode = NULL;
+ goto out;
+ }
+
+ btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
+ rc->block_group->key.objectid, rc->block_group->flags);
+
+ ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ btrfs_wait_ordered_roots(fs_info, -1);
+
+ while (1) {
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = relocate_block_group(rc);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ if (rc->extents_found == 0)
+ break;
+
+ btrfs_info(extent_root->fs_info, "found %llu extents",
+ rc->extents_found);
+
+ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
+ ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+ (u64)-1);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ invalidate_mapping_pages(rc->data_inode->i_mapping,
+ 0, -1);
+ rc->stage = UPDATE_DATA_PTRS;
+ }
+ }
+
+ WARN_ON(rc->block_group->pinned > 0);
+ WARN_ON(rc->block_group->reserved > 0);
+ WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
+out:
+ if (err && rw)
+ btrfs_set_block_group_rw(extent_root, rc->block_group);
+ iput(rc->data_inode);
+ btrfs_put_block_group(rc->block_group);
+ kfree(rc);
+ return err;
+}
+
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret, err;
+
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ memset(&root->root_item.drop_progress, 0,
+ sizeof(root->root_item.drop_progress));
+ root->root_item.drop_level = 0;
+ btrfs_set_root_refs(&root->root_item, 0);
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+
+ err = btrfs_end_transaction(trans, root->fs_info->tree_root);
+ if (err)
+ return err;
+ return ret;
+}
+
+/*
+ * recover relocation interrupted by system crash.
+ *
+ * this function resumes merging reloc trees with corresponding fs trees.
+ * this is important for keeping the sharing of tree blocks
+ */
+int btrfs_recover_relocation(struct btrfs_root *root)
+{
+ LIST_HEAD(reloc_roots);
+ struct btrfs_key key;
+ struct btrfs_root *fs_root;
+ struct btrfs_root *reloc_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct reloc_control *rc = NULL;
+ struct btrfs_trans_handle *trans;
+ int ret;
+ int err = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = -1;
+
+ key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
+ path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(path);
+
+ if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
+ key.type != BTRFS_ROOT_ITEM_KEY)
+ break;
+
+ reloc_root = btrfs_read_fs_root(root, &key);
+ if (IS_ERR(reloc_root)) {
+ err = PTR_ERR(reloc_root);
+ goto out;
+ }
+
+ list_add(&reloc_root->root_list, &reloc_roots);
+
+ if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+ fs_root = read_fs_root(root->fs_info,
+ reloc_root->root_key.offset);
+ if (IS_ERR(fs_root)) {
+ ret = PTR_ERR(fs_root);
+ if (ret != -ENOENT) {
+ err = ret;
+ goto out;
+ }
+ ret = mark_garbage_root(reloc_root);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ }
+ }
+
+ if (key.offset == 0)
+ break;
+
+ key.offset--;
+ }
+ btrfs_release_path(path);
+
+ if (list_empty(&reloc_roots))
+ goto out;
+
+ rc = alloc_reloc_control(root->fs_info);
+ if (!rc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ rc->extent_root = root->fs_info->extent_root;
+
+ set_reloc_control(rc);
+
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans)) {
+ unset_reloc_control(rc);
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+
+ rc->merge_reloc_tree = 1;
+
+ while (!list_empty(&reloc_roots)) {
+ reloc_root = list_entry(reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&reloc_root->root_list);
+
+ if (btrfs_root_refs(&reloc_root->root_item) == 0) {
+ list_add_tail(&reloc_root->root_list,
+ &rc->reloc_roots);
+ continue;
+ }
+
+ fs_root = read_fs_root(root->fs_info,
+ reloc_root->root_key.offset);
+ if (IS_ERR(fs_root)) {
+ err = PTR_ERR(fs_root);
+ goto out_free;
+ }
+
+ err = __add_reloc_root(reloc_root);
+ BUG_ON(err < 0); /* -ENOMEM or logic error */
+ fs_root->reloc_root = reloc_root;
+ }
+
+ err = btrfs_commit_transaction(trans, rc->extent_root);
+ if (err)
+ goto out_free;
+
+ merge_reloc_roots(rc);
+
+ unset_reloc_control(rc);
+
+ trans = btrfs_join_transaction(rc->extent_root);
+ if (IS_ERR(trans))
+ err = PTR_ERR(trans);
+ else
+ err = btrfs_commit_transaction(trans, rc->extent_root);
+out_free:
+ kfree(rc);
+out:
+ if (!list_empty(&reloc_roots))
+ free_reloc_roots(&reloc_roots);
+
+ btrfs_free_path(path);
+
+ if (err == 0) {
+ /* cleanup orphan inode in data relocation tree */
+ fs_root = read_fs_root(root->fs_info,
+ BTRFS_DATA_RELOC_TREE_OBJECTID);
+ if (IS_ERR(fs_root))
+ err = PTR_ERR(fs_root);
+ else
+ err = btrfs_orphan_cleanup(fs_root);
+ }
+ return err;
+}
+
+/*
+ * helper to add ordered checksum for data relocation.
+ *
+ * cloning checksum properly handles the nodatasum extents.
+ * it also saves CPU time to re-calculate the checksum.
+ */
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+{
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+ u64 disk_bytenr;
+ u64 new_bytenr;
+ LIST_HEAD(list);
+
+ ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+ BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+
+ disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+ ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
+ disk_bytenr + len - 1, &list, 0);
+ if (ret)
+ goto out;
+
+ while (!list_empty(&list)) {
+ sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_del_init(&sums->list);
+
+ /*
+ * We need to offset the new_bytenr based on where the csum is.
+ * We need to do this because we will read in entire prealloc
+ * extents but we may have written to say the middle of the
+ * prealloc extent, so we need to make sure the csum goes with
+ * the right disk offset.
+ *
+ * We can do this because the data reloc inode refers strictly
+ * to the on disk bytes, so we don't have to worry about
+ * disk_len vs real len like with real inodes since it's all
+ * disk length.
+ */
+ new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
+ sums->bytenr = new_bytenr;
+
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ }
+out:
+ btrfs_put_ordered_extent(ordered);
+ return ret;
+}
+
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow)
+{
+ struct reloc_control *rc;
+ struct backref_node *node;
+ int first_cow = 0;
+ int level;
+ int ret = 0;
+
+ rc = root->fs_info->reloc_ctl;
+ if (!rc)
+ return 0;
+
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
+ root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (buf == root->node)
+ __update_reloc_root(root, cow->start);
+ }
+
+ level = btrfs_header_level(buf);
+ if (btrfs_header_generation(buf) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ first_cow = 1;
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ rc->create_reloc_tree) {
+ WARN_ON(!first_cow && level == 0);
+
+ node = rc->backref_cache.path[level];
+ BUG_ON(node->bytenr != buf->start &&
+ node->new_bytenr != buf->start);
+
+ drop_node_buffer(node);
+ extent_buffer_get(cow);
+ node->eb = cow;
+ node->new_bytenr = cow->start;
+
+ if (!node->pending) {
+ list_move_tail(&node->list,
+ &rc->backref_cache.pending[level]);
+ node->pending = 1;
+ }
+
+ if (first_cow)
+ __mark_block_processed(rc, node);
+
+ if (first_cow && level > 0)
+ rc->nodes_relocated += buf->len;
+ }
+
+ if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
+ ret = replace_file_extents(trans, rc, root, cow);
+ return ret;
+}
+
+/*
+ * called before creating snapshot. it calculates metadata reservation
+ * requried for relocating tree blocks in the snapshot
+ */
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve)
+{
+ struct btrfs_root *root;
+ struct reloc_control *rc;
+
+ root = pending->root;
+ if (!root->reloc_root)
+ return;
+
+ rc = root->fs_info->reloc_ctl;
+ if (!rc->merge_reloc_tree)
+ return;
+
+ root = root->reloc_root;
+ BUG_ON(btrfs_root_refs(&root->root_item) == 0);
+ /*
+ * relocation is in the stage of merging trees. the space
+ * used by merging a reloc tree is twice the size of
+ * relocated tree nodes in the worst case. half for cowing
+ * the reloc tree, half for cowing the fs tree. the space
+ * used by cowing the reloc tree will be freed after the
+ * tree is dropped. if we create snapshot, cowing the fs
+ * tree may use more space than it frees. so we need
+ * reserve extra space.
+ */
+ *bytes_to_reserve += rc->nodes_relocated;
+}
+
+/*
+ * called after snapshot is created. migrate block reservation
+ * and create reloc root for the newly created snapshot
+ */
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root *new_root;
+ struct reloc_control *rc;
+ int ret;
+
+ if (!root->reloc_root)
+ return 0;
+
+ rc = root->fs_info->reloc_ctl;
+ rc->merging_rsv_size += rc->nodes_relocated;
+
+ if (rc->merge_reloc_tree) {
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ rc->block_rsv,
+ rc->nodes_relocated);
+ if (ret)
+ return ret;
+ }
+
+ new_root = pending->snap;
+ reloc_root = create_reloc_root(trans, root->reloc_root,
+ new_root->root_key.objectid);
+ if (IS_ERR(reloc_root))
+ return PTR_ERR(reloc_root);
+
+ ret = __add_reloc_root(reloc_root);
+ BUG_ON(ret < 0);
+ new_root->reloc_root = reloc_root;
+
+ if (rc->create_reloc_tree)
+ ret = clone_backref_node(trans, rc, root, reloc_root);
+ return ret;
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000..360a728a6
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/err.h>
+#include <linux/uuid.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+/*
+ * Read a root item from the tree. In case we detect a root item smaller then
+ * sizeof(root_item), we know it's an old version of the root structure and
+ * initialize all new fields to zero. The same happens if we detect mismatching
+ * generation numbers as then we know the root was once mounted with an older
+ * kernel that was not aware of the root item structure change.
+ */
+static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+ struct btrfs_root_item *item)
+{
+ uuid_le uuid;
+ int len;
+ int need_reset = 0;
+
+ len = btrfs_item_size_nr(eb, slot);
+ read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
+ min_t(int, len, (int)sizeof(*item)));
+ if (len < sizeof(*item))
+ need_reset = 1;
+ if (!need_reset && btrfs_root_generation(item)
+ != btrfs_root_generation_v2(item)) {
+ if (btrfs_root_generation_v2(item) != 0) {
+ printk(KERN_WARNING "BTRFS: mismatching "
+ "generation and generation_v2 "
+ "found in root item. This root "
+ "was probably mounted with an "
+ "older kernel. Resetting all "
+ "new fields.\n");
+ }
+ need_reset = 1;
+ }
+ if (need_reset) {
+ memset(&item->generation_v2, 0,
+ sizeof(*item) - offsetof(struct btrfs_root_item,
+ generation_v2));
+
+ uuid_le_gen(&uuid);
+ memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
+ }
+}
+
+/*
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the reak key of the tree we look for
+ *
+ * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
+ * of the search key, just lookup the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
+ */
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+ struct btrfs_path *path, struct btrfs_root_item *root_item,
+ struct btrfs_key *root_key)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *l;
+ int ret;
+ int slot;
+
+ ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (search_key->offset != -1ULL) { /* the search key is exact */
+ if (ret > 0)
+ goto out;
+ } else {
+ BUG_ON(ret == 0); /* Logical error */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ ret = 0;
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ if (found_key.objectid != search_key->objectid ||
+ found_key.type != BTRFS_ROOT_ITEM_KEY) {
+ ret = 1;
+ goto out;
+ }
+
+ if (root_item)
+ btrfs_read_root_item(l, slot, root_item);
+ if (root_key)
+ memcpy(root_key, &found_key, sizeof(found_key));
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+void btrfs_set_root_node(struct btrfs_root_item *item,
+ struct extent_buffer *node)
+{
+ btrfs_set_root_bytenr(item, node->start);
+ btrfs_set_root_level(item, btrfs_header_level(node));
+ btrfs_set_root_generation(item, btrfs_header_generation(node));
+}
+
+/*
+ * copy the data in 'item' into the btree
+ */
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *l;
+ int ret;
+ int slot;
+ unsigned long ptr;
+ int old_len;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ if (ret != 0) {
+ btrfs_print_leaf(root, path->nodes[0]);
+ btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu",
+ key->objectid, key->type, key->offset);
+ BUG_ON(1);
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr_offset(l, slot);
+ old_len = btrfs_item_size_nr(l, slot);
+
+ /*
+ * If this is the first time we update the root item which originated
+ * from an older kernel, we need to enlarge the item size to make room
+ * for the added fields.
+ */
+ if (old_len < sizeof(*item)) {
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, key, path,
+ -1, 1);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path,
+ key, sizeof(*item));
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr_offset(l, slot);
+ }
+
+ /*
+ * Update generation_v2 so at the next mount we know the new root
+ * fields are valid.
+ */
+ btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+
+ write_extent_buffer(l, item, ptr, sizeof(*item));
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_root_item *item)
+{
+ /*
+ * Make sure generation v1 and v2 match. See update_root for details.
+ */
+ btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+ return btrfs_insert_item(trans, root, key, item, sizeof(*item));
+}
+
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key root_key;
+ struct btrfs_root *root;
+ int err = 0;
+ int ret;
+ bool can_recover = true;
+
+ if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+ can_recover = false;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = 0;
+
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(path);
+
+ if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ root_key.objectid = key.offset;
+ key.offset++;
+
+ root = btrfs_read_fs_root(tree_root, &root_key);
+ err = PTR_ERR_OR_ZERO(root);
+ if (err && err != -ENOENT) {
+ break;
+ } else if (err == -ENOENT) {
+ struct btrfs_trans_handle *trans;
+
+ btrfs_release_path(path);
+
+ trans = btrfs_join_transaction(tree_root);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to start trans to delete "
+ "orphan item");
+ break;
+ }
+ err = btrfs_del_orphan_item(trans, tree_root,
+ root_key.objectid);
+ btrfs_end_transaction(trans, tree_root);
+ if (err) {
+ btrfs_error(tree_root->fs_info, err,
+ "Failed to delete root orphan "
+ "item");
+ break;
+ }
+ continue;
+ }
+
+ err = btrfs_init_fs_root(root);
+ if (err) {
+ btrfs_free_fs_root(root);
+ break;
+ }
+
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
+
+ err = btrfs_insert_fs_root(root->fs_info, root);
+ if (err) {
+ BUG_ON(err == -EEXIST);
+ btrfs_free_fs_root(root);
+ break;
+ }
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ btrfs_add_dead_root(root);
+ }
+
+ btrfs_free_path(path);
+ return err;
+}
+
+/* drop the root item for 'key' from 'root' */
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = btrfs_search_slot(trans, root, key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ BUG_ON(ret != 0);
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+ const char *name, int name_len)
+
+{
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+ int err = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = ref_id;
+again:
+ ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+ BUG_ON(ret < 0);
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+
+ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+ ptr = (unsigned long)(ref + 1);
+ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+ *sequence = btrfs_root_ref_sequence(leaf, ref);
+
+ ret = btrfs_del_item(trans, tree_root, path);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ } else
+ err = -ENOENT;
+
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
+
+out:
+ btrfs_free_path(path);
+ return err;
+}
+
+/*
+ * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
+ * or BTRFS_ROOT_BACKREF_KEY.
+ *
+ * The dirid, sequence, name and name_len refer to the directory entry
+ * that is referencing the root.
+ *
+ * For a forward ref, the root_id is the id of the tree referencing
+ * the root and ref_id is the id of the subvol or snapshot.
+ *
+ * For a back ref the root_id is the id of the subvol or snapshot and
+ * ref_id is the id of the tree referencing it.
+ *
+ * Will return 0, -ENOMEM, or anything from the CoW path
+ */
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+ const char *name, int name_len)
+{
+ struct btrfs_key key;
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = ref_id;
+again:
+ ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
+ sizeof(*ref) + name_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ btrfs_set_root_ref_dirid(leaf, ref, dirid);
+ btrfs_set_root_ref_sequence(leaf, ref, sequence);
+ btrfs_set_root_ref_name_len(leaf, ref, name_len);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(leaf, name, ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * Old btrfs forgets to init root_item->flags and root_item->byte_limit
+ * for subvolumes. To work around this problem, we steal a bit from
+ * root_item->inode_item->flags, and use it to indicate if those fields
+ * have been properly initialized.
+ */
+void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
+{
+ u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode);
+
+ if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
+ inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
+ btrfs_set_stack_inode_flags(&root_item->inode, inode_flags);
+ btrfs_set_root_flags(root_item, 0);
+ btrfs_set_root_limit(root_item, 0);
+ }
+}
+
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root_item *item = &root->root_item;
+ struct timespec ct = CURRENT_TIME;
+
+ spin_lock(&root->root_item_lock);
+ btrfs_set_root_ctransid(item, trans->transid);
+ btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
+ btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
+ spin_unlock(&root->root_item_lock);
+}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
new file mode 100644
index 000000000..ab5811545
--- /dev/null
+++ b/fs/btrfs/scrub.c
@@ -0,0 +1,4228 @@
+/*
+ * Copyright (C) 2011, 2012 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/ratelimit.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "ordered-data.h"
+#include "transaction.h"
+#include "backref.h"
+#include "extent_io.h"
+#include "dev-replace.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "raid56.h"
+
+/*
+ * This is only the first step towards a full-features scrub. It reads all
+ * extent and super block and verifies the checksums. In case a bad checksum
+ * is found or the extent cannot be read, good data will be written back if
+ * any can be found.
+ *
+ * Future enhancements:
+ * - In case an unrepairable extent is encountered, track which files are
+ * affected and report them
+ * - track and record media errors, throw out bad devices
+ * - add a mode to also read unallocated space
+ */
+
+struct scrub_block;
+struct scrub_ctx;
+
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
+
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
+#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
+
+struct scrub_recover {
+ atomic_t refs;
+ struct btrfs_bio *bbio;
+ u64 map_length;
+};
+
+struct scrub_page {
+ struct scrub_block *sblock;
+ struct page *page;
+ struct btrfs_device *dev;
+ struct list_head list;
+ u64 flags; /* extent flags */
+ u64 generation;
+ u64 logical;
+ u64 physical;
+ u64 physical_for_dev_replace;
+ atomic_t refs;
+ struct {
+ unsigned int mirror_num:8;
+ unsigned int have_csum:1;
+ unsigned int io_error:1;
+ };
+ u8 csum[BTRFS_CSUM_SIZE];
+
+ struct scrub_recover *recover;
+};
+
+struct scrub_bio {
+ int index;
+ struct scrub_ctx *sctx;
+ struct btrfs_device *dev;
+ struct bio *bio;
+ int err;
+ u64 logical;
+ u64 physical;
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+ struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+ struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
+ int page_count;
+ int next_free;
+ struct btrfs_work work;
+};
+
+struct scrub_block {
+ struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+ int page_count;
+ atomic_t outstanding_pages;
+ atomic_t refs; /* free mem on transition to zero */
+ struct scrub_ctx *sctx;
+ struct scrub_parity *sparity;
+ struct {
+ unsigned int header_error:1;
+ unsigned int checksum_error:1;
+ unsigned int no_io_error_seen:1;
+ unsigned int generation_error:1; /* also sets header_error */
+
+ /* The following is for the data used to check parity */
+ /* It is for the data with checksum */
+ unsigned int data_corrected:1;
+ };
+};
+
+/* Used for the chunks with parity stripe such RAID5/6 */
+struct scrub_parity {
+ struct scrub_ctx *sctx;
+
+ struct btrfs_device *scrub_dev;
+
+ u64 logic_start;
+
+ u64 logic_end;
+
+ int nsectors;
+
+ int stripe_len;
+
+ atomic_t refs;
+
+ struct list_head spages;
+
+ /* Work of parity check and repair */
+ struct btrfs_work work;
+
+ /* Mark the parity blocks which have data */
+ unsigned long *dbitmap;
+
+ /*
+ * Mark the parity blocks which have data, but errors happen when
+ * read data or check data
+ */
+ unsigned long *ebitmap;
+
+ unsigned long bitmap[0];
+};
+
+struct scrub_wr_ctx {
+ struct scrub_bio *wr_curr_bio;
+ struct btrfs_device *tgtdev;
+ int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+ atomic_t flush_all_writes;
+ struct mutex wr_lock;
+};
+
+struct scrub_ctx {
+ struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
+ struct btrfs_root *dev_root;
+ int first_free;
+ int curr;
+ atomic_t bios_in_flight;
+ atomic_t workers_pending;
+ spinlock_t list_lock;
+ wait_queue_head_t list_wait;
+ u16 csum_size;
+ struct list_head csum_list;
+ atomic_t cancel_req;
+ int readonly;
+ int pages_per_rd_bio;
+ u32 sectorsize;
+ u32 nodesize;
+
+ int is_dev_replace;
+ struct scrub_wr_ctx wr_ctx;
+
+ /*
+ * statistics
+ */
+ struct btrfs_scrub_progress stat;
+ spinlock_t stat_lock;
+
+ /*
+ * Use a ref counter to avoid use-after-free issues. Scrub workers
+ * decrement bios_in_flight and workers_pending and then do a wakeup
+ * on the list_wait wait queue. We must ensure the main scrub task
+ * doesn't free the scrub context before or while the workers are
+ * doing the wakeup() call.
+ */
+ atomic_t refs;
+};
+
+struct scrub_fixup_nodatasum {
+ struct scrub_ctx *sctx;
+ struct btrfs_device *dev;
+ u64 logical;
+ struct btrfs_root *root;
+ struct btrfs_work work;
+ int mirror_num;
+};
+
+struct scrub_nocow_inode {
+ u64 inum;
+ u64 offset;
+ u64 root;
+ struct list_head list;
+};
+
+struct scrub_copy_nocow_ctx {
+ struct scrub_ctx *sctx;
+ u64 logical;
+ u64 len;
+ int mirror_num;
+ u64 physical_for_dev_replace;
+ struct list_head inodes;
+ struct btrfs_work work;
+};
+
+struct scrub_warning {
+ struct btrfs_path *path;
+ u64 extent_item_size;
+ const char *errstr;
+ sector_t sector;
+ u64 logical;
+ struct btrfs_device *dev;
+};
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
+ struct scrub_block *sblocks_for_recheck);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size, int retry_failed_mirror);
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int is_metadata, int have_csum,
+ const u8 *csum, u64 generation,
+ u16 csum_size);
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good);
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+ int page_num);
+static int scrub_checksum_data(struct scrub_block *sblock);
+static int scrub_checksum_tree_block(struct scrub_block *sblock);
+static int scrub_checksum_super(struct scrub_block *sblock);
+static void scrub_block_get(struct scrub_block *sblock);
+static void scrub_block_put(struct scrub_block *sblock);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
+static void scrub_parity_get(struct scrub_parity *sparity);
+static void scrub_parity_put(struct scrub_parity *sparity);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage);
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum, int force,
+ u64 physical_for_dev_replace);
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u64 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+ struct scrub_wr_ctx *wr_ctx,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev,
+ int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+ u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+ struct scrub_copy_nocow_ctx *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+ atomic_inc(&sctx->refs);
+ atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+ atomic_dec(&sctx->bios_in_flight);
+ wake_up(&sctx->list_wait);
+ scrub_put_ctx(sctx);
+}
+
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ while (atomic_read(&fs_info->scrub_pause_req)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrub_pause_req) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+}
+
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ mutex_lock(&fs_info->scrub_lock);
+ __scrub_blocked_if_needed(fs_info);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ wake_up(&fs_info->scrub_pause_wait);
+}
+
+/*
+ * used for workers that require transaction commits (i.e., for the
+ * NOCOW case)
+ */
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+ atomic_inc(&sctx->refs);
+ /*
+ * increment scrubs_running to prevent cancel requests from
+ * completing as long as a worker is running. we must also
+ * increment scrubs_paused to prevent deadlocking on pause
+ * requests used for transactions commits (as the worker uses a
+ * transaction context). it is safe to regard the worker
+ * as paused for all matters practical. effectively, we only
+ * avoid cancellation requests from completing.
+ */
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_inc(&fs_info->scrubs_running);
+ atomic_inc(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ /*
+ * check if @scrubs_running=@scrubs_paused condition
+ * inside wait_event() is not an atomic operation.
+ * which means we may inc/dec @scrub_running/paused
+ * at any time. Let's wake up @scrub_pause_wait as
+ * much as we can to let commit transaction blocked less.
+ */
+ wake_up(&fs_info->scrub_pause_wait);
+
+ atomic_inc(&sctx->workers_pending);
+}
+
+/* used for workers that require transaction commits */
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+ /*
+ * see scrub_pending_trans_workers_inc() why we're pretending
+ * to be paused in the scrub counters
+ */
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_dec(&fs_info->scrubs_running);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ atomic_dec(&sctx->workers_pending);
+ wake_up(&fs_info->scrub_pause_wait);
+ wake_up(&sctx->list_wait);
+ scrub_put_ctx(sctx);
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
+{
+ while (!list_empty(&sctx->csum_list)) {
+ struct btrfs_ordered_sum *sum;
+ sum = list_first_entry(&sctx->csum_list,
+ struct btrfs_ordered_sum, list);
+ list_del(&sum->list);
+ kfree(sum);
+ }
+}
+
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
+{
+ int i;
+
+ if (!sctx)
+ return;
+
+ scrub_free_wr_ctx(&sctx->wr_ctx);
+
+ /* this can happen when scrub is cancelled */
+ if (sctx->curr != -1) {
+ struct scrub_bio *sbio = sctx->bios[sctx->curr];
+
+ for (i = 0; i < sbio->page_count; i++) {
+ WARN_ON(!sbio->pagev[i]->page);
+ scrub_block_put(sbio->pagev[i]->sblock);
+ }
+ bio_put(sbio->bio);
+ }
+
+ for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+ struct scrub_bio *sbio = sctx->bios[i];
+
+ if (!sbio)
+ break;
+ kfree(sbio);
+ }
+
+ scrub_free_csums(sctx);
+ kfree(sctx);
+}
+
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+ if (atomic_dec_and_test(&sctx->refs))
+ scrub_free_ctx(sctx);
+}
+
+static noinline_for_stack
+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
+{
+ struct scrub_ctx *sctx;
+ int i;
+ struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+ int pages_per_rd_bio;
+ int ret;
+
+ /*
+ * the setting of pages_per_rd_bio is correct for scrub but might
+ * be wrong for the dev_replace code where we might read from
+ * different devices in the initial huge bios. However, that
+ * code is able to correctly handle the case when adding a page
+ * to a bio fails.
+ */
+ if (dev->bdev)
+ pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+ bio_get_nr_vecs(dev->bdev));
+ else
+ pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+ sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+ if (!sctx)
+ goto nomem;
+ atomic_set(&sctx->refs, 1);
+ sctx->is_dev_replace = is_dev_replace;
+ sctx->pages_per_rd_bio = pages_per_rd_bio;
+ sctx->curr = -1;
+ sctx->dev_root = dev->dev_root;
+ for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+ struct scrub_bio *sbio;
+
+ sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
+ if (!sbio)
+ goto nomem;
+ sctx->bios[i] = sbio;
+
+ sbio->index = i;
+ sbio->sctx = sctx;
+ sbio->page_count = 0;
+ btrfs_init_work(&sbio->work, btrfs_scrub_helper,
+ scrub_bio_end_io_worker, NULL, NULL);
+
+ if (i != SCRUB_BIOS_PER_SCTX - 1)
+ sctx->bios[i]->next_free = i + 1;
+ else
+ sctx->bios[i]->next_free = -1;
+ }
+ sctx->first_free = 0;
+ sctx->nodesize = dev->dev_root->nodesize;
+ sctx->sectorsize = dev->dev_root->sectorsize;
+ atomic_set(&sctx->bios_in_flight, 0);
+ atomic_set(&sctx->workers_pending, 0);
+ atomic_set(&sctx->cancel_req, 0);
+ sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ INIT_LIST_HEAD(&sctx->csum_list);
+
+ spin_lock_init(&sctx->list_lock);
+ spin_lock_init(&sctx->stat_lock);
+ init_waitqueue_head(&sctx->list_wait);
+
+ ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+ fs_info->dev_replace.tgtdev, is_dev_replace);
+ if (ret) {
+ scrub_free_ctx(sctx);
+ return ERR_PTR(ret);
+ }
+ return sctx;
+
+nomem:
+ scrub_free_ctx(sctx);
+ return ERR_PTR(-ENOMEM);
+}
+
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+ void *warn_ctx)
+{
+ u64 isize;
+ u32 nlink;
+ int ret;
+ int i;
+ struct extent_buffer *eb;
+ struct btrfs_inode_item *inode_item;
+ struct scrub_warning *swarn = warn_ctx;
+ struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
+ struct inode_fs_paths *ipath = NULL;
+ struct btrfs_root *local_root;
+ struct btrfs_key root_key;
+ struct btrfs_key key;
+
+ root_key.objectid = root;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+ local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ if (IS_ERR(local_root)) {
+ ret = PTR_ERR(local_root);
+ goto err;
+ }
+
+ /*
+ * this makes the path point to (inum INODE_ITEM ioff)
+ */
+ key.objectid = inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
+ if (ret) {
+ btrfs_release_path(swarn->path);
+ goto err;
+ }
+
+ eb = swarn->path->nodes[0];
+ inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
+ struct btrfs_inode_item);
+ isize = btrfs_inode_size(eb, inode_item);
+ nlink = btrfs_inode_nlink(eb, inode_item);
+ btrfs_release_path(swarn->path);
+
+ ipath = init_ipath(4096, local_root, swarn->path);
+ if (IS_ERR(ipath)) {
+ ret = PTR_ERR(ipath);
+ ipath = NULL;
+ goto err;
+ }
+ ret = paths_from_inode(inum, ipath);
+
+ if (ret < 0)
+ goto err;
+
+ /*
+ * we deliberately ignore the bit ipath might have been too small to
+ * hold all of the paths here
+ */
+ for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+ printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ "%s, sector %llu, root %llu, inode %llu, offset %llu, "
+ "length %llu, links %u (path: %s)\n", swarn->errstr,
+ swarn->logical, rcu_str_deref(swarn->dev->name),
+ (unsigned long long)swarn->sector, root, inum, offset,
+ min(isize - offset, (u64)PAGE_SIZE), nlink,
+ (char *)(unsigned long)ipath->fspath->val[i]);
+
+ free_ipath(ipath);
+ return 0;
+
+err:
+ printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
+ "resolving failed with ret=%d\n", swarn->errstr,
+ swarn->logical, rcu_str_deref(swarn->dev->name),
+ (unsigned long long)swarn->sector, root, inum, offset, ret);
+
+ free_ipath(ipath);
+ return 0;
+}
+
+static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+{
+ struct btrfs_device *dev;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ struct btrfs_extent_item *ei;
+ struct scrub_warning swarn;
+ unsigned long ptr = 0;
+ u64 extent_item_pos;
+ u64 flags = 0;
+ u64 ref_root;
+ u32 item_size;
+ u8 ref_level;
+ int ret;
+
+ WARN_ON(sblock->page_count < 1);
+ dev = sblock->pagev[0]->dev;
+ fs_info = sblock->sctx->dev_root->fs_info;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return;
+
+ swarn.sector = (sblock->pagev[0]->physical) >> 9;
+ swarn.logical = sblock->pagev[0]->logical;
+ swarn.errstr = errstr;
+ swarn.dev = NULL;
+
+ ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
+ &flags);
+ if (ret < 0)
+ goto out;
+
+ extent_item_pos = swarn.logical - found_key.objectid;
+ swarn.extent_item_size = found_key.offset;
+
+ eb = path->nodes[0];
+ ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+ item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ do {
+ ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
+ item_size, &ref_root,
+ &ref_level);
+ printk_in_rcu(KERN_WARNING
+ "BTRFS: %s at logical %llu on dev %s, "
+ "sector %llu: metadata %s (level %d) in tree "
+ "%llu\n", errstr, swarn.logical,
+ rcu_str_deref(dev->name),
+ (unsigned long long)swarn.sector,
+ ref_level ? "node" : "leaf",
+ ret < 0 ? -1 : ref_level,
+ ret < 0 ? -1 : ref_root);
+ } while (ret != 1);
+ btrfs_release_path(path);
+ } else {
+ btrfs_release_path(path);
+ swarn.path = path;
+ swarn.dev = dev;
+ iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, 1,
+ scrub_print_warning_inode, &swarn);
+ }
+
+out:
+ btrfs_free_path(path);
+}
+
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
+{
+ struct page *page = NULL;
+ unsigned long index;
+ struct scrub_fixup_nodatasum *fixup = fixup_ctx;
+ int ret;
+ int corrected = 0;
+ struct btrfs_key key;
+ struct inode *inode = NULL;
+ struct btrfs_fs_info *fs_info;
+ u64 end = offset + PAGE_SIZE - 1;
+ struct btrfs_root *local_root;
+ int srcu_index;
+
+ key.objectid = root;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ fs_info = fixup->root->fs_info;
+ srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(local_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ return PTR_ERR(local_root);
+ }
+
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.objectid = inum;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ index = offset >> PAGE_CACHE_SHIFT;
+
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (PageUptodate(page)) {
+ if (PageDirty(page)) {
+ /*
+ * we need to write the data to the defect sector. the
+ * data that was in that sector is not in memory,
+ * because the page was modified. we must not write the
+ * modified page to that sector.
+ *
+ * TODO: what could be done here: wait for the delalloc
+ * runner to write out that page (might involve
+ * COW) and see whether the sector is still
+ * referenced afterwards.
+ *
+ * For the meantime, we'll treat this error
+ * incorrectable, although there is a chance that a
+ * later scrub will find the bad sector again and that
+ * there's no dirty page in memory, then.
+ */
+ ret = -EIO;
+ goto out;
+ }
+ ret = repair_io_failure(inode, offset, PAGE_SIZE,
+ fixup->logical, page,
+ offset - page_offset(page),
+ fixup->mirror_num);
+ unlock_page(page);
+ corrected = !ret;
+ } else {
+ /*
+ * we need to get good data first. the general readpage path
+ * will call repair_io_failure for us, we just have to make
+ * sure we read the bad mirror.
+ */
+ ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+ EXTENT_DAMAGED, GFP_NOFS);
+ if (ret) {
+ /* set_extent_bits should give proper error */
+ WARN_ON(ret > 0);
+ if (ret > 0)
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+ btrfs_get_extent,
+ fixup->mirror_num);
+ wait_on_page_locked(page);
+
+ corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
+ end, EXTENT_DAMAGED, 0, NULL);
+ if (!corrected)
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+ EXTENT_DAMAGED, GFP_NOFS);
+ }
+
+out:
+ if (page)
+ put_page(page);
+
+ iput(inode);
+
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0 && corrected) {
+ /*
+ * we only need to call readpage for one of the inodes belonging
+ * to this extent. so make iterate_extent_inodes stop
+ */
+ return 1;
+ }
+
+ return -EIO;
+}
+
+static void scrub_fixup_nodatasum(struct btrfs_work *work)
+{
+ int ret;
+ struct scrub_fixup_nodatasum *fixup;
+ struct scrub_ctx *sctx;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_path *path;
+ int uncorrectable = 0;
+
+ fixup = container_of(work, struct scrub_fixup_nodatasum, work);
+ sctx = fixup->sctx;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.malloc_errors;
+ spin_unlock(&sctx->stat_lock);
+ uncorrectable = 1;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(fixup->root);
+ if (IS_ERR(trans)) {
+ uncorrectable = 1;
+ goto out;
+ }
+
+ /*
+ * the idea is to trigger a regular read through the standard path. we
+ * read a page from the (failed) logical address by specifying the
+ * corresponding copynum of the failed sector. thus, that readpage is
+ * expected to fail.
+ * that is the point where on-the-fly error correction will kick in
+ * (once it's finished) and rewrite the failed sector if a good copy
+ * can be found.
+ */
+ ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
+ path, scrub_fixup_readpage,
+ fixup);
+ if (ret < 0) {
+ uncorrectable = 1;
+ goto out;
+ }
+ WARN_ON(ret != 1);
+
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.corrected_errors;
+ spin_unlock(&sctx->stat_lock);
+
+out:
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans, fixup->root);
+ if (uncorrectable) {
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.uncorrectable_errors;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->fs_info->dev_replace.
+ num_uncorrectable_read_errors);
+ printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
+ "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+ fixup->logical, rcu_str_deref(fixup->dev->name));
+ }
+
+ btrfs_free_path(path);
+ kfree(fixup);
+
+ scrub_pending_trans_workers_dec(sctx);
+}
+
+static inline void scrub_get_recover(struct scrub_recover *recover)
+{
+ atomic_inc(&recover->refs);
+}
+
+static inline void scrub_put_recover(struct scrub_recover *recover)
+{
+ if (atomic_dec_and_test(&recover->refs)) {
+ btrfs_put_bbio(recover->bbio);
+ kfree(recover);
+ }
+}
+
+/*
+ * scrub_handle_errored_block gets called when either verification of the
+ * pages failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all pages in the bio, even though only one
+ * may be bad.
+ * The goal of this function is to repair the errored block by using the
+ * contents of one of the mirrors.
+ */
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
+{
+ struct scrub_ctx *sctx = sblock_to_check->sctx;
+ struct btrfs_device *dev;
+ struct btrfs_fs_info *fs_info;
+ u64 length;
+ u64 logical;
+ u64 generation;
+ unsigned int failed_mirror_index;
+ unsigned int is_metadata;
+ unsigned int have_csum;
+ u8 *csum;
+ struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+ struct scrub_block *sblock_bad;
+ int ret;
+ int mirror_index;
+ int page_num;
+ int success;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ BUG_ON(sblock_to_check->page_count < 1);
+ fs_info = sctx->dev_root->fs_info;
+ if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ /*
+ * if we find an error in a super block, we just report it.
+ * They will get written with the next transaction commit
+ * anyway
+ */
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+ return 0;
+ }
+ length = sblock_to_check->page_count * PAGE_SIZE;
+ logical = sblock_to_check->pagev[0]->logical;
+ generation = sblock_to_check->pagev[0]->generation;
+ BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+ failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+ is_metadata = !(sblock_to_check->pagev[0]->flags &
+ BTRFS_EXTENT_FLAG_DATA);
+ have_csum = sblock_to_check->pagev[0]->have_csum;
+ csum = sblock_to_check->pagev[0]->csum;
+ dev = sblock_to_check->pagev[0]->dev;
+
+ if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+ sblocks_for_recheck = NULL;
+ goto nodatasum_case;
+ }
+
+ /*
+ * read all mirrors one after the other. This includes to
+ * re-read the extent or metadata block that failed (that was
+ * the cause that this fixup code is called) another time,
+ * page by page this time in order to know which pages
+ * caused I/O errors and which ones are good (for all mirrors).
+ * It is the goal to handle the situation when more than one
+ * mirror contains I/O errors, but the errors do not
+ * overlap, i.e. the data can be repaired by selecting the
+ * pages from those mirrors without I/O error on the
+ * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
+ * would be that mirror #1 has an I/O error on the first page,
+ * the second page is good, and mirror #2 has an I/O error on
+ * the second page, but the first page is good.
+ * Then the first page of the first mirror can be repaired by
+ * taking the first page of the second mirror, and the
+ * second page of the second mirror can be repaired by
+ * copying the contents of the 2nd page of the 1st mirror.
+ * One more note: if the pages of one mirror contain I/O
+ * errors, the checksum cannot be verified. In order to get
+ * the best data for repairing, the first attempt is to find
+ * a mirror without I/O errors and with a validated checksum.
+ * Only if this is not possible, the pages are picked from
+ * mirrors with I/O errors without considering the checksum.
+ * If the latter is the case, at the end, the checksum of the
+ * repaired area is verified in order to correctly maintain
+ * the statistics.
+ */
+
+ sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
+ sizeof(*sblocks_for_recheck), GFP_NOFS);
+ if (!sblocks_for_recheck) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
+
+ /* setup the context, map the logical blocks and alloc the pages */
+ ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
+ if (ret) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
+ BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
+ sblock_bad = sblocks_for_recheck + failed_mirror_index;
+
+ /* build and submit the bios for the failed mirror, check checksums */
+ scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+ csum, generation, sctx->csum_size, 1);
+
+ if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen) {
+ /*
+ * the error disappeared after reading page by page, or
+ * the area was part of a huge bio and other parts of the
+ * bio caused I/O errors, or the block layer merged several
+ * read requests into one and the error is caused by a
+ * different bio (usually one of the two latter cases is
+ * the cause)
+ */
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.unverified_errors++;
+ sblock_to_check->data_corrected = 1;
+ spin_unlock(&sctx->stat_lock);
+
+ if (sctx->is_dev_replace)
+ scrub_write_block_to_dev_replace(sblock_bad);
+ goto out;
+ }
+
+ if (!sblock_bad->no_io_error_seen) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("i/o error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ } else if (sblock_bad->checksum_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.csum_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("checksum error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ } else if (sblock_bad->header_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.verify_errors++;
+ spin_unlock(&sctx->stat_lock);
+ if (__ratelimit(&_rs))
+ scrub_print_warning("checksum/header error",
+ sblock_to_check);
+ if (sblock_bad->generation_error)
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS);
+ else
+ btrfs_dev_stat_inc_and_print(dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ }
+
+ if (sctx->readonly) {
+ ASSERT(!sctx->is_dev_replace);
+ goto out;
+ }
+
+ if (!is_metadata && !have_csum) {
+ struct scrub_fixup_nodatasum *fixup_nodatasum;
+
+ WARN_ON(sctx->is_dev_replace);
+
+nodatasum_case:
+
+ /*
+ * !is_metadata and !have_csum, this means that the data
+ * might not be COW'ed, that it might be modified
+ * concurrently. The general strategy to work on the
+ * commit root does not help in the case when COW is not
+ * used.
+ */
+ fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
+ if (!fixup_nodatasum)
+ goto did_not_correct_error;
+ fixup_nodatasum->sctx = sctx;
+ fixup_nodatasum->dev = dev;
+ fixup_nodatasum->logical = logical;
+ fixup_nodatasum->root = fs_info->extent_root;
+ fixup_nodatasum->mirror_num = failed_mirror_index + 1;
+ scrub_pending_trans_workers_inc(sctx);
+ btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
+ scrub_fixup_nodatasum, NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_workers,
+ &fixup_nodatasum->work);
+ goto out;
+ }
+
+ /*
+ * now build and submit the bios for the other mirrors, check
+ * checksums.
+ * First try to pick the mirror which is completely without I/O
+ * errors and also does not have a checksum error.
+ * If one is found, and if a checksum is present, the full block
+ * that is known to contain an error is rewritten. Afterwards
+ * the block is known to be corrected.
+ * If a mirror is found which is completely correct, and no
+ * checksum is present, only those pages are rewritten that had
+ * an I/O error in the block to be repaired, since it cannot be
+ * determined, which copy of the other pages is better (and it
+ * could happen otherwise that a correct page would be
+ * overwritten by a bad one).
+ */
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ struct scrub_block *sblock_other;
+
+ if (mirror_index == failed_mirror_index)
+ continue;
+ sblock_other = sblocks_for_recheck + mirror_index;
+
+ /* build and submit the bios, check checksums */
+ scrub_recheck_block(fs_info, sblock_other, is_metadata,
+ have_csum, csum, generation,
+ sctx->csum_size, 0);
+
+ if (!sblock_other->header_error &&
+ !sblock_other->checksum_error &&
+ sblock_other->no_io_error_seen) {
+ if (sctx->is_dev_replace) {
+ scrub_write_block_to_dev_replace(sblock_other);
+ goto corrected_error;
+ } else {
+ ret = scrub_repair_block_from_good_copy(
+ sblock_bad, sblock_other);
+ if (!ret)
+ goto corrected_error;
+ }
+ }
+ }
+
+ if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
+ goto did_not_correct_error;
+
+ /*
+ * In case of I/O errors in the area that is supposed to be
+ * repaired, continue by picking good copies of those pages.
+ * Select the good pages from mirrors to rewrite bad pages from
+ * the area to fix. Afterwards verify the checksum of the block
+ * that is supposed to be repaired. This verification step is
+ * only done for the purpose of statistic counting and for the
+ * final scrub report, whether errors remain.
+ * A perfect algorithm could make use of the checksum and try
+ * all possible combinations of pages from the different mirrors
+ * until the checksum verification succeeds. For example, when
+ * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+ * of mirror #2 is readable but the final checksum test fails,
+ * then the 2nd page of mirror #3 could be tried, whether now
+ * the final checksum succeedes. But this would be a rare
+ * exception and is therefore not implemented. At least it is
+ * avoided that the good copy is overwritten.
+ * A more useful improvement would be to pick the sectors
+ * without I/O error based on sector sizes (512 bytes on legacy
+ * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+ * mirror could be repaired by taking 512 byte of a different
+ * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+ * area are unreadable.
+ */
+ success = 1;
+ for (page_num = 0; page_num < sblock_bad->page_count;
+ page_num++) {
+ struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_block *sblock_other = NULL;
+
+ /* skip no-io-error page in scrub */
+ if (!page_bad->io_error && !sctx->is_dev_replace)
+ continue;
+
+ /* try to find no-io-error page in mirrors */
+ if (page_bad->io_error) {
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ if (!sblocks_for_recheck[mirror_index].
+ pagev[page_num]->io_error) {
+ sblock_other = sblocks_for_recheck +
+ mirror_index;
+ break;
+ }
+ }
+ if (!sblock_other)
+ success = 0;
+ }
+
+ if (sctx->is_dev_replace) {
+ /*
+ * did not find a mirror to fetch the page
+ * from. scrub_write_page_to_dev_replace()
+ * handles this case (page->io_error), by
+ * filling the block with zeros before
+ * submitting the write request
+ */
+ if (!sblock_other)
+ sblock_other = sblock_bad;
+
+ if (scrub_write_page_to_dev_replace(sblock_other,
+ page_num) != 0) {
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->
+ fs_info->dev_replace.
+ num_write_errors);
+ success = 0;
+ }
+ } else if (sblock_other) {
+ ret = scrub_repair_page_from_good_copy(sblock_bad,
+ sblock_other,
+ page_num, 0);
+ if (0 == ret)
+ page_bad->io_error = 0;
+ else
+ success = 0;
+ }
+ }
+
+ if (success && !sctx->is_dev_replace) {
+ if (is_metadata || have_csum) {
+ /*
+ * need to verify the checksum now that all
+ * sectors on disk are repaired (the write
+ * request for data to be repaired is on its way).
+ * Just be lazy and use scrub_recheck_block()
+ * which re-reads the data before the checksum
+ * is verified, but most likely the data comes out
+ * of the page cache.
+ */
+ scrub_recheck_block(fs_info, sblock_bad,
+ is_metadata, have_csum, csum,
+ generation, sctx->csum_size, 1);
+ if (!sblock_bad->header_error &&
+ !sblock_bad->checksum_error &&
+ sblock_bad->no_io_error_seen)
+ goto corrected_error;
+ else
+ goto did_not_correct_error;
+ } else {
+corrected_error:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.corrected_errors++;
+ sblock_to_check->data_corrected = 1;
+ spin_unlock(&sctx->stat_lock);
+ printk_ratelimited_in_rcu(KERN_ERR
+ "BTRFS: fixed up error at logical %llu on dev %s\n",
+ logical, rcu_str_deref(dev->name));
+ }
+ } else {
+did_not_correct_error:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ printk_ratelimited_in_rcu(KERN_ERR
+ "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
+ logical, rcu_str_deref(dev->name));
+ }
+
+out:
+ if (sblocks_for_recheck) {
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
+ mirror_index++) {
+ struct scrub_block *sblock = sblocks_for_recheck +
+ mirror_index;
+ struct scrub_recover *recover;
+ int page_index;
+
+ for (page_index = 0; page_index < sblock->page_count;
+ page_index++) {
+ sblock->pagev[page_index]->sblock = NULL;
+ recover = sblock->pagev[page_index]->recover;
+ if (recover) {
+ scrub_put_recover(recover);
+ sblock->pagev[page_index]->recover =
+ NULL;
+ }
+ scrub_page_put(sblock->pagev[page_index]);
+ }
+ }
+ kfree(sblocks_for_recheck);
+ }
+
+ return 0;
+}
+
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
+{
+ if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ return 2;
+ else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ return 3;
+ else
+ return (int)bbio->num_stripes;
+}
+
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+ u64 *raid_map,
+ u64 mapped_length,
+ int nstripes, int mirror,
+ int *stripe_index,
+ u64 *stripe_offset)
+{
+ int i;
+
+ if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ /* RAID5/6 */
+ for (i = 0; i < nstripes; i++) {
+ if (raid_map[i] == RAID6_Q_STRIPE ||
+ raid_map[i] == RAID5_P_STRIPE)
+ continue;
+
+ if (logical >= raid_map[i] &&
+ logical < raid_map[i] + mapped_length)
+ break;
+ }
+
+ *stripe_index = i;
+ *stripe_offset = logical - raid_map[i];
+ } else {
+ /* The other RAID type */
+ *stripe_index = mirror;
+ *stripe_offset = 0;
+ }
+}
+
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
+ struct scrub_block *sblocks_for_recheck)
+{
+ struct scrub_ctx *sctx = original_sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ u64 length = original_sblock->page_count * PAGE_SIZE;
+ u64 logical = original_sblock->pagev[0]->logical;
+ struct scrub_recover *recover;
+ struct btrfs_bio *bbio;
+ u64 sublen;
+ u64 mapped_length;
+ u64 stripe_offset;
+ int stripe_index;
+ int page_index = 0;
+ int mirror_index;
+ int nmirrors;
+ int ret;
+
+ /*
+ * note: the two members refs and outstanding_pages
+ * are not used (and not set) in the blocks that are used for
+ * the recheck procedure
+ */
+
+ while (length > 0) {
+ sublen = min_t(u64, length, PAGE_SIZE);
+ mapped_length = sublen;
+ bbio = NULL;
+
+ /*
+ * with a length of PAGE_SIZE, each returned stripe
+ * represents one mirror
+ */
+ ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
+ &mapped_length, &bbio, 0, 1);
+ if (ret || !bbio || mapped_length < sublen) {
+ btrfs_put_bbio(bbio);
+ return -EIO;
+ }
+
+ recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+ if (!recover) {
+ btrfs_put_bbio(bbio);
+ return -ENOMEM;
+ }
+
+ atomic_set(&recover->refs, 1);
+ recover->bbio = bbio;
+ recover->map_length = mapped_length;
+
+ BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
+
+ nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+
+ for (mirror_index = 0; mirror_index < nmirrors;
+ mirror_index++) {
+ struct scrub_block *sblock;
+ struct scrub_page *page;
+
+ sblock = sblocks_for_recheck + mirror_index;
+ sblock->sctx = sctx;
+ page = kzalloc(sizeof(*page), GFP_NOFS);
+ if (!page) {
+leave_nomem:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_put_recover(recover);
+ return -ENOMEM;
+ }
+ scrub_page_get(page);
+ sblock->pagev[page_index] = page;
+ page->logical = logical;
+
+ scrub_stripe_index_and_offset(logical,
+ bbio->map_type,
+ bbio->raid_map,
+ mapped_length,
+ bbio->num_stripes -
+ bbio->num_tgtdevs,
+ mirror_index,
+ &stripe_index,
+ &stripe_offset);
+ page->physical = bbio->stripes[stripe_index].physical +
+ stripe_offset;
+ page->dev = bbio->stripes[stripe_index].dev;
+
+ BUG_ON(page_index >= original_sblock->page_count);
+ page->physical_for_dev_replace =
+ original_sblock->pagev[page_index]->
+ physical_for_dev_replace;
+ /* for missing devices, dev->bdev is NULL */
+ page->mirror_num = mirror_index + 1;
+ sblock->page_count++;
+ page->page = alloc_page(GFP_NOFS);
+ if (!page->page)
+ goto leave_nomem;
+
+ scrub_get_recover(recover);
+ page->recover = recover;
+ }
+ scrub_put_recover(recover);
+ length -= sublen;
+ logical += sublen;
+ page_index++;
+ }
+
+ return 0;
+}
+
+struct scrub_bio_ret {
+ struct completion event;
+ int error;
+};
+
+static void scrub_bio_wait_endio(struct bio *bio, int error)
+{
+ struct scrub_bio_ret *ret = bio->bi_private;
+
+ ret->error = error;
+ complete(&ret->event);
+}
+
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+ return page->recover &&
+ (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+}
+
+static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
+ struct bio *bio,
+ struct scrub_page *page)
+{
+ struct scrub_bio_ret done;
+ int ret;
+
+ init_completion(&done.event);
+ done.error = 0;
+ bio->bi_iter.bi_sector = page->logical >> 9;
+ bio->bi_private = &done;
+ bio->bi_end_io = scrub_bio_wait_endio;
+
+ ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
+ page->recover->map_length,
+ page->mirror_num, 0);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&done.event);
+ if (done.error)
+ return -EIO;
+
+ return 0;
+}
+
+/*
+ * this function will check the on disk data for checksum errors, header
+ * errors and read I/O errors. If any I/O errors happen, the exact pages
+ * which are errored are marked as being bad. The goal is to enable scrub
+ * to take those pages that are not errored from all the mirrors so that
+ * the pages that are errored in the just handled mirror can be repaired.
+ */
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock, int is_metadata,
+ int have_csum, u8 *csum, u64 generation,
+ u16 csum_size, int retry_failed_mirror)
+{
+ int page_num;
+
+ sblock->no_io_error_seen = 1;
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
+
+ for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ struct bio *bio;
+ struct scrub_page *page = sblock->pagev[page_num];
+
+ if (page->dev->bdev == NULL) {
+ page->io_error = 1;
+ sblock->no_io_error_seen = 0;
+ continue;
+ }
+
+ WARN_ON(!page->page);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio) {
+ page->io_error = 1;
+ sblock->no_io_error_seen = 0;
+ continue;
+ }
+ bio->bi_bdev = page->dev->bdev;
+
+ bio_add_page(bio, page->page, PAGE_SIZE, 0);
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+ sblock->no_io_error_seen = 0;
+ } else {
+ bio->bi_iter.bi_sector = page->physical >> 9;
+
+ if (btrfsic_submit_bio_wait(READ, bio))
+ sblock->no_io_error_seen = 0;
+ }
+
+ bio_put(bio);
+ }
+
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+ have_csum, csum, generation,
+ csum_size);
+
+ return;
+}
+
+static inline int scrub_check_fsid(u8 fsid[],
+ struct scrub_page *spage)
+{
+ struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
+ int ret;
+
+ ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
+ return !ret;
+}
+
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock,
+ int is_metadata, int have_csum,
+ const u8 *csum, u64 generation,
+ u16 csum_size)
+{
+ int page_num;
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ void *mapped_buffer;
+
+ WARN_ON(!sblock->pagev[0]->page);
+ if (is_metadata) {
+ struct btrfs_header *h;
+
+ mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
+ h = (struct btrfs_header *)mapped_buffer;
+
+ if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
+ !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
+ memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+ BTRFS_UUID_SIZE)) {
+ sblock->header_error = 1;
+ } else if (generation != btrfs_stack_header_generation(h)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ }
+ csum = h->csum;
+ } else {
+ if (!have_csum)
+ return;
+
+ mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
+ }
+
+ for (page_num = 0;;) {
+ if (page_num == 0 && is_metadata)
+ crc = btrfs_csum_data(
+ ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
+ crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
+ else
+ crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
+
+ kunmap_atomic(mapped_buffer);
+ page_num++;
+ if (page_num >= sblock->page_count)
+ break;
+ WARN_ON(!sblock->pagev[page_num]->page);
+
+ mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, csum, csum_size))
+ sblock->checksum_error = 1;
+}
+
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good)
+{
+ int page_num;
+ int ret = 0;
+
+ for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ int ret_sub;
+
+ ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
+ sblock_good,
+ page_num, 1);
+ if (ret_sub)
+ ret = ret_sub;
+ }
+
+ return ret;
+}
+
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int page_num, int force_write)
+{
+ struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_page *page_good = sblock_good->pagev[page_num];
+
+ BUG_ON(page_bad->page == NULL);
+ BUG_ON(page_good->page == NULL);
+ if (force_write || sblock_bad->header_error ||
+ sblock_bad->checksum_error || page_bad->io_error) {
+ struct bio *bio;
+ int ret;
+
+ if (!page_bad->dev->bdev) {
+ printk_ratelimited(KERN_WARNING "BTRFS: "
+ "scrub_repair_page_from_good_copy(bdev == NULL) "
+ "is unexpected!\n");
+ return -EIO;
+ }
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio)
+ return -EIO;
+ bio->bi_bdev = page_bad->dev->bdev;
+ bio->bi_iter.bi_sector = page_bad->physical >> 9;
+
+ ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+ if (PAGE_SIZE != ret) {
+ bio_put(bio);
+ return -EIO;
+ }
+
+ if (btrfsic_submit_bio_wait(WRITE, bio)) {
+ btrfs_dev_stat_inc_and_print(page_bad->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ btrfs_dev_replace_stats_inc(
+ &sblock_bad->sctx->dev_root->fs_info->
+ dev_replace.num_write_errors);
+ bio_put(bio);
+ return -EIO;
+ }
+ bio_put(bio);
+ }
+
+ return 0;
+}
+
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+ int page_num;
+
+ /*
+ * This block is used for the check of the parity on the source device,
+ * so the data needn't be written into the destination device.
+ */
+ if (sblock->sparity)
+ return;
+
+ for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ int ret;
+
+ ret = scrub_write_page_to_dev_replace(sblock, page_num);
+ if (ret)
+ btrfs_dev_replace_stats_inc(
+ &sblock->sctx->dev_root->fs_info->dev_replace.
+ num_write_errors);
+ }
+}
+
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+ int page_num)
+{
+ struct scrub_page *spage = sblock->pagev[page_num];
+
+ BUG_ON(spage->page == NULL);
+ if (spage->io_error) {
+ void *mapped_buffer = kmap_atomic(spage->page);
+
+ memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+ flush_dcache_page(spage->page);
+ kunmap_atomic(mapped_buffer);
+ }
+ return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+}
+
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage)
+{
+ struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+ struct scrub_bio *sbio;
+ int ret;
+
+ mutex_lock(&wr_ctx->wr_lock);
+again:
+ if (!wr_ctx->wr_curr_bio) {
+ wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+ GFP_NOFS);
+ if (!wr_ctx->wr_curr_bio) {
+ mutex_unlock(&wr_ctx->wr_lock);
+ return -ENOMEM;
+ }
+ wr_ctx->wr_curr_bio->sctx = sctx;
+ wr_ctx->wr_curr_bio->page_count = 0;
+ }
+ sbio = wr_ctx->wr_curr_bio;
+ if (sbio->page_count == 0) {
+ struct bio *bio;
+
+ sbio->physical = spage->physical_for_dev_replace;
+ sbio->logical = spage->logical;
+ sbio->dev = wr_ctx->tgtdev;
+ bio = sbio->bio;
+ if (!bio) {
+ bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+ if (!bio) {
+ mutex_unlock(&wr_ctx->wr_lock);
+ return -ENOMEM;
+ }
+ sbio->bio = bio;
+ }
+
+ bio->bi_private = sbio;
+ bio->bi_end_io = scrub_wr_bio_end_io;
+ bio->bi_bdev = sbio->dev->bdev;
+ bio->bi_iter.bi_sector = sbio->physical >> 9;
+ sbio->err = 0;
+ } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ spage->physical_for_dev_replace ||
+ sbio->logical + sbio->page_count * PAGE_SIZE !=
+ spage->logical) {
+ scrub_wr_submit(sctx);
+ goto again;
+ }
+
+ ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
+ if (sbio->page_count < 1) {
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ mutex_unlock(&wr_ctx->wr_lock);
+ return -EIO;
+ }
+ scrub_wr_submit(sctx);
+ goto again;
+ }
+
+ sbio->pagev[sbio->page_count] = spage;
+ scrub_page_get(spage);
+ sbio->page_count++;
+ if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+ scrub_wr_submit(sctx);
+ mutex_unlock(&wr_ctx->wr_lock);
+
+ return 0;
+}
+
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+ struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+ struct scrub_bio *sbio;
+
+ if (!wr_ctx->wr_curr_bio)
+ return;
+
+ sbio = wr_ctx->wr_curr_bio;
+ wr_ctx->wr_curr_bio = NULL;
+ WARN_ON(!sbio->bio->bi_bdev);
+ scrub_pending_bio_inc(sctx);
+ /* process all writes in a single worker thread. Then the block layer
+ * orders the requests before sending them to the driver which
+ * doubled the write performance on spinning disks when measured
+ * with Linux 3.5 */
+ btrfsic_submit_bio(WRITE, sbio->bio);
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio, int err)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+ sbio->err = err;
+ sbio->bio = bio;
+
+ btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
+ scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
+}
+
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_ctx *sctx = sbio->sctx;
+ int i;
+
+ WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+ if (sbio->err) {
+ struct btrfs_dev_replace *dev_replace =
+ &sbio->sctx->dev_root->fs_info->dev_replace;
+
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+
+ spage->io_error = 1;
+ btrfs_dev_replace_stats_inc(&dev_replace->
+ num_write_errors);
+ }
+ }
+
+ for (i = 0; i < sbio->page_count; i++)
+ scrub_page_put(sbio->pagev[i]);
+
+ bio_put(sbio->bio);
+ kfree(sbio);
+ scrub_pending_bio_dec(sctx);
+}
+
+static int scrub_checksum(struct scrub_block *sblock)
+{
+ u64 flags;
+ int ret;
+
+ WARN_ON(sblock->page_count < 1);
+ flags = sblock->pagev[0]->flags;
+ ret = 0;
+ if (flags & BTRFS_EXTENT_FLAG_DATA)
+ ret = scrub_checksum_data(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ ret = scrub_checksum_tree_block(sblock);
+ else if (flags & BTRFS_EXTENT_FLAG_SUPER)
+ (void)scrub_checksum_super(sblock);
+ else
+ WARN_ON(1);
+ if (ret)
+ scrub_handle_errored_block(sblock);
+
+ return ret;
+}
+
+static int scrub_checksum_data(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u8 *on_disk_csum;
+ struct page *page;
+ void *buffer;
+ u32 crc = ~(u32)0;
+ int fail = 0;
+ u64 len;
+ int index;
+
+ BUG_ON(sblock->page_count < 1);
+ if (!sblock->pagev[0]->have_csum)
+ return 0;
+
+ on_disk_csum = sblock->pagev[0]->csum;
+ page = sblock->pagev[0]->page;
+ buffer = kmap_atomic(page);
+
+ len = sctx->sectorsize;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ crc = btrfs_csum_data(buffer, crc, l);
+ kunmap_atomic(buffer);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
+ buffer = kmap_atomic(page);
+ }
+
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, on_disk_csum, sctx->csum_size))
+ fail = 1;
+
+ return fail;
+}
+
+static int scrub_checksum_tree_block(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_header *h;
+ struct btrfs_root *root = sctx->dev_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ struct page *page;
+ void *mapped_buffer;
+ u64 mapped_size;
+ void *p;
+ u32 crc = ~(u32)0;
+ int fail = 0;
+ int crc_fail = 0;
+ u64 len;
+ int index;
+
+ BUG_ON(sblock->page_count < 1);
+ page = sblock->pagev[0]->page;
+ mapped_buffer = kmap_atomic(page);
+ h = (struct btrfs_header *)mapped_buffer;
+ memcpy(on_disk_csum, h->csum, sctx->csum_size);
+
+ /*
+ * we don't use the getter functions here, as we
+ * a) don't have an extent buffer and
+ * b) the page is already kmapped
+ */
+
+ if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
+ ++fail;
+
+ if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
+ ++fail;
+
+ if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
+ ++fail;
+
+ if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+ BTRFS_UUID_SIZE))
+ ++fail;
+
+ len = sctx->nodesize - BTRFS_CSUM_SIZE;
+ mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+ p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, mapped_size);
+
+ crc = btrfs_csum_data(p, crc, l);
+ kunmap_atomic(mapped_buffer);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
+ mapped_buffer = kmap_atomic(page);
+ mapped_size = PAGE_SIZE;
+ p = mapped_buffer;
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ ++crc_fail;
+
+ return fail || crc_fail;
+}
+
+static int scrub_checksum_super(struct scrub_block *sblock)
+{
+ struct btrfs_super_block *s;
+ struct scrub_ctx *sctx = sblock->sctx;
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ u8 on_disk_csum[BTRFS_CSUM_SIZE];
+ struct page *page;
+ void *mapped_buffer;
+ u64 mapped_size;
+ void *p;
+ u32 crc = ~(u32)0;
+ int fail_gen = 0;
+ int fail_cor = 0;
+ u64 len;
+ int index;
+
+ BUG_ON(sblock->page_count < 1);
+ page = sblock->pagev[0]->page;
+ mapped_buffer = kmap_atomic(page);
+ s = (struct btrfs_super_block *)mapped_buffer;
+ memcpy(on_disk_csum, s->csum, sctx->csum_size);
+
+ if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
+ ++fail_cor;
+
+ if (sblock->pagev[0]->generation != btrfs_super_generation(s))
+ ++fail_gen;
+
+ if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
+ ++fail_cor;
+
+ len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
+ mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+ p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+ index = 0;
+ for (;;) {
+ u64 l = min_t(u64, len, mapped_size);
+
+ crc = btrfs_csum_data(p, crc, l);
+ kunmap_atomic(mapped_buffer);
+ len -= l;
+ if (len == 0)
+ break;
+ index++;
+ BUG_ON(index >= sblock->page_count);
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
+ mapped_buffer = kmap_atomic(page);
+ mapped_size = PAGE_SIZE;
+ p = mapped_buffer;
+ }
+
+ btrfs_csum_final(crc, calculated_csum);
+ if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ ++fail_cor;
+
+ if (fail_cor + fail_gen) {
+ /*
+ * if we find an error in a super block, we just report it.
+ * They will get written with the next transaction commit
+ * anyway
+ */
+ spin_lock(&sctx->stat_lock);
+ ++sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+ if (fail_cor)
+ btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ else
+ btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS);
+ }
+
+ return fail_cor + fail_gen;
+}
+
+static void scrub_block_get(struct scrub_block *sblock)
+{
+ atomic_inc(&sblock->refs);
+}
+
+static void scrub_block_put(struct scrub_block *sblock)
+{
+ if (atomic_dec_and_test(&sblock->refs)) {
+ int i;
+
+ if (sblock->sparity)
+ scrub_parity_put(sblock->sparity);
+
+ for (i = 0; i < sblock->page_count; i++)
+ scrub_page_put(sblock->pagev[i]);
+ kfree(sblock);
+ }
+}
+
+static void scrub_page_get(struct scrub_page *spage)
+{
+ atomic_inc(&spage->refs);
+}
+
+static void scrub_page_put(struct scrub_page *spage)
+{
+ if (atomic_dec_and_test(&spage->refs)) {
+ if (spage->page)
+ __free_page(spage->page);
+ kfree(spage);
+ }
+}
+
+static void scrub_submit(struct scrub_ctx *sctx)
+{
+ struct scrub_bio *sbio;
+
+ if (sctx->curr == -1)
+ return;
+
+ sbio = sctx->bios[sctx->curr];
+ sctx->curr = -1;
+ scrub_pending_bio_inc(sctx);
+
+ if (!sbio->bio->bi_bdev) {
+ /*
+ * this case should not happen. If btrfs_map_block() is
+ * wrong, it could happen for dev-replace operations on
+ * missing devices when no mirrors are available, but in
+ * this case it should already fail the mount.
+ * This case is handled correctly (but _very_ slowly).
+ */
+ printk_ratelimited(KERN_WARNING
+ "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
+ bio_endio(sbio->bio, -EIO);
+ } else {
+ btrfsic_submit_bio(READ, sbio->bio);
+ }
+}
+
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_page *spage)
+{
+ struct scrub_block *sblock = spage->sblock;
+ struct scrub_bio *sbio;
+ int ret;
+
+again:
+ /*
+ * grab a fresh bio or wait for one to become available
+ */
+ while (sctx->curr == -1) {
+ spin_lock(&sctx->list_lock);
+ sctx->curr = sctx->first_free;
+ if (sctx->curr != -1) {
+ sctx->first_free = sctx->bios[sctx->curr]->next_free;
+ sctx->bios[sctx->curr]->next_free = -1;
+ sctx->bios[sctx->curr]->page_count = 0;
+ spin_unlock(&sctx->list_lock);
+ } else {
+ spin_unlock(&sctx->list_lock);
+ wait_event(sctx->list_wait, sctx->first_free != -1);
+ }
+ }
+ sbio = sctx->bios[sctx->curr];
+ if (sbio->page_count == 0) {
+ struct bio *bio;
+
+ sbio->physical = spage->physical;
+ sbio->logical = spage->logical;
+ sbio->dev = spage->dev;
+ bio = sbio->bio;
+ if (!bio) {
+ bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
+ if (!bio)
+ return -ENOMEM;
+ sbio->bio = bio;
+ }
+
+ bio->bi_private = sbio;
+ bio->bi_end_io = scrub_bio_end_io;
+ bio->bi_bdev = sbio->dev->bdev;
+ bio->bi_iter.bi_sector = sbio->physical >> 9;
+ sbio->err = 0;
+ } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ spage->physical ||
+ sbio->logical + sbio->page_count * PAGE_SIZE !=
+ spage->logical ||
+ sbio->dev != spage->dev) {
+ scrub_submit(sctx);
+ goto again;
+ }
+
+ sbio->pagev[sbio->page_count] = spage;
+ ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
+ if (sbio->page_count < 1) {
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ return -EIO;
+ }
+ scrub_submit(sctx);
+ goto again;
+ }
+
+ scrub_block_get(sblock); /* one for the page added to the bio */
+ atomic_inc(&sblock->outstanding_pages);
+ sbio->page_count++;
+ if (sbio->page_count == sctx->pages_per_rd_bio)
+ scrub_submit(sctx);
+
+ return 0;
+}
+
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum, int force,
+ u64 physical_for_dev_replace)
+{
+ struct scrub_block *sblock;
+ int index;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+ if (!sblock) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ /* one ref inside this function, plus one for each page added to
+ * a bio later on */
+ atomic_set(&sblock->refs, 1);
+ sblock->sctx = sctx;
+ sblock->no_io_error_seen = 1;
+
+ for (index = 0; len > 0; index++) {
+ struct scrub_page *spage;
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ if (!spage) {
+leave_nomem:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_block_put(sblock);
+ return -ENOMEM;
+ }
+ BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ scrub_page_get(spage);
+ sblock->pagev[index] = spage;
+ spage->sblock = sblock;
+ spage->dev = dev;
+ spage->flags = flags;
+ spage->generation = gen;
+ spage->logical = logical;
+ spage->physical = physical;
+ spage->physical_for_dev_replace = physical_for_dev_replace;
+ spage->mirror_num = mirror_num;
+ if (csum) {
+ spage->have_csum = 1;
+ memcpy(spage->csum, csum, sctx->csum_size);
+ } else {
+ spage->have_csum = 0;
+ }
+ sblock->page_count++;
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page)
+ goto leave_nomem;
+ len -= l;
+ logical += l;
+ physical += l;
+ physical_for_dev_replace += l;
+ }
+
+ WARN_ON(sblock->page_count == 0);
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev[index];
+ int ret;
+
+ ret = scrub_add_page_to_rd_bio(sctx, spage);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
+ }
+
+ if (force)
+ scrub_submit(sctx);
+
+ /* last one frees, either here or in bio completion for last page */
+ scrub_block_put(sblock);
+ return 0;
+}
+
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+ sbio->err = err;
+ sbio->bio = bio;
+
+ btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
+}
+
+static void scrub_bio_end_io_worker(struct btrfs_work *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_ctx *sctx = sbio->sctx;
+ int i;
+
+ BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
+ if (sbio->err) {
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+
+ spage->io_error = 1;
+ spage->sblock->no_io_error_seen = 0;
+ }
+ }
+
+ /* now complete the scrub_block items that have all pages completed */
+ for (i = 0; i < sbio->page_count; i++) {
+ struct scrub_page *spage = sbio->pagev[i];
+ struct scrub_block *sblock = spage->sblock;
+
+ if (atomic_dec_and_test(&sblock->outstanding_pages))
+ scrub_block_complete(sblock);
+ scrub_block_put(sblock);
+ }
+
+ bio_put(sbio->bio);
+ sbio->bio = NULL;
+ spin_lock(&sctx->list_lock);
+ sbio->next_free = sctx->first_free;
+ sctx->first_free = sbio->index;
+ spin_unlock(&sctx->list_lock);
+
+ if (sctx->is_dev_replace &&
+ atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+ }
+
+ scrub_pending_bio_dec(sctx);
+}
+
+static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
+ unsigned long *bitmap,
+ u64 start, u64 len)
+{
+ u32 offset;
+ int nsectors;
+ int sectorsize = sparity->sctx->dev_root->sectorsize;
+
+ if (len >= sparity->stripe_len) {
+ bitmap_set(bitmap, 0, sparity->nsectors);
+ return;
+ }
+
+ start -= sparity->logic_start;
+ start = div_u64_rem(start, sparity->stripe_len, &offset);
+ offset /= sectorsize;
+ nsectors = (int)len / sectorsize;
+
+ if (offset + nsectors <= sparity->nsectors) {
+ bitmap_set(bitmap, offset, nsectors);
+ return;
+ }
+
+ bitmap_set(bitmap, offset, sparity->nsectors - offset);
+ bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
+}
+
+static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
+ u64 start, u64 len)
+{
+ __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
+}
+
+static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
+ u64 start, u64 len)
+{
+ __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
+}
+
+static void scrub_block_complete(struct scrub_block *sblock)
+{
+ int corrupted = 0;
+
+ if (!sblock->no_io_error_seen) {
+ corrupted = 1;
+ scrub_handle_errored_block(sblock);
+ } else {
+ /*
+ * if has checksum error, write via repair mechanism in
+ * dev replace case, otherwise write here in dev replace
+ * case.
+ */
+ corrupted = scrub_checksum(sblock);
+ if (!corrupted && sblock->sctx->is_dev_replace)
+ scrub_write_block_to_dev_replace(sblock);
+ }
+
+ if (sblock->sparity && corrupted && !sblock->data_corrected) {
+ u64 start = sblock->pagev[0]->logical;
+ u64 end = sblock->pagev[sblock->page_count - 1]->logical +
+ PAGE_SIZE;
+
+ scrub_parity_mark_sectors_error(sblock->sparity,
+ start, end - start);
+ }
+}
+
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u8 *csum)
+{
+ struct btrfs_ordered_sum *sum = NULL;
+ unsigned long index;
+ unsigned long num_sectors;
+
+ while (!list_empty(&sctx->csum_list)) {
+ sum = list_first_entry(&sctx->csum_list,
+ struct btrfs_ordered_sum, list);
+ if (sum->bytenr > logical)
+ return 0;
+ if (sum->bytenr + sum->len > logical)
+ break;
+
+ ++sctx->stat.csum_discards;
+ list_del(&sum->list);
+ kfree(sum);
+ sum = NULL;
+ }
+ if (!sum)
+ return 0;
+
+ index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
+ num_sectors = sum->len / sctx->sectorsize;
+ memcpy(csum, sum->sums + index, sctx->csum_size);
+ if (index == num_sectors - 1) {
+ list_del(&sum->list);
+ kfree(sum);
+ }
+ return 1;
+}
+
+/* scrub extent tries to collect up to 64 kB for each bio */
+static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u64 physical_for_dev_replace)
+{
+ int ret;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 blocksize;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ blocksize = sctx->sectorsize;
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.data_extents_scrubbed++;
+ sctx->stat.data_bytes_scrubbed += len;
+ spin_unlock(&sctx->stat_lock);
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ blocksize = sctx->nodesize;
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.tree_extents_scrubbed++;
+ sctx->stat.tree_bytes_scrubbed += len;
+ spin_unlock(&sctx->stat_lock);
+ } else {
+ blocksize = sctx->sectorsize;
+ WARN_ON(1);
+ }
+
+ while (len) {
+ u64 l = min_t(u64, len, blocksize);
+ int have_csum = 0;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ /* push csums to sbio */
+ have_csum = scrub_find_csum(sctx, logical, l, csum);
+ if (have_csum == 0)
+ ++sctx->stat.no_csum;
+ if (sctx->is_dev_replace && !have_csum) {
+ ret = copy_nocow_pages(sctx, logical, l,
+ mirror_num,
+ physical_for_dev_replace);
+ goto behind_scrub_pages;
+ }
+ }
+ ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
+ mirror_num, have_csum ? csum : NULL, 0,
+ physical_for_dev_replace);
+behind_scrub_pages:
+ if (ret)
+ return ret;
+ len -= l;
+ logical += l;
+ physical += l;
+ physical_for_dev_replace += l;
+ }
+ return 0;
+}
+
+static int scrub_pages_for_parity(struct scrub_parity *sparity,
+ u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev,
+ u64 flags, u64 gen, int mirror_num, u8 *csum)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct scrub_block *sblock;
+ int index;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+ if (!sblock) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ /* one ref inside this function, plus one for each page added to
+ * a bio later on */
+ atomic_set(&sblock->refs, 1);
+ sblock->sctx = sctx;
+ sblock->no_io_error_seen = 1;
+ sblock->sparity = sparity;
+ scrub_parity_get(sparity);
+
+ for (index = 0; len > 0; index++) {
+ struct scrub_page *spage;
+ u64 l = min_t(u64, len, PAGE_SIZE);
+
+ spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ if (!spage) {
+leave_nomem:
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ scrub_block_put(sblock);
+ return -ENOMEM;
+ }
+ BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ /* For scrub block */
+ scrub_page_get(spage);
+ sblock->pagev[index] = spage;
+ /* For scrub parity */
+ scrub_page_get(spage);
+ list_add_tail(&spage->list, &sparity->spages);
+ spage->sblock = sblock;
+ spage->dev = dev;
+ spage->flags = flags;
+ spage->generation = gen;
+ spage->logical = logical;
+ spage->physical = physical;
+ spage->mirror_num = mirror_num;
+ if (csum) {
+ spage->have_csum = 1;
+ memcpy(spage->csum, csum, sctx->csum_size);
+ } else {
+ spage->have_csum = 0;
+ }
+ sblock->page_count++;
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page)
+ goto leave_nomem;
+ len -= l;
+ logical += l;
+ physical += l;
+ }
+
+ WARN_ON(sblock->page_count == 0);
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev[index];
+ int ret;
+
+ ret = scrub_add_page_to_rd_bio(sctx, spage);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
+ }
+
+ /* last one frees, either here or in bio completion for last page */
+ scrub_block_put(sblock);
+ return 0;
+}
+
+static int scrub_extent_for_parity(struct scrub_parity *sparity,
+ u64 logical, u64 len,
+ u64 physical, struct btrfs_device *dev,
+ u64 flags, u64 gen, int mirror_num)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ int ret;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 blocksize;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ blocksize = sctx->sectorsize;
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ blocksize = sctx->nodesize;
+ } else {
+ blocksize = sctx->sectorsize;
+ WARN_ON(1);
+ }
+
+ while (len) {
+ u64 l = min_t(u64, len, blocksize);
+ int have_csum = 0;
+
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ /* push csums to sbio */
+ have_csum = scrub_find_csum(sctx, logical, l, csum);
+ if (have_csum == 0)
+ goto skip;
+ }
+ ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
+ flags, gen, mirror_num,
+ have_csum ? csum : NULL);
+ if (ret)
+ return ret;
+skip:
+ len -= l;
+ logical += l;
+ physical += l;
+ }
+ return 0;
+}
+
+/*
+ * Given a physical address, this will calculate it's
+ * logical offset. if this is a parity stripe, it will return
+ * the most left data stripe's logical offset.
+ *
+ * return 0 if it is a data stripe, 1 means parity stripe.
+ */
+static int get_raid56_logic_offset(u64 physical, int num,
+ struct map_lookup *map, u64 *offset,
+ u64 *stripe_start)
+{
+ int i;
+ int j = 0;
+ u64 stripe_nr;
+ u64 last_offset;
+ u32 stripe_index;
+ u32 rot;
+
+ last_offset = (physical - map->stripes[num].physical) *
+ nr_data_stripes(map);
+ if (stripe_start)
+ *stripe_start = last_offset;
+
+ *offset = last_offset;
+ for (i = 0; i < nr_data_stripes(map); i++) {
+ *offset = last_offset + i * map->stripe_len;
+
+ stripe_nr = div_u64(*offset, map->stripe_len);
+ stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
+
+ /* Work out the disk rotation on this stripe-set */
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
+ /* calculate which stripe this data locates */
+ rot += i;
+ stripe_index = rot % map->num_stripes;
+ if (stripe_index == num)
+ return 0;
+ if (stripe_index < num)
+ j++;
+ }
+ *offset = last_offset + j * map->stripe_len;
+ return 1;
+}
+
+static void scrub_free_parity(struct scrub_parity *sparity)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct scrub_page *curr, *next;
+ int nbits;
+
+ nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
+ if (nbits) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors += nbits;
+ sctx->stat.uncorrectable_errors += nbits;
+ spin_unlock(&sctx->stat_lock);
+ }
+
+ list_for_each_entry_safe(curr, next, &sparity->spages, list) {
+ list_del_init(&curr->list);
+ scrub_page_put(curr);
+ }
+
+ kfree(sparity);
+}
+
+static void scrub_parity_bio_endio(struct bio *bio, int error)
+{
+ struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+ struct scrub_ctx *sctx = sparity->sctx;
+
+ if (error)
+ bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ sparity->nsectors);
+
+ scrub_free_parity(sparity);
+ scrub_pending_bio_dec(sctx);
+ bio_put(bio);
+}
+
+static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
+{
+ struct scrub_ctx *sctx = sparity->sctx;
+ struct bio *bio;
+ struct btrfs_raid_bio *rbio;
+ struct scrub_page *spage;
+ struct btrfs_bio *bbio = NULL;
+ u64 length;
+ int ret;
+
+ if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
+ sparity->nsectors))
+ goto out;
+
+ length = sparity->logic_end - sparity->logic_start + 1;
+ ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
+ sparity->logic_start,
+ &length, &bbio, 0, 1);
+ if (ret || !bbio || !bbio->raid_map)
+ goto bbio_out;
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ goto bbio_out;
+
+ bio->bi_iter.bi_sector = sparity->logic_start >> 9;
+ bio->bi_private = sparity;
+ bio->bi_end_io = scrub_parity_bio_endio;
+
+ rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
+ length, sparity->scrub_dev,
+ sparity->dbitmap,
+ sparity->nsectors);
+ if (!rbio)
+ goto rbio_out;
+
+ list_for_each_entry(spage, &sparity->spages, list)
+ raid56_parity_add_scrub_pages(rbio, spage->page,
+ spage->logical);
+
+ scrub_pending_bio_inc(sctx);
+ raid56_parity_submit_scrub_rbio(rbio);
+ return;
+
+rbio_out:
+ bio_put(bio);
+bbio_out:
+ btrfs_put_bbio(bbio);
+ bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ sparity->nsectors);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+out:
+ scrub_free_parity(sparity);
+}
+
+static inline int scrub_calc_parity_bitmap_len(int nsectors)
+{
+ return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
+}
+
+static void scrub_parity_get(struct scrub_parity *sparity)
+{
+ atomic_inc(&sparity->refs);
+}
+
+static void scrub_parity_put(struct scrub_parity *sparity)
+{
+ if (!atomic_dec_and_test(&sparity->refs))
+ return;
+
+ scrub_parity_check_and_repair(sparity);
+}
+
+static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
+ struct map_lookup *map,
+ struct btrfs_device *sdev,
+ struct btrfs_path *path,
+ u64 logic_start,
+ u64 logic_end)
+{
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_extent_item *extent;
+ u64 flags;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ u64 generation;
+ u64 extent_logical;
+ u64 extent_physical;
+ u64 extent_len;
+ struct btrfs_device *extent_dev;
+ struct scrub_parity *sparity;
+ int nsectors;
+ int bitmap_len;
+ int extent_mirror_num;
+ int stop_loop = 0;
+
+ nsectors = map->stripe_len / root->sectorsize;
+ bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
+ sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
+ GFP_NOFS);
+ if (!sparity) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ sparity->stripe_len = map->stripe_len;
+ sparity->nsectors = nsectors;
+ sparity->sctx = sctx;
+ sparity->scrub_dev = sdev;
+ sparity->logic_start = logic_start;
+ sparity->logic_end = logic_end;
+ atomic_set(&sparity->refs, 1);
+ INIT_LIST_HEAD(&sparity->spages);
+ sparity->dbitmap = sparity->bitmap;
+ sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
+
+ ret = 0;
+ while (logic_start < logic_end) {
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = logic_start;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = btrfs_previous_extent_item(root, path, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &key,
+ path, 0, 0);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ stop_loop = 0;
+ while (1) {
+ u64 bytes;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ stop_loop = 1;
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ bytes = root->nodesize;
+ else
+ bytes = key.offset;
+
+ if (key.objectid + bytes <= logic_start)
+ goto next;
+
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
+ if (key.objectid > logic_end) {
+ stop_loop = 1;
+ break;
+ }
+
+ while (key.objectid >= logic_start + map->stripe_len)
+ logic_start += map->stripe_len;
+
+ extent = btrfs_item_ptr(l, slot,
+ struct btrfs_extent_item);
+ flags = btrfs_extent_flags(l, extent);
+ generation = btrfs_extent_generation(l, extent);
+
+ if (key.objectid < logic_start &&
+ (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ key.objectid, logic_start);
+ goto next;
+ }
+again:
+ extent_logical = key.objectid;
+ extent_len = bytes;
+
+ if (extent_logical < logic_start) {
+ extent_len -= logic_start - extent_logical;
+ extent_logical = logic_start;
+ }
+
+ if (extent_logical + extent_len >
+ logic_start + map->stripe_len)
+ extent_len = logic_start + map->stripe_len -
+ extent_logical;
+
+ scrub_parity_mark_sectors_data(sparity, extent_logical,
+ extent_len);
+
+ scrub_remap_extent(fs_info, extent_logical,
+ extent_len, &extent_physical,
+ &extent_dev,
+ &extent_mirror_num);
+
+ ret = btrfs_lookup_csums_range(csum_root,
+ extent_logical,
+ extent_logical + extent_len - 1,
+ &sctx->csum_list, 1);
+ if (ret)
+ goto out;
+
+ ret = scrub_extent_for_parity(sparity, extent_logical,
+ extent_len,
+ extent_physical,
+ extent_dev, flags,
+ generation,
+ extent_mirror_num);
+ if (ret)
+ goto out;
+
+ scrub_free_csums(sctx);
+ if (extent_logical + extent_len <
+ key.objectid + bytes) {
+ logic_start += map->stripe_len;
+
+ if (logic_start >= logic_end) {
+ stop_loop = 1;
+ break;
+ }
+
+ if (logic_start < key.objectid + bytes) {
+ cond_resched();
+ goto again;
+ }
+ }
+next:
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ if (stop_loop)
+ break;
+
+ logic_start += map->stripe_len;
+ }
+out:
+ if (ret < 0)
+ scrub_parity_mark_sectors_error(sparity, logic_start,
+ logic_end - logic_start + 1);
+ scrub_parity_put(sparity);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+ struct map_lookup *map,
+ struct btrfs_device *scrub_dev,
+ int num, u64 base, u64 length,
+ int is_dev_replace)
+{
+ struct btrfs_path *path, *ppath;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_extent_item *extent;
+ struct blk_plug plug;
+ u64 flags;
+ int ret;
+ int slot;
+ u64 nstripes;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ u64 physical;
+ u64 logical;
+ u64 logic_end;
+ u64 physical_end;
+ u64 generation;
+ int mirror_num;
+ struct reada_control *reada1;
+ struct reada_control *reada2;
+ struct btrfs_key key_start;
+ struct btrfs_key key_end;
+ u64 increment = map->stripe_len;
+ u64 offset;
+ u64 extent_logical;
+ u64 extent_physical;
+ u64 extent_len;
+ u64 stripe_logical;
+ u64 stripe_end;
+ struct btrfs_device *extent_dev;
+ int extent_mirror_num;
+ int stop_loop = 0;
+
+ physical = map->stripes[num].physical;
+ offset = 0;
+ nstripes = div_u64(length, map->stripe_len);
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ offset = map->stripe_len * num;
+ increment = map->stripe_len * map->num_stripes;
+ mirror_num = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ int factor = map->num_stripes / map->sub_stripes;
+ offset = map->stripe_len * (num / map->sub_stripes);
+ increment = map->stripe_len * factor;
+ mirror_num = num % map->sub_stripes + 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ increment = map->stripe_len;
+ mirror_num = num % map->num_stripes + 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+ increment = map->stripe_len;
+ mirror_num = num % map->num_stripes + 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ get_raid56_logic_offset(physical, num, map, &offset, NULL);
+ increment = map->stripe_len * nr_data_stripes(map);
+ mirror_num = 1;
+ } else {
+ increment = map->stripe_len;
+ mirror_num = 1;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ppath = btrfs_alloc_path();
+ if (!ppath) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ /*
+ * work on commit root. The related disk blocks are static as
+ * long as COW is applied. This means, it is save to rewrite
+ * them to repair disk errors without any race conditions
+ */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ ppath->search_commit_root = 1;
+ ppath->skip_locking = 1;
+ /*
+ * trigger the readahead for extent tree csum tree and wait for
+ * completion. During readahead, the scrub is officially paused
+ * to not hold off transaction commits
+ */
+ logical = base + offset;
+ physical_end = physical + nstripes * map->stripe_len;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ get_raid56_logic_offset(physical_end, num,
+ map, &logic_end, NULL);
+ logic_end += base;
+ } else {
+ logic_end = logical + increment * nstripes;
+ }
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ scrub_blocked_if_needed(fs_info);
+
+ /* FIXME it might be better to start readahead at commit root */
+ key_start.objectid = logical;
+ key_start.type = BTRFS_EXTENT_ITEM_KEY;
+ key_start.offset = (u64)0;
+ key_end.objectid = logic_end;
+ key_end.type = BTRFS_METADATA_ITEM_KEY;
+ key_end.offset = (u64)-1;
+ reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+ key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_start.type = BTRFS_EXTENT_CSUM_KEY;
+ key_start.offset = logical;
+ key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_end.type = BTRFS_EXTENT_CSUM_KEY;
+ key_end.offset = logic_end;
+ reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+ if (!IS_ERR(reada1))
+ btrfs_reada_wait(reada1);
+ if (!IS_ERR(reada2))
+ btrfs_reada_wait(reada2);
+
+
+ /*
+ * collect all data csums for the stripe to avoid seeking during
+ * the scrub. This might currently (crc32) end up to be about 1MB
+ */
+ blk_start_plug(&plug);
+
+ /*
+ * now find all extents for each stripe and scrub them
+ */
+ ret = 0;
+ while (physical < physical_end) {
+ /* for raid56, we skip parity stripe */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = get_raid56_logic_offset(physical, num,
+ map, &logical, &stripe_logical);
+ logical += base;
+ if (ret) {
+ stripe_logical += base;
+ stripe_end = stripe_logical + increment - 1;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ ppath, stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
+ goto skip;
+ }
+ }
+ /*
+ * canceled?
+ */
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req)) {
+ ret = -ECANCELED;
+ goto out;
+ }
+ /*
+ * check to see if we have to pause
+ */
+ if (atomic_read(&fs_info->scrub_pause_req)) {
+ /* push queued extents */
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ scrub_blocked_if_needed(fs_info);
+ }
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = logical;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = btrfs_previous_extent_item(root, path, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ /* there's no smaller item, so stick with the
+ * larger one */
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &key,
+ path, 0, 0);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ stop_loop = 0;
+ while (1) {
+ u64 bytes;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ stop_loop = 1;
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ bytes = root->nodesize;
+ else
+ bytes = key.offset;
+
+ if (key.objectid + bytes <= logical)
+ goto next;
+
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
+ if (key.objectid >= logical + map->stripe_len) {
+ /* out of this device extent */
+ if (key.objectid >= logic_end)
+ stop_loop = 1;
+ break;
+ }
+
+ extent = btrfs_item_ptr(l, slot,
+ struct btrfs_extent_item);
+ flags = btrfs_extent_flags(l, extent);
+ generation = btrfs_extent_generation(l, extent);
+
+ if (key.objectid < logical &&
+ (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning "
+ "stripes, ignored. logical=%llu",
+ key.objectid, logical);
+ goto next;
+ }
+
+again:
+ extent_logical = key.objectid;
+ extent_len = bytes;
+
+ /*
+ * trim extent to this stripe
+ */
+ if (extent_logical < logical) {
+ extent_len -= logical - extent_logical;
+ extent_logical = logical;
+ }
+ if (extent_logical + extent_len >
+ logical + map->stripe_len) {
+ extent_len = logical + map->stripe_len -
+ extent_logical;
+ }
+
+ extent_physical = extent_logical - logical + physical;
+ extent_dev = scrub_dev;
+ extent_mirror_num = mirror_num;
+ if (is_dev_replace)
+ scrub_remap_extent(fs_info, extent_logical,
+ extent_len, &extent_physical,
+ &extent_dev,
+ &extent_mirror_num);
+
+ ret = btrfs_lookup_csums_range(csum_root, logical,
+ logical + map->stripe_len - 1,
+ &sctx->csum_list, 1);
+ if (ret)
+ goto out;
+
+ ret = scrub_extent(sctx, extent_logical, extent_len,
+ extent_physical, extent_dev, flags,
+ generation, extent_mirror_num,
+ extent_logical - logical + physical);
+ if (ret)
+ goto out;
+
+ scrub_free_csums(sctx);
+ if (extent_logical + extent_len <
+ key.objectid + bytes) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ /*
+ * loop until we find next data stripe
+ * or we have finished all stripes.
+ */
+loop:
+ physical += map->stripe_len;
+ ret = get_raid56_logic_offset(physical,
+ num, map, &logical,
+ &stripe_logical);
+ logical += base;
+
+ if (ret && physical < physical_end) {
+ stripe_logical += base;
+ stripe_end = stripe_logical +
+ increment - 1;
+ ret = scrub_raid56_parity(sctx,
+ map, scrub_dev, ppath,
+ stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
+ goto loop;
+ }
+ } else {
+ physical += map->stripe_len;
+ logical += increment;
+ }
+ if (logical < key.objectid + bytes) {
+ cond_resched();
+ goto again;
+ }
+
+ if (physical >= physical_end) {
+ stop_loop = 1;
+ break;
+ }
+ }
+next:
+ path->slots[0]++;
+ }
+ btrfs_release_path(path);
+skip:
+ logical += increment;
+ physical += map->stripe_len;
+ spin_lock(&sctx->stat_lock);
+ if (stop_loop)
+ sctx->stat.last_physical = map->stripes[num].physical +
+ length;
+ else
+ sctx->stat.last_physical = physical;
+ spin_unlock(&sctx->stat_lock);
+ if (stop_loop)
+ break;
+ }
+out:
+ /* push queued extents */
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+ blk_finish_plug(&plug);
+ btrfs_free_path(path);
+ btrfs_free_path(ppath);
+ return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 length,
+ u64 dev_offset, int is_dev_replace)
+{
+ struct btrfs_mapping_tree *map_tree =
+ &sctx->dev_root->fs_info->mapping_tree;
+ struct map_lookup *map;
+ struct extent_map *em;
+ int i;
+ int ret = 0;
+
+ read_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+ read_unlock(&map_tree->map_tree.lock);
+
+ if (!em)
+ return -EINVAL;
+
+ map = (struct map_lookup *)em->bdev;
+ if (em->start != chunk_offset)
+ goto out;
+
+ if (em->len < length)
+ goto out;
+
+ for (i = 0; i < map->num_stripes; ++i) {
+ if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
+ map->stripes[i].physical == dev_offset) {
+ ret = scrub_stripe(sctx, map, scrub_dev, i,
+ chunk_offset, length,
+ is_dev_replace);
+ if (ret)
+ goto out;
+ }
+ }
+out:
+ free_extent_map(em);
+
+ return ret;
+}
+
+static noinline_for_stack
+int scrub_enumerate_chunks(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev, u64 start, u64 end,
+ int is_dev_replace)
+{
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_path *path;
+ struct btrfs_root *root = sctx->dev_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 length;
+ u64 chunk_tree;
+ u64 chunk_objectid;
+ u64 chunk_offset;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 2;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ key.objectid = scrub_dev->devid;
+ key.offset = 0ull;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ }
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+
+ if (found_key.objectid != scrub_dev->devid)
+ break;
+
+ if (found_key.type != BTRFS_DEV_EXTENT_KEY)
+ break;
+
+ if (found_key.offset >= end)
+ break;
+
+ if (found_key.offset < key.offset)
+ break;
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ length = btrfs_dev_extent_length(l, dev_extent);
+
+ if (found_key.offset + length <= start)
+ goto skip;
+
+ chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+ chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+ chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+
+ /*
+ * get a reference on the corresponding block group to prevent
+ * the chunk from going away while we scrub it
+ */
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+
+ /* some chunks are removed but not committed to disk yet,
+ * continue scrubbing */
+ if (!cache)
+ goto skip;
+
+ dev_replace->cursor_right = found_key.offset + length;
+ dev_replace->cursor_left = found_key.offset;
+ dev_replace->item_needs_writeback = 1;
+ ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
+ chunk_offset, length, found_key.offset,
+ is_dev_replace);
+
+ /*
+ * flush, submit all pending read and write bios, afterwards
+ * wait for them.
+ * Note that in the dev replace case, a read request causes
+ * write requests that are submitted in the read completion
+ * worker. Therefore in the current situation, it is required
+ * that all write requests are flushed, so that all read and
+ * write requests are really completed when bios_in_flight
+ * changes to 0.
+ */
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ /*
+ * must be called before we decrease @scrub_paused.
+ * make sure we don't block transaction commit while
+ * we are waiting pending workers finished.
+ */
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->workers_pending) == 0);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+
+ mutex_lock(&fs_info->scrub_lock);
+ __scrub_blocked_if_needed(fs_info);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ btrfs_put_block_group(cache);
+ if (ret)
+ break;
+ if (is_dev_replace &&
+ atomic64_read(&dev_replace->num_write_errors) > 0) {
+ ret = -EIO;
+ break;
+ }
+ if (sctx->stat.malloc_errors > 0) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ dev_replace->cursor_left = dev_replace->cursor_right;
+ dev_replace->item_needs_writeback = 1;
+skip:
+ key.offset = found_key.offset + length;
+ btrfs_release_path(path);
+ }
+
+ btrfs_free_path(path);
+
+ /*
+ * ret can still be 1 from search_slot or next_leaf,
+ * that's not an error
+ */
+ return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev)
+{
+ int i;
+ u64 bytenr;
+ u64 gen;
+ int ret;
+ struct btrfs_root *root = sctx->dev_root;
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ return -EIO;
+
+ /* Seed devices of a new filesystem has their own generation. */
+ if (scrub_dev->fs_devices != root->fs_info->fs_devices)
+ gen = scrub_dev->generation;
+ else
+ gen = root->fs_info->last_trans_committed;
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >
+ scrub_dev->commit_total_bytes)
+ break;
+
+ ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+ NULL, 1, bytenr);
+ if (ret)
+ return ret;
+ }
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+
+ return 0;
+}
+
+/*
+ * get a reference count on fs_info->scrub_workers. start worker if necessary
+ */
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
+ int is_dev_replace)
+{
+ int ret = 0;
+ unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
+ int max_active = fs_info->thread_pool_size;
+
+ if (fs_info->scrub_workers_refcnt == 0) {
+ if (is_dev_replace)
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ 1, 4);
+ else
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ max_active, 4);
+ if (!fs_info->scrub_workers) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ fs_info->scrub_wr_completion_workers =
+ btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ max_active, 2);
+ if (!fs_info->scrub_wr_completion_workers) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ fs_info->scrub_nocow_workers =
+ btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ if (!fs_info->scrub_nocow_workers) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ ++fs_info->scrub_workers_refcnt;
+out:
+ return ret;
+}
+
+static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+ if (--fs_info->scrub_workers_refcnt == 0) {
+ btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+ }
+ WARN_ON(fs_info->scrub_workers_refcnt < 0);
+}
+
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+ u64 end, struct btrfs_scrub_progress *progress,
+ int readonly, int is_dev_replace)
+{
+ struct scrub_ctx *sctx;
+ int ret;
+ struct btrfs_device *dev;
+ struct rcu_string *name;
+
+ if (btrfs_fs_closing(fs_info))
+ return -EINVAL;
+
+ if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
+ /*
+ * in this case scrub is unable to calculate the checksum
+ * the way scrub is implemented. Do not handle this
+ * situation at all because it won't ever happen.
+ */
+ btrfs_err(fs_info,
+ "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
+ fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
+ return -EINVAL;
+ }
+
+ if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
+ /* not supported for data w/o checksums */
+ btrfs_err(fs_info,
+ "scrub: size assumption sectorsize != PAGE_SIZE "
+ "(%d != %lu) fails",
+ fs_info->chunk_root->sectorsize, PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ if (fs_info->chunk_root->nodesize >
+ PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
+ fs_info->chunk_root->sectorsize >
+ PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+ /*
+ * would exhaust the array bounds of pagev member in
+ * struct scrub_block
+ */
+ btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
+ "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
+ fs_info->chunk_root->nodesize,
+ SCRUB_MAX_PAGES_PER_BLOCK,
+ fs_info->chunk_root->sectorsize,
+ SCRUB_MAX_PAGES_PER_BLOCK);
+ return -EINVAL;
+ }
+
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ if (!dev || (dev->missing && !is_dev_replace)) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return -ENODEV;
+ }
+
+ if (!is_dev_replace && !readonly && !dev->writeable) {
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ btrfs_err(fs_info, "scrub: device %s is not writable",
+ name->str);
+ rcu_read_unlock();
+ return -EROFS;
+ }
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return -EIO;
+ }
+
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ if (dev->scrub_device ||
+ (!is_dev_replace &&
+ btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return -EINPROGRESS;
+ }
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return ret;
+ }
+
+ sctx = scrub_setup_ctx(dev, is_dev_replace);
+ if (IS_ERR(sctx)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ scrub_workers_put(fs_info);
+ return PTR_ERR(sctx);
+ }
+ sctx->readonly = readonly;
+ dev->scrub_device = sctx;
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ /*
+ * checking @scrub_pause_req here, we can avoid
+ * race between committing transaction and scrubbing.
+ */
+ __scrub_blocked_if_needed(fs_info);
+ atomic_inc(&fs_info->scrubs_running);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ if (!is_dev_replace) {
+ /*
+ * by holding device list mutex, we can
+ * kick off writing super in log tree sync.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ ret = scrub_supers(sctx, dev);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ }
+
+ if (!ret)
+ ret = scrub_enumerate_chunks(sctx, dev, start, end,
+ is_dev_replace);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+ atomic_dec(&fs_info->scrubs_running);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
+
+ if (progress)
+ memcpy(progress, &sctx->stat, sizeof(*progress));
+
+ mutex_lock(&fs_info->scrub_lock);
+ dev->scrub_device = NULL;
+ scrub_workers_put(fs_info);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ scrub_put_ctx(sctx);
+
+ return ret;
+}
+
+void btrfs_scrub_pause(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_inc(&fs_info->scrub_pause_req);
+ while (atomic_read(&fs_info->scrubs_paused) !=
+ atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrubs_paused) ==
+ atomic_read(&fs_info->scrubs_running));
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ mutex_unlock(&fs_info->scrub_lock);
+}
+
+void btrfs_scrub_continue(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ atomic_dec(&fs_info->scrub_pause_req);
+ wake_up(&fs_info->scrub_pause_wait);
+}
+
+int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->scrub_lock);
+ if (!atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->scrub_cancel_req);
+ while (atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrubs_running) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ atomic_dec(&fs_info->scrub_cancel_req);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev)
+{
+ struct scrub_ctx *sctx;
+
+ mutex_lock(&fs_info->scrub_lock);
+ sctx = dev->scrub_device;
+ if (!sctx) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -ENOTCONN;
+ }
+ atomic_inc(&sctx->cancel_req);
+ while (dev->scrub_device) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ dev->scrub_device == NULL);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+ struct btrfs_scrub_progress *progress)
+{
+ struct btrfs_device *dev;
+ struct scrub_ctx *sctx = NULL;
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
+ if (dev)
+ sctx = dev->scrub_device;
+ if (sctx)
+ memcpy(progress, &sctx->stat, sizeof(*progress));
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
+}
+
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u64 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num)
+{
+ u64 mapped_length;
+ struct btrfs_bio *bbio = NULL;
+ int ret;
+
+ mapped_length = extent_len;
+ ret = btrfs_map_block(fs_info, READ, extent_logical,
+ &mapped_length, &bbio, 0);
+ if (ret || !bbio || mapped_length < extent_len ||
+ !bbio->stripes[0].dev->bdev) {
+ btrfs_put_bbio(bbio);
+ return;
+ }
+
+ *extent_physical = bbio->stripes[0].physical;
+ *extent_mirror_num = bbio->mirror_num;
+ *extent_dev = bbio->stripes[0].dev;
+ btrfs_put_bbio(bbio);
+}
+
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+ struct scrub_wr_ctx *wr_ctx,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *dev,
+ int is_dev_replace)
+{
+ WARN_ON(wr_ctx->wr_curr_bio != NULL);
+
+ mutex_init(&wr_ctx->wr_lock);
+ wr_ctx->wr_curr_bio = NULL;
+ if (!is_dev_replace)
+ return 0;
+
+ WARN_ON(!dev->bdev);
+ wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
+ bio_get_nr_vecs(dev->bdev));
+ wr_ctx->tgtdev = dev;
+ atomic_set(&wr_ctx->flush_all_writes, 0);
+ return 0;
+}
+
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
+{
+ mutex_lock(&wr_ctx->wr_lock);
+ kfree(wr_ctx->wr_curr_bio);
+ wr_ctx->wr_curr_bio = NULL;
+ mutex_unlock(&wr_ctx->wr_lock);
+}
+
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+ int mirror_num, u64 physical_for_dev_replace)
+{
+ struct scrub_copy_nocow_ctx *nocow_ctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+ nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
+ if (!nocow_ctx) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
+ scrub_pending_trans_workers_inc(sctx);
+
+ nocow_ctx->sctx = sctx;
+ nocow_ctx->logical = logical;
+ nocow_ctx->len = len;
+ nocow_ctx->mirror_num = mirror_num;
+ nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
+ btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
+ copy_nocow_pages_worker, NULL, NULL);
+ INIT_LIST_HEAD(&nocow_ctx->inodes);
+ btrfs_queue_work(fs_info->scrub_nocow_workers,
+ &nocow_ctx->work);
+
+ return 0;
+}
+
+static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+ struct scrub_nocow_inode *nocow_inode;
+
+ nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
+ if (!nocow_inode)
+ return -ENOMEM;
+ nocow_inode->inum = inum;
+ nocow_inode->offset = offset;
+ nocow_inode->root = root;
+ list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
+ return 0;
+}
+
+#define COPY_COMPLETE 1
+
+static void copy_nocow_pages_worker(struct btrfs_work *work)
+{
+ struct scrub_copy_nocow_ctx *nocow_ctx =
+ container_of(work, struct scrub_copy_nocow_ctx, work);
+ struct scrub_ctx *sctx = nocow_ctx->sctx;
+ u64 logical = nocow_ctx->logical;
+ u64 len = nocow_ctx->len;
+ int mirror_num = nocow_ctx->mirror_num;
+ u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+ int ret;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ int not_written = 0;
+
+ fs_info = sctx->dev_root->fs_info;
+ root = fs_info->extent_root;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ not_written = 1;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ not_written = 1;
+ goto out;
+ }
+
+ ret = iterate_inodes_from_logical(logical, fs_info, path,
+ record_inode_for_nocow, nocow_ctx);
+ if (ret != 0 && ret != -ENOENT) {
+ btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
+ "phys %llu, len %llu, mir %u, ret %d",
+ logical, physical_for_dev_replace, len, mirror_num,
+ ret);
+ not_written = 1;
+ goto out;
+ }
+
+ btrfs_end_transaction(trans, root);
+ trans = NULL;
+ while (!list_empty(&nocow_ctx->inodes)) {
+ struct scrub_nocow_inode *entry;
+ entry = list_first_entry(&nocow_ctx->inodes,
+ struct scrub_nocow_inode,
+ list);
+ list_del_init(&entry->list);
+ ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
+ entry->root, nocow_ctx);
+ kfree(entry);
+ if (ret == COPY_COMPLETE) {
+ ret = 0;
+ break;
+ } else if (ret) {
+ break;
+ }
+ }
+out:
+ while (!list_empty(&nocow_ctx->inodes)) {
+ struct scrub_nocow_inode *entry;
+ entry = list_first_entry(&nocow_ctx->inodes,
+ struct scrub_nocow_inode,
+ list);
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans, root);
+ if (not_written)
+ btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
+ num_uncorrectable_read_errors);
+
+ btrfs_free_path(path);
+ kfree(nocow_ctx);
+
+ scrub_pending_trans_workers_dec(sctx);
+}
+
+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+ u64 logical)
+{
+ struct extent_state *cached_state = NULL;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_io_tree *io_tree;
+ struct extent_map *em;
+ u64 lockstart = start, lockend = start + len - 1;
+ int ret = 0;
+
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+ lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = 1;
+ goto out_unlock;
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock;
+ }
+
+ /*
+ * This extent does not actually cover the logical extent anymore,
+ * move on to the next inode.
+ */
+ if (em->block_start > logical ||
+ em->block_start + em->block_len < logical + len) {
+ free_extent_map(em);
+ ret = 1;
+ goto out_unlock;
+ }
+ free_extent_map(em);
+
+out_unlock:
+ unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+ GFP_NOFS);
+ return ret;
+}
+
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+ struct scrub_copy_nocow_ctx *nocow_ctx)
+{
+ struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+ struct btrfs_key key;
+ struct inode *inode;
+ struct page *page;
+ struct btrfs_root *local_root;
+ struct extent_io_tree *io_tree;
+ u64 physical_for_dev_replace;
+ u64 nocow_ctx_logical;
+ u64 len = nocow_ctx->len;
+ unsigned long index;
+ int srcu_index;
+ int ret = 0;
+ int err = 0;
+
+ key.objectid = root;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(local_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ return PTR_ERR(local_root);
+ }
+
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.objectid = inum;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+ srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ /* Avoid truncate/dio/punch hole.. */
+ mutex_lock(&inode->i_mutex);
+ inode_dio_wait(inode);
+
+ physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+ io_tree = &BTRFS_I(inode)->io_tree;
+ nocow_ctx_logical = nocow_ctx->logical;
+
+ ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto out;
+ }
+
+ while (len >= PAGE_CACHE_SIZE) {
+ index = offset >> PAGE_CACHE_SHIFT;
+again:
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ btrfs_err(fs_info, "find_or_create_page() failed");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (PageUptodate(page)) {
+ if (PageDirty(page))
+ goto next_page;
+ } else {
+ ClearPageError(page);
+ err = extent_read_full_page(io_tree, page,
+ btrfs_get_extent,
+ nocow_ctx->mirror_num);
+ if (err) {
+ ret = err;
+ goto next_page;
+ }
+
+ lock_page(page);
+ /*
+ * If the page has been remove from the page cache,
+ * the data on it is meaningless, because it may be
+ * old one, the new data may be written into the new
+ * page in the page cache.
+ */
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto next_page;
+ }
+ }
+
+ ret = check_extent_to_block(inode, offset, len,
+ nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto next_page;
+ }
+
+ err = write_page_nocow(nocow_ctx->sctx,
+ physical_for_dev_replace, page);
+ if (err)
+ ret = err;
+next_page:
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (ret)
+ break;
+
+ offset += PAGE_CACHE_SIZE;
+ physical_for_dev_replace += PAGE_CACHE_SIZE;
+ nocow_ctx_logical += PAGE_CACHE_SIZE;
+ len -= PAGE_CACHE_SIZE;
+ }
+ ret = COPY_COMPLETE;
+out:
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
+ return ret;
+}
+
+static int write_page_nocow(struct scrub_ctx *sctx,
+ u64 physical_for_dev_replace, struct page *page)
+{
+ struct bio *bio;
+ struct btrfs_device *dev;
+ int ret;
+
+ dev = sctx->wr_ctx.tgtdev;
+ if (!dev)
+ return -EIO;
+ if (!dev->bdev) {
+ printk_ratelimited(KERN_WARNING
+ "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+ return -EIO;
+ }
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+ if (!bio) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
+ bio->bi_bdev = dev->bdev;
+ ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ if (ret != PAGE_CACHE_SIZE) {
+leave_with_eio:
+ bio_put(bio);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ return -EIO;
+ }
+
+ if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
+ goto leave_with_eio;
+
+ bio_put(bio);
+ return 0;
+}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
new file mode 100644
index 000000000..5cf7838fb
--- /dev/null
+++ b/fs/btrfs/send.c
@@ -0,0 +1,5978 @@
+/*
+ * Copyright (C) 2012 Alexander Block. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/bsearch.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/sort.h>
+#include <linux/mount.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/radix-tree.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+
+#include "send.h"
+#include "backref.h"
+#include "hash.h"
+#include "locking.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+
+static int g_verbose = 0;
+
+#define verbose_printk(...) if (g_verbose) printk(__VA_ARGS__)
+
+/*
+ * A fs_path is a helper to dynamically build path names with unknown size.
+ * It reallocates the internal buffer on demand.
+ * It allows fast adding of path elements on the right side (normal path) and
+ * fast adding to the left side (reversed path). A reversed path can also be
+ * unreversed if needed.
+ */
+struct fs_path {
+ union {
+ struct {
+ char *start;
+ char *end;
+
+ char *buf;
+ unsigned short buf_len:15;
+ unsigned short reversed:1;
+ char inline_buf[];
+ };
+ /*
+ * Average path length does not exceed 200 bytes, we'll have
+ * better packing in the slab and higher chance to satisfy
+ * a allocation later during send.
+ */
+ char pad[256];
+ };
+};
+#define FS_PATH_INLINE_SIZE \
+ (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
+
+
+/* reused for each extent */
+struct clone_root {
+ struct btrfs_root *root;
+ u64 ino;
+ u64 offset;
+
+ u64 found_refs;
+};
+
+#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
+#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
+
+struct send_ctx {
+ struct file *send_filp;
+ loff_t send_off;
+ char *send_buf;
+ u32 send_size;
+ u32 send_max_size;
+ u64 total_send_size;
+ u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+ u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
+
+ struct btrfs_root *send_root;
+ struct btrfs_root *parent_root;
+ struct clone_root *clone_roots;
+ int clone_roots_cnt;
+
+ /* current state of the compare_tree call */
+ struct btrfs_path *left_path;
+ struct btrfs_path *right_path;
+ struct btrfs_key *cmp_key;
+
+ /*
+ * infos of the currently processed inode. In case of deleted inodes,
+ * these are the values from the deleted inode.
+ */
+ u64 cur_ino;
+ u64 cur_inode_gen;
+ int cur_inode_new;
+ int cur_inode_new_gen;
+ int cur_inode_deleted;
+ u64 cur_inode_size;
+ u64 cur_inode_mode;
+ u64 cur_inode_rdev;
+ u64 cur_inode_last_extent;
+
+ u64 send_progress;
+
+ struct list_head new_refs;
+ struct list_head deleted_refs;
+
+ struct radix_tree_root name_cache;
+ struct list_head name_cache_list;
+ int name_cache_size;
+
+ struct file_ra_state ra;
+
+ char *read_buf;
+
+ /*
+ * We process inodes by their increasing order, so if before an
+ * incremental send we reverse the parent/child relationship of
+ * directories such that a directory with a lower inode number was
+ * the parent of a directory with a higher inode number, and the one
+ * becoming the new parent got renamed too, we can't rename/move the
+ * directory with lower inode number when we finish processing it - we
+ * must process the directory with higher inode number first, then
+ * rename/move it and then rename/move the directory with lower inode
+ * number. Example follows.
+ *
+ * Tree state when the first send was performed:
+ *
+ * .
+ * |-- a (ino 257)
+ * |-- b (ino 258)
+ * |
+ * |
+ * |-- c (ino 259)
+ * | |-- d (ino 260)
+ * |
+ * |-- c2 (ino 261)
+ *
+ * Tree state when the second (incremental) send is performed:
+ *
+ * .
+ * |-- a (ino 257)
+ * |-- b (ino 258)
+ * |-- c2 (ino 261)
+ * |-- d2 (ino 260)
+ * |-- cc (ino 259)
+ *
+ * The sequence of steps that lead to the second state was:
+ *
+ * mv /a/b/c/d /a/b/c2/d2
+ * mv /a/b/c /a/b/c2/d2/cc
+ *
+ * "c" has lower inode number, but we can't move it (2nd mv operation)
+ * before we move "d", which has higher inode number.
+ *
+ * So we just memorize which move/rename operations must be performed
+ * later when their respective parent is processed and moved/renamed.
+ */
+
+ /* Indexed by parent directory inode number. */
+ struct rb_root pending_dir_moves;
+
+ /*
+ * Reverse index, indexed by the inode number of a directory that
+ * is waiting for the move/rename of its immediate parent before its
+ * own move/rename can be performed.
+ */
+ struct rb_root waiting_dir_moves;
+
+ /*
+ * A directory that is going to be rm'ed might have a child directory
+ * which is in the pending directory moves index above. In this case,
+ * the directory can only be removed after the move/rename of its child
+ * is performed. Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- c/ (ino 259)
+ * | |-- x/ (ino 260)
+ * |
+ * |-- y/ (ino 261)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- YY/ (ino 261)
+ * |-- x/ (ino 260)
+ *
+ * Sequence of steps that lead to the send snapshot:
+ * rm -f /a/b/c/foo.txt
+ * mv /a/b/y /a/b/YY
+ * mv /a/b/c/x /a/b/YY
+ * rmdir /a/b/c
+ *
+ * When the child is processed, its move/rename is delayed until its
+ * parent is processed (as explained above), but all other operations
+ * like update utimes, chown, chgrp, etc, are performed and the paths
+ * that it uses for those operations must use the orphanized name of
+ * its parent (the directory we're going to rm later), so we need to
+ * memorize that name.
+ *
+ * Indexed by the inode number of the directory to be deleted.
+ */
+ struct rb_root orphan_dirs;
+};
+
+struct pending_dir_move {
+ struct rb_node node;
+ struct list_head list;
+ u64 parent_ino;
+ u64 ino;
+ u64 gen;
+ bool is_orphan;
+ struct list_head update_refs;
+};
+
+struct waiting_dir_move {
+ struct rb_node node;
+ u64 ino;
+ /*
+ * There might be some directory that could not be removed because it
+ * was waiting for this directory inode to be moved first. Therefore
+ * after this directory is moved, we can try to rmdir the ino rmdir_ino.
+ */
+ u64 rmdir_ino;
+};
+
+struct orphan_dir_info {
+ struct rb_node node;
+ u64 ino;
+ u64 gen;
+};
+
+struct name_cache_entry {
+ struct list_head list;
+ /*
+ * radix_tree has only 32bit entries but we need to handle 64bit inums.
+ * We use the lower 32bit of the 64bit inum to store it in the tree. If
+ * more then one inum would fall into the same entry, we use radix_list
+ * to store the additional entries. radix_list is also used to store
+ * entries where two entries have the same inum but different
+ * generations.
+ */
+ struct list_head radix_list;
+ u64 ino;
+ u64 gen;
+ u64 parent_ino;
+ u64 parent_gen;
+ int ret;
+ int need_later_update;
+ int name_len;
+ char name[];
+};
+
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
+static int need_send_hole(struct send_ctx *sctx)
+{
+ return (sctx->parent_root && !sctx->cur_inode_new &&
+ !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
+ S_ISREG(sctx->cur_inode_mode));
+}
+
+static void fs_path_reset(struct fs_path *p)
+{
+ if (p->reversed) {
+ p->start = p->buf + p->buf_len - 1;
+ p->end = p->start;
+ *p->start = 0;
+ } else {
+ p->start = p->buf;
+ p->end = p->start;
+ *p->start = 0;
+ }
+}
+
+static struct fs_path *fs_path_alloc(void)
+{
+ struct fs_path *p;
+
+ p = kmalloc(sizeof(*p), GFP_NOFS);
+ if (!p)
+ return NULL;
+ p->reversed = 0;
+ p->buf = p->inline_buf;
+ p->buf_len = FS_PATH_INLINE_SIZE;
+ fs_path_reset(p);
+ return p;
+}
+
+static struct fs_path *fs_path_alloc_reversed(void)
+{
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return NULL;
+ p->reversed = 1;
+ fs_path_reset(p);
+ return p;
+}
+
+static void fs_path_free(struct fs_path *p)
+{
+ if (!p)
+ return;
+ if (p->buf != p->inline_buf)
+ kfree(p->buf);
+ kfree(p);
+}
+
+static int fs_path_len(struct fs_path *p)
+{
+ return p->end - p->start;
+}
+
+static int fs_path_ensure_buf(struct fs_path *p, int len)
+{
+ char *tmp_buf;
+ int path_len;
+ int old_buf_len;
+
+ len++;
+
+ if (p->buf_len >= len)
+ return 0;
+
+ if (len > PATH_MAX) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
+ path_len = p->end - p->start;
+ old_buf_len = p->buf_len;
+
+ /*
+ * First time the inline_buf does not suffice
+ */
+ if (p->buf == p->inline_buf) {
+ tmp_buf = kmalloc(len, GFP_NOFS);
+ if (tmp_buf)
+ memcpy(tmp_buf, p->buf, old_buf_len);
+ } else {
+ tmp_buf = krealloc(p->buf, len, GFP_NOFS);
+ }
+ if (!tmp_buf)
+ return -ENOMEM;
+ p->buf = tmp_buf;
+ /*
+ * The real size of the buffer is bigger, this will let the fast path
+ * happen most of the time
+ */
+ p->buf_len = ksize(p->buf);
+
+ if (p->reversed) {
+ tmp_buf = p->buf + old_buf_len - path_len - 1;
+ p->end = p->buf + p->buf_len - 1;
+ p->start = p->end - path_len;
+ memmove(p->start, tmp_buf, path_len + 1);
+ } else {
+ p->start = p->buf;
+ p->end = p->start + path_len;
+ }
+ return 0;
+}
+
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+ char **prepared)
+{
+ int ret;
+ int new_len;
+
+ new_len = p->end - p->start + name_len;
+ if (p->start != p->end)
+ new_len++;
+ ret = fs_path_ensure_buf(p, new_len);
+ if (ret < 0)
+ goto out;
+
+ if (p->reversed) {
+ if (p->start != p->end)
+ *--p->start = '/';
+ p->start -= name_len;
+ *prepared = p->start;
+ } else {
+ if (p->start != p->end)
+ *p->end++ = '/';
+ *prepared = p->end;
+ p->end += name_len;
+ *p->end = 0;
+ }
+
+out:
+ return ret;
+}
+
+static int fs_path_add(struct fs_path *p, const char *name, int name_len)
+{
+ int ret;
+ char *prepared;
+
+ ret = fs_path_prepare_for_add(p, name_len, &prepared);
+ if (ret < 0)
+ goto out;
+ memcpy(prepared, name, name_len);
+
+out:
+ return ret;
+}
+
+static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
+{
+ int ret;
+ char *prepared;
+
+ ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
+ if (ret < 0)
+ goto out;
+ memcpy(prepared, p2->start, p2->end - p2->start);
+
+out:
+ return ret;
+}
+
+static int fs_path_add_from_extent_buffer(struct fs_path *p,
+ struct extent_buffer *eb,
+ unsigned long off, int len)
+{
+ int ret;
+ char *prepared;
+
+ ret = fs_path_prepare_for_add(p, len, &prepared);
+ if (ret < 0)
+ goto out;
+
+ read_extent_buffer(eb, prepared, off, len);
+
+out:
+ return ret;
+}
+
+static int fs_path_copy(struct fs_path *p, struct fs_path *from)
+{
+ int ret;
+
+ p->reversed = from->reversed;
+ fs_path_reset(p);
+
+ ret = fs_path_add_path(p, from);
+
+ return ret;
+}
+
+
+static void fs_path_unreverse(struct fs_path *p)
+{
+ char *tmp;
+ int len;
+
+ if (!p->reversed)
+ return;
+
+ tmp = p->start;
+ len = p->end - p->start;
+ p->start = p->buf;
+ p->end = p->start + len;
+ memmove(p->start, tmp, len + 1);
+ p->reversed = 0;
+}
+
+static struct btrfs_path *alloc_path_for_send(void)
+{
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return NULL;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ path->need_commit_sem = 1;
+ return path;
+}
+
+static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
+{
+ int ret;
+ mm_segment_t old_fs;
+ u32 pos = 0;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ while (pos < len) {
+ ret = vfs_write(filp, (__force const char __user *)buf + pos,
+ len - pos, off);
+ /* TODO handle that correctly */
+ /*if (ret == -ERESTARTSYS) {
+ continue;
+ }*/
+ if (ret < 0)
+ goto out;
+ if (ret == 0) {
+ ret = -EIO;
+ goto out;
+ }
+ pos += ret;
+ }
+
+ ret = 0;
+
+out:
+ set_fs(old_fs);
+ return ret;
+}
+
+static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
+{
+ struct btrfs_tlv_header *hdr;
+ int total_len = sizeof(*hdr) + len;
+ int left = sctx->send_max_size - sctx->send_size;
+
+ if (unlikely(left < total_len))
+ return -EOVERFLOW;
+
+ hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
+ hdr->tlv_type = cpu_to_le16(attr);
+ hdr->tlv_len = cpu_to_le16(len);
+ memcpy(hdr + 1, data, len);
+ sctx->send_size += total_len;
+
+ return 0;
+}
+
+#define TLV_PUT_DEFINE_INT(bits) \
+ static int tlv_put_u##bits(struct send_ctx *sctx, \
+ u##bits attr, u##bits value) \
+ { \
+ __le##bits __tmp = cpu_to_le##bits(value); \
+ return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
+ }
+
+TLV_PUT_DEFINE_INT(64)
+
+static int tlv_put_string(struct send_ctx *sctx, u16 attr,
+ const char *str, int len)
+{
+ if (len == -1)
+ len = strlen(str);
+ return tlv_put(sctx, attr, str, len);
+}
+
+static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
+ const u8 *uuid)
+{
+ return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
+}
+
+static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
+ struct extent_buffer *eb,
+ struct btrfs_timespec *ts)
+{
+ struct btrfs_timespec bts;
+ read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
+ return tlv_put(sctx, attr, &bts, sizeof(bts));
+}
+
+
+#define TLV_PUT(sctx, attrtype, attrlen, data) \
+ do { \
+ ret = tlv_put(sctx, attrtype, attrlen, data); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+
+#define TLV_PUT_INT(sctx, attrtype, bits, value) \
+ do { \
+ ret = tlv_put_u##bits(sctx, attrtype, value); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+
+#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
+#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
+#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
+#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
+#define TLV_PUT_STRING(sctx, attrtype, str, len) \
+ do { \
+ ret = tlv_put_string(sctx, attrtype, str, len); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+#define TLV_PUT_PATH(sctx, attrtype, p) \
+ do { \
+ ret = tlv_put_string(sctx, attrtype, p->start, \
+ p->end - p->start); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while(0)
+#define TLV_PUT_UUID(sctx, attrtype, uuid) \
+ do { \
+ ret = tlv_put_uuid(sctx, attrtype, uuid); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
+ do { \
+ ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
+ if (ret < 0) \
+ goto tlv_put_failure; \
+ } while (0)
+
+static int send_header(struct send_ctx *sctx)
+{
+ struct btrfs_stream_header hdr;
+
+ strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
+ hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
+
+ return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
+ &sctx->send_off);
+}
+
+/*
+ * For each command/item we want to send to userspace, we call this function.
+ */
+static int begin_cmd(struct send_ctx *sctx, int cmd)
+{
+ struct btrfs_cmd_header *hdr;
+
+ if (WARN_ON(!sctx->send_buf))
+ return -EINVAL;
+
+ BUG_ON(sctx->send_size);
+
+ sctx->send_size += sizeof(*hdr);
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ hdr->cmd = cpu_to_le16(cmd);
+
+ return 0;
+}
+
+static int send_cmd(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_cmd_header *hdr;
+ u32 crc;
+
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
+ hdr->crc = 0;
+
+ crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ hdr->crc = cpu_to_le32(crc);
+
+ ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+ &sctx->send_off);
+
+ sctx->total_send_size += sctx->send_size;
+ sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+ sctx->send_size = 0;
+
+ return ret;
+}
+
+/*
+ * Sends a move instruction to user space
+ */
+static int send_rename(struct send_ctx *sctx,
+ struct fs_path *from, struct fs_path *to)
+{
+ int ret;
+
+verbose_printk("btrfs: send_rename %s -> %s\n", from->start, to->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Sends a link instruction to user space
+ */
+static int send_link(struct send_ctx *sctx,
+ struct fs_path *path, struct fs_path *lnk)
+{
+ int ret;
+
+verbose_printk("btrfs: send_link %s -> %s\n", path->start, lnk->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Sends an unlink instruction to user space
+ */
+static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
+{
+ int ret;
+
+verbose_printk("btrfs: send_unlink %s\n", path->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Sends a rmdir instruction to user space
+ */
+static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
+{
+ int ret;
+
+verbose_printk("btrfs: send_rmdir %s\n", path->start);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+/*
+ * Helper function to retrieve some fields from an inode item.
+ */
+static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
+ u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
+ u64 *gid, u64 *rdev)
+{
+ int ret;
+ struct btrfs_inode_item *ii;
+ struct btrfs_key key;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
+ }
+
+ ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ if (size)
+ *size = btrfs_inode_size(path->nodes[0], ii);
+ if (gen)
+ *gen = btrfs_inode_generation(path->nodes[0], ii);
+ if (mode)
+ *mode = btrfs_inode_mode(path->nodes[0], ii);
+ if (uid)
+ *uid = btrfs_inode_uid(path->nodes[0], ii);
+ if (gid)
+ *gid = btrfs_inode_gid(path->nodes[0], ii);
+ if (rdev)
+ *rdev = btrfs_inode_rdev(path->nodes[0], ii);
+
+ return ret;
+}
+
+static int get_inode_info(struct btrfs_root *root,
+ u64 ino, u64 *size, u64 *gen,
+ u64 *mode, u64 *uid, u64 *gid,
+ u64 *rdev)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+ ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
+ rdev);
+ btrfs_free_path(path);
+ return ret;
+}
+
+typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
+ struct fs_path *p,
+ void *ctx);
+
+/*
+ * Helper function to iterate the entries in ONE btrfs_inode_ref or
+ * btrfs_inode_extref.
+ * The iterate callback may return a non zero value to stop iteration. This can
+ * be a negative value for error codes or 1 to simply stop it.
+ *
+ * path must point to the INODE_REF or INODE_EXTREF when called.
+ */
+static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *found_key, int resolve,
+ iterate_inode_ref_t iterate, void *ctx)
+{
+ struct extent_buffer *eb = path->nodes[0];
+ struct btrfs_item *item;
+ struct btrfs_inode_ref *iref;
+ struct btrfs_inode_extref *extref;
+ struct btrfs_path *tmp_path;
+ struct fs_path *p;
+ u32 cur = 0;
+ u32 total;
+ int slot = path->slots[0];
+ u32 name_len;
+ char *start;
+ int ret = 0;
+ int num = 0;
+ int index;
+ u64 dir;
+ unsigned long name_off;
+ unsigned long elem_size;
+ unsigned long ptr;
+
+ p = fs_path_alloc_reversed();
+ if (!p)
+ return -ENOMEM;
+
+ tmp_path = alloc_path_for_send();
+ if (!tmp_path) {
+ fs_path_free(p);
+ return -ENOMEM;
+ }
+
+
+ if (found_key->type == BTRFS_INODE_REF_KEY) {
+ ptr = (unsigned long)btrfs_item_ptr(eb, slot,
+ struct btrfs_inode_ref);
+ item = btrfs_item_nr(slot);
+ total = btrfs_item_size(eb, item);
+ elem_size = sizeof(*iref);
+ } else {
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ total = btrfs_item_size_nr(eb, slot);
+ elem_size = sizeof(*extref);
+ }
+
+ while (cur < total) {
+ fs_path_reset(p);
+
+ if (found_key->type == BTRFS_INODE_REF_KEY) {
+ iref = (struct btrfs_inode_ref *)(ptr + cur);
+ name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_off = (unsigned long)(iref + 1);
+ index = btrfs_inode_ref_index(eb, iref);
+ dir = found_key->offset;
+ } else {
+ extref = (struct btrfs_inode_extref *)(ptr + cur);
+ name_len = btrfs_inode_extref_name_len(eb, extref);
+ name_off = (unsigned long)&extref->name;
+ index = btrfs_inode_extref_index(eb, extref);
+ dir = btrfs_inode_extref_parent(eb, extref);
+ }
+
+ if (resolve) {
+ start = btrfs_ref_to_path(root, tmp_path, name_len,
+ name_off, eb, dir,
+ p->buf, p->buf_len);
+ if (IS_ERR(start)) {
+ ret = PTR_ERR(start);
+ goto out;
+ }
+ if (start < p->buf) {
+ /* overflow , try again with larger buffer */
+ ret = fs_path_ensure_buf(p,
+ p->buf_len + p->buf - start);
+ if (ret < 0)
+ goto out;
+ start = btrfs_ref_to_path(root, tmp_path,
+ name_len, name_off,
+ eb, dir,
+ p->buf, p->buf_len);
+ if (IS_ERR(start)) {
+ ret = PTR_ERR(start);
+ goto out;
+ }
+ BUG_ON(start < p->buf);
+ }
+ p->start = start;
+ } else {
+ ret = fs_path_add_from_extent_buffer(p, eb, name_off,
+ name_len);
+ if (ret < 0)
+ goto out;
+ }
+
+ cur += elem_size + name_len;
+ ret = iterate(num, dir, index, p, ctx);
+ if (ret)
+ goto out;
+ num++;
+ }
+
+out:
+ btrfs_free_path(tmp_path);
+ fs_path_free(p);
+ return ret;
+}
+
+typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx);
+
+/*
+ * Helper function to iterate the entries in ONE btrfs_dir_item.
+ * The iterate callback may return a non zero value to stop iteration. This can
+ * be a negative value for error codes or 1 to simply stop it.
+ *
+ * path must point to the dir item when called.
+ */
+static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *found_key,
+ iterate_dir_item_t iterate, void *ctx)
+{
+ int ret = 0;
+ struct extent_buffer *eb;
+ struct btrfs_item *item;
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ char *buf = NULL;
+ int buf_len;
+ u32 name_len;
+ u32 data_len;
+ u32 cur;
+ u32 len;
+ u32 total;
+ int slot;
+ int num;
+ u8 type;
+
+ /*
+ * Start with a small buffer (1 page). If later we end up needing more
+ * space, which can happen for xattrs on a fs with a leaf size greater
+ * then the page size, attempt to increase the buffer. Typically xattr
+ * values are small.
+ */
+ buf_len = PATH_MAX;
+ buf = kmalloc(buf_len, GFP_NOFS);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item = btrfs_item_nr(slot);
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ cur = 0;
+ len = 0;
+ total = btrfs_item_size(eb, item);
+
+ num = 0;
+ while (cur < total) {
+ name_len = btrfs_dir_name_len(eb, di);
+ data_len = btrfs_dir_data_len(eb, di);
+ type = btrfs_dir_type(eb, di);
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+ if (type == BTRFS_FT_XATTR) {
+ if (name_len > XATTR_NAME_MAX) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
+ ret = -E2BIG;
+ goto out;
+ }
+ } else {
+ /*
+ * Path too long
+ */
+ if (name_len + data_len > PATH_MAX) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+ }
+
+ if (name_len + data_len > buf_len) {
+ buf_len = name_len + data_len;
+ if (is_vmalloc_addr(buf)) {
+ vfree(buf);
+ buf = NULL;
+ } else {
+ char *tmp = krealloc(buf, buf_len,
+ GFP_NOFS | __GFP_NOWARN);
+
+ if (!tmp)
+ kfree(buf);
+ buf = tmp;
+ }
+ if (!buf) {
+ buf = vmalloc(buf_len);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
+ read_extent_buffer(eb, buf, (unsigned long)(di + 1),
+ name_len + data_len);
+
+ len = sizeof(*di) + name_len + data_len;
+ di = (struct btrfs_dir_item *)((char *)di + len);
+ cur += len;
+
+ ret = iterate(num, &di_key, buf, name_len, buf + name_len,
+ data_len, type, ctx);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ num++;
+ }
+
+out:
+ kvfree(buf);
+ return ret;
+}
+
+static int __copy_first_ref(int num, u64 dir, int index,
+ struct fs_path *p, void *ctx)
+{
+ int ret;
+ struct fs_path *pt = ctx;
+
+ ret = fs_path_copy(pt, p);
+ if (ret < 0)
+ return ret;
+
+ /* we want the first only */
+ return 1;
+}
+
+/*
+ * Retrieve the first path of an inode. If an inode has more then one
+ * ref/hardlink, this is ignored.
+ */
+static int get_inode_path(struct btrfs_root *root,
+ u64 ino, struct fs_path *path)
+{
+ int ret;
+ struct btrfs_key key, found_key;
+ struct btrfs_path *p;
+
+ p = alloc_path_for_send();
+ if (!p)
+ return -ENOMEM;
+
+ fs_path_reset(path);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 1;
+ goto out;
+ }
+ btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
+ if (found_key.objectid != ino ||
+ (found_key.type != BTRFS_INODE_REF_KEY &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = iterate_inode_ref(root, p, &found_key, 1,
+ __copy_first_ref, path);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ btrfs_free_path(p);
+ return ret;
+}
+
+struct backref_ctx {
+ struct send_ctx *sctx;
+
+ struct btrfs_path *path;
+ /* number of total found references */
+ u64 found;
+
+ /*
+ * used for clones found in send_root. clones found behind cur_objectid
+ * and cur_offset are not considered as allowed clones.
+ */
+ u64 cur_objectid;
+ u64 cur_offset;
+
+ /* may be truncated in case it's the last extent in a file */
+ u64 extent_len;
+
+ /* data offset in the file extent item */
+ u64 data_offset;
+
+ /* Just to check for bugs in backref resolving */
+ int found_itself;
+};
+
+static int __clone_root_cmp_bsearch(const void *key, const void *elt)
+{
+ u64 root = (u64)(uintptr_t)key;
+ struct clone_root *cr = (struct clone_root *)elt;
+
+ if (root < cr->root->objectid)
+ return -1;
+ if (root > cr->root->objectid)
+ return 1;
+ return 0;
+}
+
+static int __clone_root_cmp_sort(const void *e1, const void *e2)
+{
+ struct clone_root *cr1 = (struct clone_root *)e1;
+ struct clone_root *cr2 = (struct clone_root *)e2;
+
+ if (cr1->root->objectid < cr2->root->objectid)
+ return -1;
+ if (cr1->root->objectid > cr2->root->objectid)
+ return 1;
+ return 0;
+}
+
+/*
+ * Called for every backref that is found for the current extent.
+ * Results are collected in sctx->clone_roots->ino/offset/found_refs
+ */
+static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
+{
+ struct backref_ctx *bctx = ctx_;
+ struct clone_root *found;
+ int ret;
+ u64 i_size;
+
+ /* First check if the root is in the list of accepted clone sources */
+ found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
+ bctx->sctx->clone_roots_cnt,
+ sizeof(struct clone_root),
+ __clone_root_cmp_bsearch);
+ if (!found)
+ return 0;
+
+ if (found->root == bctx->sctx->send_root &&
+ ino == bctx->cur_objectid &&
+ offset == bctx->cur_offset) {
+ bctx->found_itself = 1;
+ }
+
+ /*
+ * There are inodes that have extents that lie behind its i_size. Don't
+ * accept clones from these extents.
+ */
+ ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
+ NULL, NULL, NULL);
+ btrfs_release_path(bctx->path);
+ if (ret < 0)
+ return ret;
+
+ if (offset + bctx->data_offset + bctx->extent_len > i_size)
+ return 0;
+
+ /*
+ * Make sure we don't consider clones from send_root that are
+ * behind the current inode/offset.
+ */
+ if (found->root == bctx->sctx->send_root) {
+ /*
+ * TODO for the moment we don't accept clones from the inode
+ * that is currently send. We may change this when
+ * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
+ * file.
+ */
+ if (ino >= bctx->cur_objectid)
+ return 0;
+#if 0
+ if (ino > bctx->cur_objectid)
+ return 0;
+ if (offset + bctx->extent_len > bctx->cur_offset)
+ return 0;
+#endif
+ }
+
+ bctx->found++;
+ found->found_refs++;
+ if (ino < found->ino) {
+ found->ino = ino;
+ found->offset = offset;
+ } else if (found->ino == ino) {
+ /*
+ * same extent found more then once in the same file.
+ */
+ if (found->offset > offset + bctx->extent_len)
+ found->offset = offset;
+ }
+
+ return 0;
+}
+
+/*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point where sending is at the
+ * moment. This means, that no clones are accepted which lie behind the current
+ * inode+offset.
+ *
+ * path must point to the extent item when called.
+ */
+static int find_extent_clone(struct send_ctx *sctx,
+ struct btrfs_path *path,
+ u64 ino, u64 data_offset,
+ u64 ino_size,
+ struct clone_root **found)
+{
+ int ret;
+ int extent_type;
+ u64 logical;
+ u64 disk_byte;
+ u64 num_bytes;
+ u64 extent_item_pos;
+ u64 flags = 0;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *eb = path->nodes[0];
+ struct backref_ctx *backref_ctx = NULL;
+ struct clone_root *cur_clone_root;
+ struct btrfs_key found_key;
+ struct btrfs_path *tmp_path;
+ int compressed;
+ u32 i;
+
+ tmp_path = alloc_path_for_send();
+ if (!tmp_path)
+ return -ENOMEM;
+
+ /* We only use this path under the commit sem */
+ tmp_path->need_commit_sem = 0;
+
+ backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+ if (!backref_ctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ backref_ctx->path = tmp_path;
+
+ if (data_offset >= ino_size) {
+ /*
+ * There may be extents that lie behind the file's size.
+ * I at least had this in combination with snapshotting while
+ * writing large files.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ fi = btrfs_item_ptr(eb, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -ENOENT;
+ goto out;
+ }
+ compressed = btrfs_file_extent_compression(eb, fi);
+
+ num_bytes = btrfs_file_extent_num_bytes(eb, fi);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte == 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+ logical = disk_byte + btrfs_file_extent_offset(eb, fi);
+
+ down_read(&sctx->send_root->fs_info->commit_root_sem);
+ ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
+ &found_key, &flags);
+ up_read(&sctx->send_root->fs_info->commit_root_sem);
+ btrfs_release_path(tmp_path);
+
+ if (ret < 0)
+ goto out;
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * Setup the clone roots.
+ */
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ cur_clone_root = sctx->clone_roots + i;
+ cur_clone_root->ino = (u64)-1;
+ cur_clone_root->offset = 0;
+ cur_clone_root->found_refs = 0;
+ }
+
+ backref_ctx->sctx = sctx;
+ backref_ctx->found = 0;
+ backref_ctx->cur_objectid = ino;
+ backref_ctx->cur_offset = data_offset;
+ backref_ctx->found_itself = 0;
+ backref_ctx->extent_len = num_bytes;
+ /*
+ * For non-compressed extents iterate_extent_inodes() gives us extent
+ * offsets that already take into account the data offset, but not for
+ * compressed extents, since the offset is logical and not relative to
+ * the physical extent locations. We must take this into account to
+ * avoid sending clone offsets that go beyond the source file's size,
+ * which would result in the clone ioctl failing with -EINVAL on the
+ * receiving end.
+ */
+ if (compressed == BTRFS_COMPRESS_NONE)
+ backref_ctx->data_offset = 0;
+ else
+ backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
+
+ /*
+ * The last extent of a file may be too large due to page alignment.
+ * We need to adjust extent_len in this case so that the checks in
+ * __iterate_backrefs work.
+ */
+ if (data_offset + num_bytes >= ino_size)
+ backref_ctx->extent_len = ino_size - data_offset;
+
+ /*
+ * Now collect all backrefs.
+ */
+ if (compressed == BTRFS_COMPRESS_NONE)
+ extent_item_pos = logical - found_key.objectid;
+ else
+ extent_item_pos = 0;
+ ret = iterate_extent_inodes(sctx->send_root->fs_info,
+ found_key.objectid, extent_item_pos, 1,
+ __iterate_backrefs, backref_ctx);
+
+ if (ret < 0)
+ goto out;
+
+ if (!backref_ctx->found_itself) {
+ /* found a bug in backref code? */
+ ret = -EIO;
+ btrfs_err(sctx->send_root->fs_info, "did not find backref in "
+ "send_root. inode=%llu, offset=%llu, "
+ "disk_byte=%llu found extent=%llu",
+ ino, data_offset, disk_byte, found_key.objectid);
+ goto out;
+ }
+
+verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
+ "ino=%llu, "
+ "num_bytes=%llu, logical=%llu\n",
+ data_offset, ino, num_bytes, logical);
+
+ if (!backref_ctx->found)
+ verbose_printk("btrfs: no clones found\n");
+
+ cur_clone_root = NULL;
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
+ if (sctx->clone_roots[i].found_refs) {
+ if (!cur_clone_root)
+ cur_clone_root = sctx->clone_roots + i;
+ else if (sctx->clone_roots[i].root == sctx->send_root)
+ /* prefer clones from send_root over others */
+ cur_clone_root = sctx->clone_roots + i;
+ }
+
+ }
+
+ if (cur_clone_root) {
+ if (compressed != BTRFS_COMPRESS_NONE) {
+ /*
+ * Offsets given by iterate_extent_inodes() are relative
+ * to the start of the extent, we need to add logical
+ * offset from the file extent item.
+ * (See why at backref.c:check_extent_in_eb())
+ */
+ cur_clone_root->offset += btrfs_file_extent_offset(eb,
+ fi);
+ }
+ *found = cur_clone_root;
+ ret = 0;
+ } else {
+ ret = -ENOENT;
+ }
+
+out:
+ btrfs_free_path(tmp_path);
+ kfree(backref_ctx);
+ return ret;
+}
+
+static int read_symlink(struct btrfs_root *root,
+ u64 ino,
+ struct fs_path *dest)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+ u8 compression;
+ unsigned long off;
+ int len;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret);
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], ei);
+ compression = btrfs_file_extent_compression(path->nodes[0], ei);
+ BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
+ BUG_ON(compression);
+
+ off = btrfs_file_extent_inline_start(ei);
+ len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);
+
+ ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Helper function to generate a file name that is unique in the root of
+ * send_root and parent_root. This is used to generate names for orphan inodes.
+ */
+static int gen_unique_name(struct send_ctx *sctx,
+ u64 ino, u64 gen,
+ struct fs_path *dest)
+{
+ int ret = 0;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ char tmp[64];
+ int len;
+ u64 idx = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
+ ino, gen, idx);
+ ASSERT(len < sizeof(tmp));
+
+ di = btrfs_lookup_dir_item(NULL, sctx->send_root,
+ path, BTRFS_FIRST_FREE_OBJECTID,
+ tmp, strlen(tmp), 0);
+ btrfs_release_path(path);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ if (di) {
+ /* not unique, try again */
+ idx++;
+ continue;
+ }
+
+ if (!sctx->parent_root) {
+ /* unique */
+ ret = 0;
+ break;
+ }
+
+ di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
+ path, BTRFS_FIRST_FREE_OBJECTID,
+ tmp, strlen(tmp), 0);
+ btrfs_release_path(path);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ if (di) {
+ /* not unique, try again */
+ idx++;
+ continue;
+ }
+ /* unique */
+ break;
+ }
+
+ ret = fs_path_add(dest, tmp, strlen(tmp));
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+enum inode_state {
+ inode_state_no_change,
+ inode_state_will_create,
+ inode_state_did_create,
+ inode_state_will_delete,
+ inode_state_did_delete,
+};
+
+static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret;
+ int left_ret;
+ int right_ret;
+ u64 left_gen;
+ u64 right_gen;
+
+ ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
+ NULL, NULL);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ left_ret = ret;
+
+ if (!sctx->parent_root) {
+ right_ret = -ENOENT;
+ } else {
+ ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
+ NULL, NULL, NULL, NULL);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ right_ret = ret;
+ }
+
+ if (!left_ret && !right_ret) {
+ if (left_gen == gen && right_gen == gen) {
+ ret = inode_state_no_change;
+ } else if (left_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_create;
+ else
+ ret = inode_state_will_create;
+ } else if (right_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_delete;
+ else
+ ret = inode_state_will_delete;
+ } else {
+ ret = -ENOENT;
+ }
+ } else if (!left_ret) {
+ if (left_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_create;
+ else
+ ret = inode_state_will_create;
+ } else {
+ ret = -ENOENT;
+ }
+ } else if (!right_ret) {
+ if (right_gen == gen) {
+ if (ino < sctx->send_progress)
+ ret = inode_state_did_delete;
+ else
+ ret = inode_state_will_delete;
+ } else {
+ ret = -ENOENT;
+ }
+ } else {
+ ret = -ENOENT;
+ }
+
+out:
+ return ret;
+}
+
+static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret;
+
+ ret = get_cur_inode_state(sctx, ino, gen);
+ if (ret < 0)
+ goto out;
+
+ if (ret == inode_state_no_change ||
+ ret == inode_state_did_create ||
+ ret == inode_state_will_delete)
+ ret = 1;
+ else
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * Helper function to lookup a dir item in a dir.
+ */
+static int lookup_dir_item_inode(struct btrfs_root *root,
+ u64 dir, const char *name, int name_len,
+ u64 *found_inode,
+ u8 *found_type)
+{
+ int ret = 0;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_dir_item(NULL, root, path,
+ dir, name, name_len, 0);
+ if (!di) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.type == BTRFS_ROOT_ITEM_KEY) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *found_inode = key.objectid;
+ *found_type = btrfs_dir_type(path->nodes[0], di);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
+ * generation of the parent dir and the name of the dir entry.
+ */
+static int get_first_ref(struct btrfs_root *root, u64 ino,
+ u64 *dir, u64 *dir_gen, struct fs_path *name)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ int len;
+ u64 parent_dir;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (ret || found_key.objectid != ino ||
+ (found_key.type != BTRFS_INODE_REF_KEY &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (found_key.type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *iref;
+ iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(path->nodes[0], iref);
+ ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+ (unsigned long)(iref + 1),
+ len);
+ parent_dir = found_key.offset;
+ } else {
+ struct btrfs_inode_extref *extref;
+ extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_extref);
+ len = btrfs_inode_extref_name_len(path->nodes[0], extref);
+ ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
+ (unsigned long)&extref->name, len);
+ parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
+ }
+ if (ret < 0)
+ goto out;
+ btrfs_release_path(path);
+
+ if (dir_gen) {
+ ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ }
+
+ *dir = parent_dir;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int is_first_ref(struct btrfs_root *root,
+ u64 ino, u64 dir,
+ const char *name, int name_len)
+{
+ int ret;
+ struct fs_path *tmp_name;
+ u64 tmp_dir;
+
+ tmp_name = fs_path_alloc();
+ if (!tmp_name)
+ return -ENOMEM;
+
+ ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
+ if (ret < 0)
+ goto out;
+
+ if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = !memcmp(tmp_name->start, name, name_len);
+
+out:
+ fs_path_free(tmp_name);
+ return ret;
+}
+
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is however only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
+static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ const char *name, int name_len,
+ u64 *who_ino, u64 *who_gen)
+{
+ int ret = 0;
+ u64 gen;
+ u64 other_inode = 0;
+ u8 other_type = 0;
+
+ if (!sctx->parent_root)
+ goto out;
+
+ ret = is_inode_existent(sctx, dir, dir_gen);
+ if (ret <= 0)
+ goto out;
+
+ /*
+ * If we have a parent root we need to verify that the parent dir was
+ * not delted and then re-created, if it was then we have no overwrite
+ * and we can just unlink this entry.
+ */
+ if (sctx->parent_root) {
+ ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
+ NULL, NULL, NULL);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ if (gen != dir_gen)
+ goto out;
+ }
+
+ ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
+ &other_inode, &other_type);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Check if the overwritten ref was already processed. If yes, the ref
+ * was already unlinked/moved, so we can safely assume that we will not
+ * overwrite anything at this point in time.
+ */
+ if (other_inode > sctx->send_progress) {
+ ret = get_inode_info(sctx->parent_root, other_inode, NULL,
+ who_gen, NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ ret = 1;
+ *who_ino = other_inode;
+ } else {
+ ret = 0;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
+static int did_overwrite_ref(struct send_ctx *sctx,
+ u64 dir, u64 dir_gen,
+ u64 ino, u64 ino_gen,
+ const char *name, int name_len)
+{
+ int ret = 0;
+ u64 gen;
+ u64 ow_inode;
+ u8 other_type;
+
+ if (!sctx->parent_root)
+ goto out;
+
+ ret = is_inode_existent(sctx, dir, dir_gen);
+ if (ret <= 0)
+ goto out;
+
+ /* check if the ref was overwritten by another ref */
+ ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
+ &ow_inode, &other_type);
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ if (ret) {
+ /* was never and will never be overwritten */
+ ret = 0;
+ goto out;
+ }
+
+ ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
+ NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ if (ow_inode == ino && gen == ino_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ /* we know that it is or will be overwritten. check this now */
+ if (ow_inode < sctx->send_progress)
+ ret = 1;
+ else
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
+static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret = 0;
+ struct fs_path *name = NULL;
+ u64 dir;
+ u64 dir_gen;
+
+ if (!sctx->parent_root)
+ goto out;
+
+ name = fs_path_alloc();
+ if (!name)
+ return -ENOMEM;
+
+ ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
+ if (ret < 0)
+ goto out;
+
+ ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
+ name->start, fs_path_len(name));
+
+out:
+ fs_path_free(name);
+ return ret;
+}
+
+/*
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * so we need to do some special handling in case we have clashes. This function
+ * takes care of this with the help of name_cache_entry::radix_list.
+ * In case of error, nce is kfreed.
+ */
+static int name_cache_insert(struct send_ctx *sctx,
+ struct name_cache_entry *nce)
+{
+ int ret = 0;
+ struct list_head *nce_head;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
+ if (!nce_head) {
+ nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+ if (!nce_head) {
+ kfree(nce);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(nce_head);
+
+ ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
+ if (ret < 0) {
+ kfree(nce_head);
+ kfree(nce);
+ return ret;
+ }
+ }
+ list_add_tail(&nce->radix_list, nce_head);
+ list_add_tail(&nce->list, &sctx->name_cache_list);
+ sctx->name_cache_size++;
+
+ return ret;
+}
+
+static void name_cache_delete(struct send_ctx *sctx,
+ struct name_cache_entry *nce)
+{
+ struct list_head *nce_head;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
+ if (!nce_head) {
+ btrfs_err(sctx->send_root->fs_info,
+ "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+ nce->ino, sctx->name_cache_size);
+ }
+
+ list_del(&nce->radix_list);
+ list_del(&nce->list);
+ sctx->name_cache_size--;
+
+ /*
+ * We may not get to the final release of nce_head if the lookup fails
+ */
+ if (nce_head && list_empty(nce_head)) {
+ radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+ kfree(nce_head);
+ }
+}
+
+static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
+ u64 ino, u64 gen)
+{
+ struct list_head *nce_head;
+ struct name_cache_entry *cur;
+
+ nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+ if (!nce_head)
+ return NULL;
+
+ list_for_each_entry(cur, nce_head, radix_list) {
+ if (cur->ino == ino && cur->gen == gen)
+ return cur;
+ }
+ return NULL;
+}
+
+/*
+ * Removes the entry from the list and adds it back to the end. This marks the
+ * entry as recently used so that name_cache_clean_unused does not remove it.
+ */
+static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
+{
+ list_del(&nce->list);
+ list_add_tail(&nce->list, &sctx->name_cache_list);
+}
+
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
+static void name_cache_clean_unused(struct send_ctx *sctx)
+{
+ struct name_cache_entry *nce;
+
+ if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
+ return;
+
+ while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
+ nce = list_entry(sctx->name_cache_list.next,
+ struct name_cache_entry, list);
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
+}
+
+static void name_cache_free(struct send_ctx *sctx)
+{
+ struct name_cache_entry *nce;
+
+ while (!list_empty(&sctx->name_cache_list)) {
+ nce = list_entry(sctx->name_cache_list.next,
+ struct name_cache_entry, list);
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
+}
+
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode is not existent or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
+static int __get_cur_name_and_parent(struct send_ctx *sctx,
+ u64 ino, u64 gen,
+ u64 *parent_ino,
+ u64 *parent_gen,
+ struct fs_path *dest)
+{
+ int ret;
+ int nce_ret;
+ struct name_cache_entry *nce = NULL;
+
+ /*
+ * First check if we already did a call to this function with the same
+ * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
+ * return the cached result.
+ */
+ nce = name_cache_search(sctx, ino, gen);
+ if (nce) {
+ if (ino < sctx->send_progress && nce->need_later_update) {
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ nce = NULL;
+ } else {
+ name_cache_used(sctx, nce);
+ *parent_ino = nce->parent_ino;
+ *parent_gen = nce->parent_gen;
+ ret = fs_path_add(dest, nce->name, nce->name_len);
+ if (ret < 0)
+ goto out;
+ ret = nce->ret;
+ goto out;
+ }
+ }
+
+ /*
+ * If the inode is not existent yet, add the orphan name and return 1.
+ * This should only happen for the parent dir that we determine in
+ * __record_new_ref
+ */
+ ret = is_inode_existent(sctx, ino, gen);
+ if (ret < 0)
+ goto out;
+
+ if (!ret) {
+ ret = gen_unique_name(sctx, ino, gen, dest);
+ if (ret < 0)
+ goto out;
+ ret = 1;
+ goto out_cache;
+ }
+
+ /*
+ * Depending on whether the inode was already processed or not, use
+ * send_root or parent_root for ref lookup.
+ */
+ if (ino < sctx->send_progress)
+ ret = get_first_ref(sctx->send_root, ino,
+ parent_ino, parent_gen, dest);
+ else
+ ret = get_first_ref(sctx->parent_root, ino,
+ parent_ino, parent_gen, dest);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Check if the ref was overwritten by an inode's ref that was processed
+ * earlier. If yes, treat as orphan and return 1.
+ */
+ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
+ dest->start, dest->end - dest->start);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ fs_path_reset(dest);
+ ret = gen_unique_name(sctx, ino, gen, dest);
+ if (ret < 0)
+ goto out;
+ ret = 1;
+ }
+
+out_cache:
+ /*
+ * Store the result of the lookup in the name cache.
+ */
+ nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
+ if (!nce) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ nce->ino = ino;
+ nce->gen = gen;
+ nce->parent_ino = *parent_ino;
+ nce->parent_gen = *parent_gen;
+ nce->name_len = fs_path_len(dest);
+ nce->ret = ret;
+ strcpy(nce->name, dest->start);
+
+ if (ino < sctx->send_progress)
+ nce->need_later_update = 0;
+ else
+ nce->need_later_update = 1;
+
+ nce_ret = name_cache_insert(sctx, nce);
+ if (nce_ret < 0)
+ ret = nce_ret;
+ name_cache_clean_unused(sctx);
+
+out:
+ return ret;
+}
+
+/*
+ * Magic happens here. This function returns the first ref to an inode as it
+ * would look like while receiving the stream at this point in time.
+ * We walk the path up to the root. For every inode in between, we check if it
+ * was already processed/sent. If yes, we continue with the parent as found
+ * in send_root. If not, we continue with the parent as found in parent_root.
+ * If we encounter an inode that was deleted at this point in time, we use the
+ * inodes "orphan" name instead of the real name and stop. Same with new inodes
+ * that were not created yet and overwritten inodes/refs.
+ *
+ * When do we have have orphan inodes:
+ * 1. When an inode is freshly created and thus no valid refs are available yet
+ * 2. When a directory lost all it's refs (deleted) but still has dir items
+ * inside which were not processed yet (pending for move/delete). If anyone
+ * tried to get the path to the dir items, it would get a path inside that
+ * orphan directory.
+ * 3. When an inode is moved around or gets new links, it may overwrite the ref
+ * of an unprocessed inode. If in that case the first ref would be
+ * overwritten, the overwritten inode gets "orphanized". Later when we
+ * process this overwritten inode, it is restored at a new place by moving
+ * the orphan inode.
+ *
+ * sctx->send_progress tells this function at which point in time receiving
+ * would be.
+ */
+static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
+ struct fs_path *dest)
+{
+ int ret = 0;
+ struct fs_path *name = NULL;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ int stop = 0;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dest->reversed = 1;
+ fs_path_reset(dest);
+
+ while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino)) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(dest, name);
+ break;
+ }
+
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret)
+ stop = 1;
+ }
+
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add_path(dest, name);
+ if (ret < 0)
+ goto out;
+
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+
+out:
+ fs_path_free(name);
+ if (!ret)
+ fs_path_unreverse(dest);
+ return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
+ */
+static int send_subvol_begin(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *send_root = sctx->send_root;
+ struct btrfs_root *parent_root = sctx->parent_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ char *name = NULL;
+ int namelen;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
+ if (!name) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ key.objectid = send_root->objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
+ &key, path, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.type != BTRFS_ROOT_BACKREF_KEY ||
+ key.objectid != send_root->objectid) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ namelen = btrfs_root_ref_name_len(leaf, ref);
+ read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
+ btrfs_release_path(path);
+
+ if (parent_root) {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
+ if (ret < 0)
+ goto out;
+ }
+
+ TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.uuid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
+ le64_to_cpu(sctx->send_root->root_item.ctransid));
+ if (parent_root) {
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ sctx->parent_root->root_item.uuid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+ le64_to_cpu(sctx->parent_root->root_item.ctransid));
+ }
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ btrfs_free_path(path);
+ kfree(name);
+ return ret;
+}
+
+static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ int ret = 0;
+ struct fs_path *p = NULL;
+ struct btrfs_inode_item *ii;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ int slot;
+
+verbose_printk("btrfs: send_utimes %llu\n", ino);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ path = alloc_path_for_send();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
+ /* TODO Add otime support when the otime patches get into upstream */
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
+ * a valid path yet because we did not process the refs yet. So, the inode
+ * is created as orphan.
+ */
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
+{
+ int ret = 0;
+ struct fs_path *p;
+ int cmd;
+ u64 gen;
+ u64 mode;
+ u64 rdev;
+
+verbose_printk("btrfs: send_create_inode %llu\n", ino);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ if (ino != sctx->cur_ino) {
+ ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+ NULL, NULL, &rdev);
+ if (ret < 0)
+ goto out;
+ } else {
+ gen = sctx->cur_inode_gen;
+ mode = sctx->cur_inode_mode;
+ rdev = sctx->cur_inode_rdev;
+ }
+
+ if (S_ISREG(mode)) {
+ cmd = BTRFS_SEND_C_MKFILE;
+ } else if (S_ISDIR(mode)) {
+ cmd = BTRFS_SEND_C_MKDIR;
+ } else if (S_ISLNK(mode)) {
+ cmd = BTRFS_SEND_C_SYMLINK;
+ } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
+ cmd = BTRFS_SEND_C_MKNOD;
+ } else if (S_ISFIFO(mode)) {
+ cmd = BTRFS_SEND_C_MKFIFO;
+ } else if (S_ISSOCK(mode)) {
+ cmd = BTRFS_SEND_C_MKSOCK;
+ } else {
+ printk(KERN_WARNING "btrfs: unexpected inode type %o",
+ (int)(mode & S_IFMT));
+ ret = -ENOTSUPP;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, cmd);
+ if (ret < 0)
+ goto out;
+
+ ret = gen_unique_name(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
+
+ if (S_ISLNK(mode)) {
+ fs_path_reset(p);
+ ret = read_symlink(sctx->send_root, ino, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
+ } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
+ S_ISFIFO(mode) || S_ISSOCK(mode)) {
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
+ }
+
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ goto out;
+
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+/*
+ * We need some special handling for inodes that get processed before the parent
+ * directory got created. See process_recorded_refs for details.
+ * This function does the check if we already created the dir out of order.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+ int ret = 0;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_key di_key;
+ struct extent_buffer *eb;
+ struct btrfs_dir_item *di;
+ int slot;
+
+ path = alloc_path_for_send();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(sctx->send_root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
+ di_key.objectid < sctx->send_progress) {
+ ret = 1;
+ goto out;
+ }
+
+ path->slots[0]++;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Only creates the inode if it is:
+ * 1. Not a directory
+ * 2. Or a directory which was not created already due to out of order
+ * directories. See did_create_dir and process_recorded_refs for details.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+ int ret;
+
+ if (S_ISDIR(sctx->cur_inode_mode)) {
+ ret = did_create_dir(sctx, sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ ret = send_create_inode(sctx, sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+
+out:
+ return ret;
+}
+
+struct recorded_ref {
+ struct list_head list;
+ char *dir_path;
+ char *name;
+ struct fs_path *full_path;
+ u64 dir;
+ u64 dir_gen;
+ int dir_path_len;
+ int name_len;
+};
+
+/*
+ * We need to process new refs before deleted refs, but compare_tree gives us
+ * everything mixed. So we first record all refs and later process them.
+ * This function is a helper to record one ref.
+ */
+static int __record_ref(struct list_head *head, u64 dir,
+ u64 dir_gen, struct fs_path *path)
+{
+ struct recorded_ref *ref;
+
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ ref->dir = dir;
+ ref->dir_gen = dir_gen;
+ ref->full_path = path;
+
+ ref->name = (char *)kbasename(ref->full_path->start);
+ ref->name_len = ref->full_path->end - ref->name;
+ ref->dir_path = ref->full_path->start;
+ if (ref->name == ref->full_path->start)
+ ref->dir_path_len = 0;
+ else
+ ref->dir_path_len = ref->full_path->end -
+ ref->full_path->start - 1 - ref->name_len;
+
+ list_add_tail(&ref->list, head);
+ return 0;
+}
+
+static int dup_ref(struct recorded_ref *ref, struct list_head *list)
+{
+ struct recorded_ref *new;
+
+ new = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!new)
+ return -ENOMEM;
+
+ new->dir = ref->dir;
+ new->dir_gen = ref->dir_gen;
+ new->full_path = NULL;
+ INIT_LIST_HEAD(&new->list);
+ list_add_tail(&new->list, list);
+ return 0;
+}
+
+static void __free_recorded_refs(struct list_head *head)
+{
+ struct recorded_ref *cur;
+
+ while (!list_empty(head)) {
+ cur = list_entry(head->next, struct recorded_ref, list);
+ fs_path_free(cur->full_path);
+ list_del(&cur->list);
+ kfree(cur);
+ }
+}
+
+static void free_recorded_refs(struct send_ctx *sctx)
+{
+ __free_recorded_refs(&sctx->new_refs);
+ __free_recorded_refs(&sctx->deleted_refs);
+}
+
+/*
+ * Renames/moves a file/dir to its orphan name. Used when the first
+ * ref of an unprocessed inode gets overwritten and for all non empty
+ * directories.
+ */
+static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
+ struct fs_path *path)
+{
+ int ret;
+ struct fs_path *orphan;
+
+ orphan = fs_path_alloc();
+ if (!orphan)
+ return -ENOMEM;
+
+ ret = gen_unique_name(sctx, ino, gen, orphan);
+ if (ret < 0)
+ goto out;
+
+ ret = send_rename(sctx, path, orphan);
+
+out:
+ fs_path_free(orphan);
+ return ret;
+}
+
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node **p = &sctx->orphan_dirs.rb_node;
+ struct rb_node *parent = NULL;
+ struct orphan_dir_info *entry, *odi;
+
+ odi = kmalloc(sizeof(*odi), GFP_NOFS);
+ if (!odi)
+ return ERR_PTR(-ENOMEM);
+ odi->ino = dir_ino;
+ odi->gen = 0;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (dir_ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(odi);
+ return entry;
+ }
+ }
+
+ rb_link_node(&odi->node, parent, p);
+ rb_insert_color(&odi->node, &sctx->orphan_dirs);
+ return odi;
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node *n = sctx->orphan_dirs.rb_node;
+ struct orphan_dir_info *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino)
+ n = n->rb_left;
+ else if (dir_ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+ return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+ struct orphan_dir_info *odi)
+{
+ if (!odi)
+ return;
+ rb_erase(&odi->node, &sctx->orphan_dirs);
+ kfree(odi);
+}
+
+/*
+ * Returns 1 if a directory can be removed at this point in time.
+ * We check this by iterating all dir items and checking if the inode behind
+ * the dir item was already processed.
+ */
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ u64 send_progress)
+{
+ int ret = 0;
+ struct btrfs_root *root = sctx->parent_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_key loc;
+ struct btrfs_dir_item *di;
+
+ /*
+ * Don't try to rmdir the top/root subvolume dir.
+ */
+ if (dir == BTRFS_FIRST_FREE_OBJECTID)
+ return 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = dir;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ struct waiting_dir_move *dm;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type)
+ break;
+
+ di = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+
+ dm = get_waiting_dir_move(sctx, loc.objectid);
+ if (dm) {
+ struct orphan_dir_info *odi;
+
+ odi = add_orphan_dir_info(sctx, dir);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ dm->rmdir_ino = dir;
+ ret = 0;
+ goto out;
+ }
+
+ if (loc.objectid > send_progress) {
+ ret = 0;
+ goto out;
+ }
+
+ path->slots[0]++;
+ }
+
+ ret = 1;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
+{
+ struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
+
+ return entry != NULL;
+}
+
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+ struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
+ struct rb_node *parent = NULL;
+ struct waiting_dir_move *entry, *dm;
+
+ dm = kmalloc(sizeof(*dm), GFP_NOFS);
+ if (!dm)
+ return -ENOMEM;
+ dm->ino = ino;
+ dm->rmdir_ino = 0;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct waiting_dir_move, node);
+ if (ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(dm);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&dm->node, parent, p);
+ rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
+ return 0;
+}
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+ struct rb_node *n = sctx->waiting_dir_moves.rb_node;
+ struct waiting_dir_move *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct waiting_dir_move, node);
+ if (ino < entry->ino)
+ n = n->rb_left;
+ else if (ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static void free_waiting_dir_move(struct send_ctx *sctx,
+ struct waiting_dir_move *dm)
+{
+ if (!dm)
+ return;
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+}
+
+static int add_pending_dir_move(struct send_ctx *sctx,
+ u64 ino,
+ u64 ino_gen,
+ u64 parent_ino,
+ struct list_head *new_refs,
+ struct list_head *deleted_refs,
+ const bool is_orphan)
+{
+ struct rb_node **p = &sctx->pending_dir_moves.rb_node;
+ struct rb_node *parent = NULL;
+ struct pending_dir_move *entry = NULL, *pm;
+ struct recorded_ref *cur;
+ int exists = 0;
+ int ret;
+
+ pm = kmalloc(sizeof(*pm), GFP_NOFS);
+ if (!pm)
+ return -ENOMEM;
+ pm->parent_ino = parent_ino;
+ pm->ino = ino;
+ pm->gen = ino_gen;
+ pm->is_orphan = is_orphan;
+ INIT_LIST_HEAD(&pm->list);
+ INIT_LIST_HEAD(&pm->update_refs);
+ RB_CLEAR_NODE(&pm->node);
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct pending_dir_move, node);
+ if (parent_ino < entry->parent_ino) {
+ p = &(*p)->rb_left;
+ } else if (parent_ino > entry->parent_ino) {
+ p = &(*p)->rb_right;
+ } else {
+ exists = 1;
+ break;
+ }
+ }
+
+ list_for_each_entry(cur, deleted_refs, list) {
+ ret = dup_ref(cur, &pm->update_refs);
+ if (ret < 0)
+ goto out;
+ }
+ list_for_each_entry(cur, new_refs, list) {
+ ret = dup_ref(cur, &pm->update_refs);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = add_waiting_dir_move(sctx, pm->ino);
+ if (ret)
+ goto out;
+
+ if (exists) {
+ list_add_tail(&pm->list, &entry->list);
+ } else {
+ rb_link_node(&pm->node, parent, p);
+ rb_insert_color(&pm->node, &sctx->pending_dir_moves);
+ }
+ ret = 0;
+out:
+ if (ret) {
+ __free_recorded_refs(&pm->update_refs);
+ kfree(pm);
+ }
+ return ret;
+}
+
+static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
+ u64 parent_ino)
+{
+ struct rb_node *n = sctx->pending_dir_moves.rb_node;
+ struct pending_dir_move *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct pending_dir_move, node);
+ if (parent_ino < entry->parent_ino)
+ n = n->rb_left;
+ else if (parent_ino > entry->parent_ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
+{
+ struct fs_path *from_path = NULL;
+ struct fs_path *to_path = NULL;
+ struct fs_path *name = NULL;
+ u64 orig_progress = sctx->send_progress;
+ struct recorded_ref *cur;
+ u64 parent_ino, parent_gen;
+ struct waiting_dir_move *dm = NULL;
+ u64 rmdir_ino = 0;
+ int ret;
+
+ name = fs_path_alloc();
+ from_path = fs_path_alloc();
+ if (!name || !from_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ rmdir_ino = dm->rmdir_ino;
+ free_waiting_dir_move(sctx, dm);
+
+ if (pm->is_orphan) {
+ ret = gen_unique_name(sctx, pm->ino,
+ pm->gen, from_path);
+ } else {
+ ret = get_first_ref(sctx->parent_root, pm->ino,
+ &parent_ino, &parent_gen, name);
+ if (ret < 0)
+ goto out;
+ ret = get_cur_path(sctx, parent_ino, parent_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ }
+ if (ret < 0)
+ goto out;
+
+ sctx->send_progress = sctx->cur_ino + 1;
+ fs_path_reset(name);
+ to_path = name;
+ name = NULL;
+ ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
+ if (ret < 0)
+ goto out;
+
+ ret = send_rename(sctx, from_path, to_path);
+ if (ret < 0)
+ goto out;
+
+ if (rmdir_ino) {
+ struct orphan_dir_info *odi;
+
+ odi = get_orphan_dir_info(sctx, rmdir_ino);
+ if (!odi) {
+ /* already deleted */
+ goto finish;
+ }
+ ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ goto finish;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, name);
+ if (ret < 0)
+ goto out;
+ free_orphan_dir_info(sctx, odi);
+ }
+
+finish:
+ ret = send_utimes(sctx, pm->ino, pm->gen);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * After rename/move, need to update the utimes of both new parent(s)
+ * and old parent(s).
+ */
+ list_for_each_entry(cur, &pm->update_refs, list) {
+ if (cur->dir == rmdir_ino)
+ continue;
+ ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ fs_path_free(name);
+ fs_path_free(from_path);
+ fs_path_free(to_path);
+ sctx->send_progress = orig_progress;
+
+ return ret;
+}
+
+static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
+{
+ if (!list_empty(&m->list))
+ list_del(&m->list);
+ if (!RB_EMPTY_NODE(&m->node))
+ rb_erase(&m->node, &sctx->pending_dir_moves);
+ __free_recorded_refs(&m->update_refs);
+ kfree(m);
+}
+
+static void tail_append_pending_moves(struct pending_dir_move *moves,
+ struct list_head *stack)
+{
+ if (list_empty(&moves->list)) {
+ list_add_tail(&moves->list, stack);
+ } else {
+ LIST_HEAD(list);
+ list_splice_init(&moves->list, &list);
+ list_add_tail(&moves->list, stack);
+ list_splice_tail(&list, stack);
+ }
+}
+
+static int apply_children_dir_moves(struct send_ctx *sctx)
+{
+ struct pending_dir_move *pm;
+ struct list_head stack;
+ u64 parent_ino = sctx->cur_ino;
+ int ret = 0;
+
+ pm = get_pending_dir_moves(sctx, parent_ino);
+ if (!pm)
+ return 0;
+
+ INIT_LIST_HEAD(&stack);
+ tail_append_pending_moves(pm, &stack);
+
+ while (!list_empty(&stack)) {
+ pm = list_first_entry(&stack, struct pending_dir_move, list);
+ parent_ino = pm->ino;
+ ret = apply_dir_move(sctx, pm);
+ free_pending_move(sctx, pm);
+ if (ret)
+ goto out;
+ pm = get_pending_dir_moves(sctx, parent_ino);
+ if (pm)
+ tail_append_pending_moves(pm, &stack);
+ }
+ return 0;
+
+out:
+ while (!list_empty(&stack)) {
+ pm = list_first_entry(&stack, struct pending_dir_move, list);
+ free_pending_move(sctx, pm);
+ }
+ return ret;
+}
+
+/*
+ * We might need to delay a directory rename even when no ancestor directory
+ * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
+ * renamed. This happens when we rename a directory to the old name (the name
+ * in the parent root) of some other unrelated directory that got its rename
+ * delayed due to some ancestor with higher number that got renamed.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ * . (ino 256)
+ * |---- a/ (ino 257)
+ * | |---- file (ino 260)
+ * |
+ * |---- b/ (ino 258)
+ * |---- c/ (ino 259)
+ *
+ * Send snapshot:
+ * . (ino 256)
+ * |---- a/ (ino 258)
+ * |---- x/ (ino 259)
+ * |---- y/ (ino 257)
+ * |----- file (ino 260)
+ *
+ * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
+ * from 'a' to 'x/y' happening first, which in turn depends on the rename of
+ * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
+ * must issue is:
+ *
+ * 1 - rename 259 from 'c' to 'x'
+ * 2 - rename 257 from 'a' to 'x/y'
+ * 3 - rename 258 from 'b' to 'a'
+ *
+ * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
+ * be done right away and < 0 on error.
+ */
+static int wait_for_dest_dir_move(struct send_ctx *sctx,
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key di_key;
+ struct btrfs_dir_item *di;
+ u64 left_gen;
+ u64 right_gen;
+ int ret = 0;
+
+ if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
+ return 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = parent_ref->dir;
+ key.type = BTRFS_DIR_ITEM_KEY;
+ key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
+
+ ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_match_dir_item_name(sctx->parent_root, path,
+ parent_ref->name, parent_ref->name_len);
+ if (!di) {
+ ret = 0;
+ goto out;
+ }
+ /*
+ * di_key.objectid has the number of the inode that has a dentry in the
+ * parent directory with the same name that sctx->cur_ino is being
+ * renamed to. We need to check if that inode is in the send root as
+ * well and if it is currently marked as an inode with a pending rename,
+ * if it is, we need to delay the rename of sctx->cur_ino as well, so
+ * that it happens after that other inode is renamed.
+ */
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
+ if (di_key.type != BTRFS_INODE_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
+ &left_gen, NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
+ &right_gen, NULL, NULL, NULL, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
+
+ /* Different inode, no need to delay the rename of sctx->cur_ino */
+ if (right_gen != left_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ if (is_waiting_for_move(sctx, di_key.objectid)) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ di_key.objectid,
+ &sctx->new_refs,
+ &sctx->deleted_refs,
+ is_orphan);
+ if (!ret)
+ ret = 1;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int wait_for_parent_move(struct send_ctx *sctx,
+ struct recorded_ref *parent_ref)
+{
+ int ret = 0;
+ u64 ino = parent_ref->dir;
+ u64 parent_ino_before, parent_ino_after;
+ struct fs_path *path_before = NULL;
+ struct fs_path *path_after = NULL;
+ int len1, len2;
+
+ path_after = fs_path_alloc();
+ path_before = fs_path_alloc();
+ if (!path_after || !path_before) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Our current directory inode may not yet be renamed/moved because some
+ * ancestor (immediate or not) has to be renamed/moved first. So find if
+ * such ancestor exists and make sure our own rename/move happens after
+ * that ancestor is processed.
+ */
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = 1;
+ break;
+ }
+
+ fs_path_reset(path_before);
+ fs_path_reset(path_after);
+
+ ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+ NULL, path_after);
+ if (ret < 0)
+ goto out;
+ ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+ NULL, path_before);
+ if (ret < 0 && ret != -ENOENT) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ ret = 0;
+ break;
+ }
+
+ len1 = fs_path_len(path_before);
+ len2 = fs_path_len(path_after);
+ if (ino > sctx->cur_ino &&
+ (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1))) {
+ ret = 1;
+ break;
+ }
+ ino = parent_ino_after;
+ }
+
+out:
+ fs_path_free(path_before);
+ fs_path_free(path_after);
+
+ if (ret == 1) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ ino,
+ &sctx->new_refs,
+ &sctx->deleted_refs,
+ false);
+ if (!ret)
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/*
+ * This does all the move/link/unlink/rmdir magic.
+ */
+static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
+{
+ int ret = 0;
+ struct recorded_ref *cur;
+ struct recorded_ref *cur2;
+ struct list_head check_dirs;
+ struct fs_path *valid_path = NULL;
+ u64 ow_inode = 0;
+ u64 ow_gen;
+ int did_overwrite = 0;
+ int is_orphan = 0;
+ u64 last_dir_ino_rm = 0;
+ bool can_rename = true;
+
+verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
+
+ /*
+ * This should never happen as the root dir always has the same ref
+ * which is always '..'
+ */
+ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+ INIT_LIST_HEAD(&check_dirs);
+
+ valid_path = fs_path_alloc();
+ if (!valid_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * First, check if the first ref of the current inode was overwritten
+ * before. If yes, we know that the current inode was already orphanized
+ * and thus use the orphan name. If not, we can use get_cur_path to
+ * get the path of the first ref as it would like while receiving at
+ * this point in time.
+ * New inodes are always orphan at the beginning, so force to use the
+ * orphan name in this case.
+ * The first ref is stored in valid_path and will be updated if it
+ * gets moved around.
+ */
+ if (!sctx->cur_inode_new) {
+ ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ did_overwrite = 1;
+ }
+ if (sctx->cur_inode_new || did_overwrite) {
+ ret = gen_unique_name(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen, valid_path);
+ if (ret < 0)
+ goto out;
+ is_orphan = 1;
+ } else {
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ valid_path);
+ if (ret < 0)
+ goto out;
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directories inum is higher
+ * the the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inodes refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ /*
+ * Check if this new ref would overwrite the first ref of
+ * another unprocessed inode. If yes, orphanize the
+ * overwritten inode. If we find an overwritten ref that is
+ * not the first ref, simply unlink it.
+ */
+ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+ cur->name, cur->name_len,
+ &ow_inode, &ow_gen);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = is_first_ref(sctx->parent_root,
+ ow_inode, cur->dir, cur->name,
+ cur->name_len);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ struct name_cache_entry *nce;
+
+ ret = orphanize_inode(sctx, ow_inode, ow_gen,
+ cur->full_path);
+ if (ret < 0)
+ goto out;
+ /*
+ * Make sure we clear our orphanized inode's
+ * name from the name cache. This is because the
+ * inode ow_inode might be an ancestor of some
+ * other inode that will be orphanized as well
+ * later and has an inode number greater than
+ * sctx->send_progress. We need to prevent
+ * future name lookups from using the old name
+ * and get instead the orphan name.
+ */
+ nce = name_cache_search(sctx, ow_inode, ow_gen);
+ if (nce) {
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
+ } else {
+ ret = send_unlink(sctx, cur->full_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
+ ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
+ /*
+ * link/move the ref to the new place. If we have an orphan
+ * inode, move it and update valid_path. If not, link or move
+ * it depending on the inode mode.
+ */
+ if (is_orphan && can_rename) {
+ ret = send_rename(sctx, valid_path, cur->full_path);
+ if (ret < 0)
+ goto out;
+ is_orphan = 0;
+ ret = fs_path_copy(valid_path, cur->full_path);
+ if (ret < 0)
+ goto out;
+ } else if (can_rename) {
+ if (S_ISDIR(sctx->cur_inode_mode)) {
+ /*
+ * Dirs can't be linked, so move it. For moved
+ * dirs, we always have one new and one deleted
+ * ref. The deleted ref is ignored later.
+ */
+ ret = wait_for_parent_move(sctx, cur);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ *pending_move = 1;
+ } else {
+ ret = send_rename(sctx, valid_path,
+ cur->full_path);
+ if (!ret)
+ ret = fs_path_copy(valid_path,
+ cur->full_path);
+ }
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = send_link(sctx, cur->full_path,
+ valid_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
+ /*
+ * Check if we can already rmdir the directory. If not,
+ * orphanize it. For every dir item inside that gets deleted
+ * later, we do this check again and rmdir it then if possible.
+ * See the use of check_dirs for more details.
+ */
+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = send_rmdir(sctx, valid_path);
+ if (ret < 0)
+ goto out;
+ } else if (!is_orphan) {
+ ret = orphanize_inode(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen, valid_path);
+ if (ret < 0)
+ goto out;
+ is_orphan = 1;
+ }
+
+ list_for_each_entry(cur, &sctx->deleted_refs, list) {
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ }
+ } else if (S_ISDIR(sctx->cur_inode_mode) &&
+ !list_empty(&sctx->deleted_refs)) {
+ /*
+ * We have a moved dir. Add the old parent to check_dirs
+ */
+ cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+ list);
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ } else if (!S_ISDIR(sctx->cur_inode_mode)) {
+ /*
+ * We have a non dir inode. Go through all deleted refs and
+ * unlink them if they were not already overwritten by other
+ * inodes.
+ */
+ list_for_each_entry(cur, &sctx->deleted_refs, list) {
+ ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino, sctx->cur_inode_gen,
+ cur->name, cur->name_len);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_unlink(sctx, cur->full_path);
+ if (ret < 0)
+ goto out;
+ }
+ ret = dup_ref(cur, &check_dirs);
+ if (ret < 0)
+ goto out;
+ }
+ /*
+ * If the inode is still orphan, unlink the orphan. This may
+ * happen when a previous inode did overwrite the first ref
+ * of this inode and no new refs were added for the current
+ * inode. Unlinking does not mean that the inode is deleted in
+ * all cases. There may still be links to this inode in other
+ * places.
+ */
+ if (is_orphan) {
+ ret = send_unlink(sctx, valid_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ /*
+ * We did collect all parent dirs where cur_inode was once located. We
+ * now go through all these dirs and check if they are pending for
+ * deletion and if it's finally possible to perform the rmdir now.
+ * We also update the inode stats of the parent dirs here.
+ */
+ list_for_each_entry(cur, &check_dirs, list) {
+ /*
+ * In case we had refs into dirs that were not processed yet,
+ * we don't need to do the utime and rmdir logic for these dirs.
+ * The dir will be processed later.
+ */
+ if (cur->dir > sctx->cur_ino)
+ continue;
+
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+
+ if (ret == inode_state_did_create ||
+ ret == inode_state_no_change) {
+ /* TODO delayed utimes */
+ ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ } else if (ret == inode_state_did_delete &&
+ cur->dir != last_dir_ino_rm) {
+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = get_cur_path(sctx, cur->dir,
+ cur->dir_gen, valid_path);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, valid_path);
+ if (ret < 0)
+ goto out;
+ last_dir_ino_rm = cur->dir;
+ }
+ }
+ }
+
+ ret = 0;
+
+out:
+ __free_recorded_refs(&check_dirs);
+ free_recorded_refs(sctx);
+ fs_path_free(valid_path);
+ return ret;
+}
+
+static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
+ struct fs_path *name, void *ctx, struct list_head *refs)
+{
+ int ret = 0;
+ struct send_ctx *sctx = ctx;
+ struct fs_path *p;
+ u64 gen;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
+ NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, dir, gen, p);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(p, name);
+ if (ret < 0)
+ goto out;
+
+ ret = __record_ref(refs, dir, gen, p);
+
+out:
+ if (ret)
+ fs_path_free(p);
+ return ret;
+}
+
+static int __record_new_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ struct send_ctx *sctx = ctx;
+ return record_ref(sctx->send_root, num, dir, index, name,
+ ctx, &sctx->new_refs);
+}
+
+
+static int __record_deleted_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ struct send_ctx *sctx = ctx;
+ return record_ref(sctx->parent_root, num, dir, index, name,
+ ctx, &sctx->deleted_refs);
+}
+
+static int record_new_ref(struct send_ctx *sctx)
+{
+ int ret;
+
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, __record_new_ref, sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int record_deleted_ref(struct send_ctx *sctx)
+{
+ int ret;
+
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, __record_deleted_ref, sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+struct find_ref_ctx {
+ u64 dir;
+ u64 dir_gen;
+ struct btrfs_root *root;
+ struct fs_path *name;
+ int found_idx;
+};
+
+static int __find_iref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx_)
+{
+ struct find_ref_ctx *ctx = ctx_;
+ u64 dir_gen;
+ int ret;
+
+ if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
+ strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
+ /*
+ * To avoid doing extra lookups we'll only do this if everything
+ * else matches.
+ */
+ ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret)
+ return ret;
+ if (dir_gen != ctx->dir_gen)
+ return 0;
+ ctx->found_idx = num;
+ return 1;
+ }
+ return 0;
+}
+
+static int find_iref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 dir, u64 dir_gen, struct fs_path *name)
+{
+ int ret;
+ struct find_ref_ctx ctx;
+
+ ctx.dir = dir;
+ ctx.name = name;
+ ctx.dir_gen = dir_gen;
+ ctx.found_idx = -1;
+ ctx.root = root;
+
+ ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
+ if (ret < 0)
+ return ret;
+
+ if (ctx.found_idx == -1)
+ return -ENOENT;
+
+ return ctx.found_idx;
+}
+
+static int __record_changed_new_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ u64 dir_gen;
+ int ret;
+ struct send_ctx *sctx = ctx;
+
+ ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret)
+ return ret;
+
+ ret = find_iref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, dir, dir_gen, name);
+ if (ret == -ENOENT)
+ ret = __record_new_ref(num, dir, index, name, sctx);
+ else if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
+static int __record_changed_deleted_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ u64 dir_gen;
+ int ret;
+ struct send_ctx *sctx = ctx;
+
+ ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret)
+ return ret;
+
+ ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ dir, dir_gen, name);
+ if (ret == -ENOENT)
+ ret = __record_deleted_ref(num, dir, index, name, sctx);
+ else if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
+static int record_changed_ref(struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, __record_changed_new_ref, sctx);
+ if (ret < 0)
+ goto out;
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * Record and process all refs at once. Needed when an inode changes the
+ * generation number, which means that it was deleted and recreated.
+ */
+static int process_all_refs(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result cmd)
+{
+ int ret;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ int slot;
+ iterate_inode_ref_t cb;
+ int pending_move = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ if (cmd == BTRFS_COMPARE_TREE_NEW) {
+ root = sctx->send_root;
+ cb = __record_new_ref;
+ } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
+ root = sctx->parent_root;
+ cb = __record_deleted_ref;
+ } else {
+ btrfs_err(sctx->send_root->fs_info,
+ "Wrong command %d in process_all_refs", cmd);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ key.objectid = sctx->cmp_key->objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+ if (found_key.objectid != key.objectid ||
+ (found_key.type != BTRFS_INODE_REF_KEY &&
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ break;
+
+ ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
+ if (ret < 0)
+ goto out;
+
+ path->slots[0]++;
+ }
+ btrfs_release_path(path);
+
+ ret = process_recorded_refs(sctx, &pending_move);
+ /* Only applicable to an incremental send. */
+ ASSERT(pending_move == 0);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int send_set_xattr(struct send_ctx *sctx,
+ struct fs_path *path,
+ const char *name, int name_len,
+ const char *data, int data_len)
+{
+ int ret = 0;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+ TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int send_remove_xattr(struct send_ctx *sctx,
+ struct fs_path *path,
+ const char *name, int name_len)
+{
+ int ret = 0;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int __process_new_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+ struct fs_path *p;
+ posix_acl_xattr_header dummy_acl;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ /*
+ * This hack is needed because empty acl's are stored as zero byte
+ * data in xattrs. Problem with that is, that receiving these zero byte
+ * acl's will fail later. To fix this, we send a dummy acl list that
+ * only contains the version number and no entries.
+ */
+ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
+ !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
+ if (data_len == 0) {
+ dummy_acl.a_version =
+ cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+ data = (char *)&dummy_acl;
+ data_len = sizeof(dummy_acl);
+ }
+ }
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
+
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ ret = send_remove_xattr(sctx, p, name, name_len);
+
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int process_new_xattr(struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, __process_new_xattr, sctx);
+
+ return ret;
+}
+
+static int process_deleted_xattr(struct send_ctx *sctx)
+{
+ int ret;
+
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, __process_deleted_xattr, sctx);
+
+ return ret;
+}
+
+struct find_xattr_ctx {
+ const char *name;
+ int name_len;
+ int found_idx;
+ char *found_data;
+ int found_data_len;
+};
+
+static int __find_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *vctx)
+{
+ struct find_xattr_ctx *ctx = vctx;
+
+ if (name_len == ctx->name_len &&
+ strncmp(name, ctx->name, name_len) == 0) {
+ ctx->found_idx = num;
+ ctx->found_data_len = data_len;
+ ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
+ if (!ctx->found_data)
+ return -ENOMEM;
+ return 1;
+ }
+ return 0;
+}
+
+static int find_xattr(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ const char *name, int name_len,
+ char **data, int *data_len)
+{
+ int ret;
+ struct find_xattr_ctx ctx;
+
+ ctx.name = name;
+ ctx.name_len = name_len;
+ ctx.found_idx = -1;
+ ctx.found_data = NULL;
+ ctx.found_data_len = 0;
+
+ ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
+ if (ret < 0)
+ return ret;
+
+ if (ctx.found_idx == -1)
+ return -ENOENT;
+ if (data) {
+ *data = ctx.found_data;
+ *data_len = ctx.found_data_len;
+ } else {
+ kfree(ctx.found_data);
+ }
+ return ctx.found_idx;
+}
+
+
+static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+ char *found_data = NULL;
+ int found_data_len = 0;
+
+ ret = find_xattr(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, name, name_len, &found_data,
+ &found_data_len);
+ if (ret == -ENOENT) {
+ ret = __process_new_xattr(num, di_key, name, name_len, data,
+ data_len, type, ctx);
+ } else if (ret >= 0) {
+ if (data_len != found_data_len ||
+ memcmp(data, found_data, data_len)) {
+ ret = __process_new_xattr(num, di_key, name, name_len,
+ data, data_len, type, ctx);
+ } else {
+ ret = 0;
+ }
+ }
+
+ kfree(found_data);
+ return ret;
+}
+
+static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
+ const char *name, int name_len,
+ const char *data, int data_len,
+ u8 type, void *ctx)
+{
+ int ret;
+ struct send_ctx *sctx = ctx;
+
+ ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ name, name_len, NULL, NULL);
+ if (ret == -ENOENT)
+ ret = __process_deleted_xattr(num, di_key, name, name_len, data,
+ data_len, type, ctx);
+ else if (ret >= 0)
+ ret = 0;
+
+ return ret;
+}
+
+static int process_changed_xattr(struct send_ctx *sctx)
+{
+ int ret = 0;
+
+ ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, __process_changed_new_xattr, sctx);
+ if (ret < 0)
+ goto out;
+ ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, __process_changed_deleted_xattr, sctx);
+
+out:
+ return ret;
+}
+
+static int process_all_new_xattrs(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ int slot;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ root = sctx->send_root;
+
+ key.objectid = sctx->cmp_key->objectid;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = iterate_dir_item(root, path, &found_key,
+ __process_new_xattr, sctx);
+ if (ret < 0)
+ goto out;
+
+ path->slots[0]++;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct page *page;
+ char *addr;
+ struct btrfs_key key;
+ pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t last_index;
+ unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+ ssize_t ret = 0;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (offset + len > i_size_read(inode)) {
+ if (offset > i_size_read(inode))
+ len = 0;
+ else
+ len = offset - i_size_read(inode);
+ }
+ if (len == 0)
+ goto out;
+
+ last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+
+ /* initial readahead */
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, inode->i_mapping);
+ btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
+ last_index - index + 1);
+
+ while (index <= last_index) {
+ unsigned cur_len = min_t(unsigned, len,
+ PAGE_CACHE_SIZE - pg_offset);
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ break;
+ }
+ }
+
+ addr = kmap(page);
+ memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ kunmap(page);
+ unlock_page(page);
+ page_cache_release(page);
+ index++;
+ pg_offset = 0;
+ len -= cur_len;
+ ret += cur_len;
+ }
+out:
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Read some bytes from the current inode/file and send a write command to
+ * user space.
+ */
+static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
+{
+ int ret = 0;
+ struct fs_path *p;
+ ssize_t num_read = 0;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
+
+ num_read = fill_read_buf(sctx, offset, len);
+ if (num_read <= 0) {
+ if (num_read < 0)
+ ret = num_read;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ if (ret < 0)
+ return ret;
+ return num_read;
+}
+
+/*
+ * Send a clone command to user space.
+ */
+static int send_clone(struct send_ctx *sctx,
+ u64 offset, u32 len,
+ struct clone_root *clone_root)
+{
+ int ret = 0;
+ struct fs_path *p;
+ u64 gen;
+
+verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
+ "clone_inode=%llu, clone_offset=%llu\n", offset, len,
+ clone_root->root->objectid, clone_root->ino,
+ clone_root->offset);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+
+ if (clone_root->root == sctx->send_root) {
+ ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
+ &gen, NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ ret = get_cur_path(sctx, clone_root->ino, gen, p);
+ } else {
+ ret = get_inode_path(clone_root->root, clone_root->ino, p);
+ }
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.uuid);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
+ le64_to_cpu(clone_root->root->root_item.ctransid));
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
+ clone_root->offset);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+/*
+ * Send an update extent command to user space.
+ */
+static int send_update_extent(struct send_ctx *sctx,
+ u64 offset, u32 len)
+{
+ int ret = 0;
+ struct fs_path *p;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_hole(struct send_ctx *sctx, u64 end)
+{
+ struct fs_path *p = NULL;
+ u64 offset = sctx->cur_inode_last_extent;
+ u64 len;
+ int ret = 0;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto tlv_put_failure;
+ memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
+ while (offset < end) {
+ len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+ if (ret < 0)
+ break;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ break;
+ offset += len;
+ }
+tlv_put_failure:
+ fs_path_free(p);
+ return ret;
+}
+
+static int send_write_or_clone(struct send_ctx *sctx,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct clone_root *clone_root)
+{
+ int ret = 0;
+ struct btrfs_file_extent_item *ei;
+ u64 offset = key->offset;
+ u64 pos = 0;
+ u64 len;
+ u32 l;
+ u8 type;
+ u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], ei);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(path->nodes[0],
+ path->slots[0], ei);
+ /*
+ * it is possible the inline item won't cover the whole page,
+ * but there may be items after this page. Make
+ * sure to send the whole thing
+ */
+ len = PAGE_CACHE_ALIGN(len);
+ } else {
+ len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+ }
+
+ if (offset + len > sctx->cur_inode_size)
+ len = sctx->cur_inode_size - offset;
+ if (len == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ ret = send_clone(sctx, offset, len, clone_root);
+ } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
+ ret = send_update_extent(sctx, offset, len);
+ } else {
+ while (pos < len) {
+ l = len - pos;
+ if (l > BTRFS_SEND_READ_SIZE)
+ l = BTRFS_SEND_READ_SIZE;
+ ret = send_write(sctx, pos + offset, l);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ break;
+ pos += ret;
+ }
+ ret = 0;
+ }
+out:
+ return ret;
+}
+
+static int is_extent_unchanged(struct send_ctx *sctx,
+ struct btrfs_path *left_path,
+ struct btrfs_key *ekey)
+{
+ int ret = 0;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *eb;
+ int slot;
+ struct btrfs_key found_key;
+ struct btrfs_file_extent_item *ei;
+ u64 left_disknr;
+ u64 right_disknr;
+ u64 left_offset;
+ u64 right_offset;
+ u64 left_offset_fixed;
+ u64 left_len;
+ u64 right_len;
+ u64 left_gen;
+ u64 right_gen;
+ u8 left_type;
+ u8 right_type;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ eb = left_path->nodes[0];
+ slot = left_path->slots[0];
+ ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ left_type = btrfs_file_extent_type(eb, ei);
+
+ if (left_type != BTRFS_FILE_EXTENT_REG) {
+ ret = 0;
+ goto out;
+ }
+ left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+ left_len = btrfs_file_extent_num_bytes(eb, ei);
+ left_offset = btrfs_file_extent_offset(eb, ei);
+ left_gen = btrfs_file_extent_generation(eb, ei);
+
+ /*
+ * Following comments will refer to these graphics. L is the left
+ * extents which we are checking at the moment. 1-8 are the right
+ * extents that we iterate.
+ *
+ * |-----L-----|
+ * |-1-|-2a-|-3-|-4-|-5-|-6-|
+ *
+ * |-----L-----|
+ * |--1--|-2b-|...(same as above)
+ *
+ * Alternative situation. Happens on files where extents got split.
+ * |-----L-----|
+ * |-----------7-----------|-6-|
+ *
+ * Alternative situation. Happens on files which got larger.
+ * |-----L-----|
+ * |-8-|
+ * Nothing follows after 8.
+ */
+
+ key.objectid = ekey->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ekey->offset;
+ ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Handle special case where the right side has no extents at all.
+ */
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ /* If we're a hole then just pretend nothing changed */
+ ret = (left_disknr) ? 0 : 1;
+ goto out;
+ }
+
+ /*
+ * We're now on 2a, 2b or 7.
+ */
+ key = found_key;
+ while (key.offset < ekey->offset + left_len) {
+ ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ right_type = btrfs_file_extent_type(eb, ei);
+ if (right_type != BTRFS_FILE_EXTENT_REG) {
+ ret = 0;
+ goto out;
+ }
+
+ right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+ right_len = btrfs_file_extent_num_bytes(eb, ei);
+ right_offset = btrfs_file_extent_offset(eb, ei);
+ right_gen = btrfs_file_extent_generation(eb, ei);
+
+ /*
+ * Are we at extent 8? If yes, we know the extent is changed.
+ * This may only happen on the first iteration.
+ */
+ if (found_key.offset + right_len <= ekey->offset) {
+ /* If we're a hole just pretend nothing changed */
+ ret = (left_disknr) ? 0 : 1;
+ goto out;
+ }
+
+ left_offset_fixed = left_offset;
+ if (key.offset < ekey->offset) {
+ /* Fix the right offset for 2a and 7. */
+ right_offset += ekey->offset - key.offset;
+ } else {
+ /* Fix the left offset for all behind 2a and 2b */
+ left_offset_fixed += key.offset - ekey->offset;
+ }
+
+ /*
+ * Check if we have the same extent.
+ */
+ if (left_disknr != right_disknr ||
+ left_offset_fixed != right_offset ||
+ left_gen != right_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Go to the next extent.
+ */
+ ret = btrfs_next_item(sctx->parent_root, path);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ }
+ if (ret || found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ key.offset += right_len;
+ break;
+ }
+ if (found_key.offset != key.offset + right_len) {
+ ret = 0;
+ goto out;
+ }
+ key = found_key;
+ }
+
+ /*
+ * We're now behind the left extent (treat as unchanged) or at the end
+ * of the right side (treat as changed).
+ */
+ if (key.offset >= ekey->offset + left_len)
+ ret = 1;
+ else
+ ret = 0;
+
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int get_last_extent(struct send_ctx *sctx, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 extent_end;
+ u8 type;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ sctx->cur_inode_last_extent = 0;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+ ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ goto out;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+ path->slots[0], fi);
+ extent_end = ALIGN(key.offset + size,
+ sctx->send_root->sectorsize);
+ } else {
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(path->nodes[0], fi);
+ }
+ sctx->cur_inode_last_extent = extent_end;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_file_extent_item *fi;
+ u64 extent_end;
+ u8 type;
+ int ret = 0;
+
+ if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
+ return 0;
+
+ if (sctx->cur_inode_last_extent == (u64)-1) {
+ ret = get_last_extent(sctx, key->offset - 1);
+ if (ret)
+ return ret;
+ }
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+ path->slots[0], fi);
+ extent_end = ALIGN(key->offset + size,
+ sctx->send_root->sectorsize);
+ } else {
+ extent_end = key->offset +
+ btrfs_file_extent_num_bytes(path->nodes[0], fi);
+ }
+
+ if (path->slots[0] == 0 &&
+ sctx->cur_inode_last_extent < key->offset) {
+ /*
+ * We might have skipped entire leafs that contained only
+ * file extent items for our current inode. These leafs have
+ * a generation number smaller (older) than the one in the
+ * current leaf and the leaf our last extent came from, and
+ * are located between these 2 leafs.
+ */
+ ret = get_last_extent(sctx, key->offset - 1);
+ if (ret)
+ return ret;
+ }
+
+ if (sctx->cur_inode_last_extent < key->offset)
+ ret = send_hole(sctx, key->offset);
+ sctx->cur_inode_last_extent = extent_end;
+ return ret;
+}
+
+static int process_extent(struct send_ctx *sctx,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct clone_root *found_clone = NULL;
+ int ret = 0;
+
+ if (S_ISLNK(sctx->cur_inode_mode))
+ return 0;
+
+ if (sctx->parent_root && !sctx->cur_inode_new) {
+ ret = is_extent_unchanged(sctx, path, key);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ goto out_hole;
+ }
+ } else {
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], ei);
+ if (type == BTRFS_FILE_EXTENT_PREALLOC ||
+ type == BTRFS_FILE_EXTENT_REG) {
+ /*
+ * The send spec does not have a prealloc command yet,
+ * so just leave a hole for prealloc'ed extents until
+ * we have enough commands queued up to justify rev'ing
+ * the send spec.
+ */
+ if (type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Have a hole, just skip it. */
+ if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
+ ret = 0;
+ goto out;
+ }
+ }
+ }
+
+ ret = find_extent_clone(sctx, path, key->objectid, key->offset,
+ sctx->cur_inode_size, &found_clone);
+ if (ret != -ENOENT && ret < 0)
+ goto out;
+
+ ret = send_write_or_clone(sctx, path, key, found_clone);
+ if (ret)
+ goto out;
+out_hole:
+ ret = maybe_send_hole(sctx, path, key);
+out:
+ return ret;
+}
+
+static int process_all_extents(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *eb;
+ int slot;
+
+ root = sctx->send_root;
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = sctx->cmp_key->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = process_extent(sctx, path, &found_key);
+ if (ret < 0)
+ goto out;
+
+ path->slots[0]++;
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+ int *pending_move,
+ int *refs_processed)
+{
+ int ret = 0;
+
+ if (sctx->cur_ino == 0)
+ goto out;
+ if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
+ sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
+ goto out;
+ if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
+ goto out;
+
+ ret = process_recorded_refs(sctx, pending_move);
+ if (ret < 0)
+ goto out;
+
+ *refs_processed = 1;
+out:
+ return ret;
+}
+
+static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
+{
+ int ret = 0;
+ u64 left_mode;
+ u64 left_uid;
+ u64 left_gid;
+ u64 right_mode;
+ u64 right_uid;
+ u64 right_gid;
+ int need_chmod = 0;
+ int need_chown = 0;
+ int pending_move = 0;
+ int refs_processed = 0;
+
+ ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
+ &refs_processed);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We have processed the refs and thus need to advance send_progress.
+ * Now, calls to get_cur_xxx will take the updated refs of the current
+ * inode into account.
+ *
+ * On the other hand, if our current inode is a directory and couldn't
+ * be moved/renamed because its parent was renamed/moved too and it has
+ * a higher inode number, we can only move/rename our current inode
+ * after we moved/renamed its parent. Therefore in this case operate on
+ * the old path (pre move/rename) of our current inode, and the
+ * move/rename will be performed later.
+ */
+ if (refs_processed && !pending_move)
+ sctx->send_progress = sctx->cur_ino + 1;
+
+ if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
+ goto out;
+ if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
+ goto out;
+
+ ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
+ &left_mode, &left_uid, &left_gid, NULL);
+ if (ret < 0)
+ goto out;
+
+ if (!sctx->parent_root || sctx->cur_inode_new) {
+ need_chown = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode))
+ need_chmod = 1;
+ } else {
+ ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
+ NULL, NULL, &right_mode, &right_uid,
+ &right_gid, NULL);
+ if (ret < 0)
+ goto out;
+
+ if (left_uid != right_uid || left_gid != right_gid)
+ need_chown = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
+ need_chmod = 1;
+ }
+
+ if (S_ISREG(sctx->cur_inode_mode)) {
+ if (need_send_hole(sctx)) {
+ if (sctx->cur_inode_last_extent == (u64)-1 ||
+ sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
+ ret = get_last_extent(sctx, (u64)-1);
+ if (ret)
+ goto out;
+ }
+ if (sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
+ ret = send_hole(sctx, sctx->cur_inode_size);
+ if (ret)
+ goto out;
+ }
+ }
+ ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (need_chown) {
+ ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_uid, left_gid);
+ if (ret < 0)
+ goto out;
+ }
+ if (need_chmod) {
+ ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_mode);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * If other directory inodes depended on our current directory
+ * inode's move/rename, now do their move/rename operations.
+ */
+ if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
+ ret = apply_children_dir_moves(sctx);
+ if (ret)
+ goto out;
+ /*
+ * Need to send that every time, no matter if it actually
+ * changed between the two trees as we have done changes to
+ * the inode before. If our inode is a directory and it's
+ * waiting to be moved/renamed, we will send its utimes when
+ * it's moved/renamed, therefore we don't need to do it here.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+static int changed_inode(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+ struct btrfs_key *key = sctx->cmp_key;
+ struct btrfs_inode_item *left_ii = NULL;
+ struct btrfs_inode_item *right_ii = NULL;
+ u64 left_gen = 0;
+ u64 right_gen = 0;
+
+ sctx->cur_ino = key->objectid;
+ sctx->cur_inode_new_gen = 0;
+ sctx->cur_inode_last_extent = (u64)-1;
+
+ /*
+ * Set send_progress to current inode. This will tell all get_cur_xxx
+ * functions that the current inode's refs are not updated yet. Later,
+ * when process_recorded_refs is finished, it is set to cur_ino + 1.
+ */
+ sctx->send_progress = sctx->cur_ino;
+
+ if (result == BTRFS_COMPARE_TREE_NEW ||
+ result == BTRFS_COMPARE_TREE_CHANGED) {
+ left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
+ sctx->left_path->slots[0],
+ struct btrfs_inode_item);
+ left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
+ left_ii);
+ } else {
+ right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+ sctx->right_path->slots[0],
+ struct btrfs_inode_item);
+ right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+ right_ii);
+ }
+ if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
+ sctx->right_path->slots[0],
+ struct btrfs_inode_item);
+
+ right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
+ right_ii);
+
+ /*
+ * The cur_ino = root dir case is special here. We can't treat
+ * the inode as deleted+reused because it would generate a
+ * stream that tries to delete/mkdir the root dir.
+ */
+ if (left_gen != right_gen &&
+ sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
+ sctx->cur_inode_new_gen = 1;
+ }
+
+ if (result == BTRFS_COMPARE_TREE_NEW) {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = 1;
+ sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
+ if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
+ ret = send_create_inode_if_needed(sctx);
+ } else if (result == BTRFS_COMPARE_TREE_DELETED) {
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = 0;
+ sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ /*
+ * We need to do some special handling in case the inode was
+ * reported as changed with a changed generation number. This
+ * means that the original inode was deleted and new inode
+ * reused the same inum. So we have to treat the old inode as
+ * deleted and the new one as new.
+ */
+ if (sctx->cur_inode_new_gen) {
+ /*
+ * First, process the inode as if it was deleted.
+ */
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = 0;
+ sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ ret = process_all_refs(sctx,
+ BTRFS_COMPARE_TREE_DELETED);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Now process the inode as if it was new.
+ */
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = 1;
+ sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
+ ret = send_create_inode_if_needed(sctx);
+ if (ret < 0)
+ goto out;
+
+ ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+ if (ret < 0)
+ goto out;
+ /*
+ * Advance send_progress now as we did not get into
+ * process_recorded_refs_if_needed in the new_gen case.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+
+ /*
+ * Now process all extents and xattrs of the inode as if
+ * they were all new.
+ */
+ ret = process_all_extents(sctx);
+ if (ret < 0)
+ goto out;
+ ret = process_all_new_xattrs(sctx);
+ if (ret < 0)
+ goto out;
+ } else {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = 0;
+ sctx->cur_inode_new_gen = 0;
+ sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0], left_ii);
+ }
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * We have to process new refs before deleted refs, but compare_trees gives us
+ * the new and deleted refs mixed. To fix this, we record the new/deleted refs
+ * first and later process them in process_recorded_refs.
+ * For the cur_inode_new_gen case, we skip recording completely because
+ * changed_inode did already initiate processing of refs. The reason for this is
+ * that in this case, compare_tree actually compares the refs of 2 different
+ * inodes. To fix this, process_all_refs is used in changed_inode to handle all
+ * refs of the right tree as deleted and all refs of the left tree as new.
+ */
+static int changed_ref(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+ if (!sctx->cur_inode_new_gen &&
+ sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ ret = record_new_ref(sctx);
+ else if (result == BTRFS_COMPARE_TREE_DELETED)
+ ret = record_deleted_ref(sctx);
+ else if (result == BTRFS_COMPARE_TREE_CHANGED)
+ ret = record_changed_ref(sctx);
+ }
+
+ return ret;
+}
+
+/*
+ * Process new/deleted/changed xattrs. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of xattrs. The reason is the same as in changed_ref
+ */
+static int changed_xattr(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ ret = process_new_xattr(sctx);
+ else if (result == BTRFS_COMPARE_TREE_DELETED)
+ ret = process_deleted_xattr(sctx);
+ else if (result == BTRFS_COMPARE_TREE_CHANGED)
+ ret = process_changed_xattr(sctx);
+ }
+
+ return ret;
+}
+
+/*
+ * Process new/deleted/changed extents. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of extents. The reason is the same as in changed_ref
+ */
+static int changed_extent(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result != BTRFS_COMPARE_TREE_DELETED)
+ ret = process_extent(sctx, sctx->left_path,
+ sctx->cmp_key);
+ }
+
+ return ret;
+}
+
+static int dir_changed(struct send_ctx *sctx, u64 dir)
+{
+ u64 orig_gen, new_gen;
+ int ret;
+
+ ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
+ NULL, NULL);
+ if (ret)
+ return ret;
+
+ ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret)
+ return ret;
+
+ return (orig_gen != new_gen) ? 1 : 0;
+}
+
+static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+ u64 dirid = 0, last_dirid = 0;
+ unsigned long ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int ref_name_len;
+ int ret = 0;
+
+ /* Easy case, just check this one dirid */
+ if (key->type == BTRFS_INODE_REF_KEY) {
+ dirid = key->offset;
+
+ ret = dir_changed(sctx, dirid);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *)(ptr +
+ cur_offset);
+ dirid = btrfs_inode_extref_parent(leaf, extref);
+ ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+ cur_offset += ref_name_len + sizeof(*extref);
+ if (dirid == last_dirid)
+ continue;
+ ret = dir_changed(sctx, dirid);
+ if (ret)
+ break;
+ last_dirid = dirid;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Updates compare related fields in sctx and simply forwards to the actual
+ * changed_xxx functions.
+ */
+static int changed_cb(struct btrfs_root *left_root,
+ struct btrfs_root *right_root,
+ struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ struct btrfs_key *key,
+ enum btrfs_compare_tree_result result,
+ void *ctx)
+{
+ int ret = 0;
+ struct send_ctx *sctx = ctx;
+
+ if (result == BTRFS_COMPARE_TREE_SAME) {
+ if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY) {
+ ret = compare_refs(sctx, left_path, key);
+ if (!ret)
+ return 0;
+ if (ret < 0)
+ return ret;
+ } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
+ return maybe_send_hole(sctx, left_path, key);
+ } else {
+ return 0;
+ }
+ result = BTRFS_COMPARE_TREE_CHANGED;
+ ret = 0;
+ }
+
+ sctx->left_path = left_path;
+ sctx->right_path = right_path;
+ sctx->cmp_key = key;
+
+ ret = finish_inode_if_needed(sctx, 0);
+ if (ret < 0)
+ goto out;
+
+ /* Ignore non-FS objects */
+ if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
+ key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+ goto out;
+
+ if (key->type == BTRFS_INODE_ITEM_KEY)
+ ret = changed_inode(sctx, result);
+ else if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = changed_ref(sctx, result);
+ else if (key->type == BTRFS_XATTR_ITEM_KEY)
+ ret = changed_xattr(sctx, result);
+ else if (key->type == BTRFS_EXTENT_DATA_KEY)
+ ret = changed_extent(sctx, result);
+
+out:
+ return ret;
+}
+
+static int full_send_tree(struct send_ctx *sctx)
+{
+ int ret;
+ struct btrfs_root *send_root = sctx->send_root;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *eb;
+ int slot;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ goto out_finish;
+
+ while (1) {
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+
+ ret = changed_cb(send_root, NULL, path, NULL,
+ &found_key, BTRFS_COMPARE_TREE_NEW, sctx);
+ if (ret < 0)
+ goto out;
+
+ key.objectid = found_key.objectid;
+ key.type = found_key.type;
+ key.offset = found_key.offset + 1;
+
+ ret = btrfs_next_item(send_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ ret = 0;
+ break;
+ }
+ }
+
+out_finish:
+ ret = finish_inode_if_needed(sctx, 1);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int send_subvol(struct send_ctx *sctx)
+{
+ int ret;
+
+ if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
+ ret = send_header(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = send_subvol_begin(sctx);
+ if (ret < 0)
+ goto out;
+
+ if (sctx->parent_root) {
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
+ changed_cb, sctx);
+ if (ret < 0)
+ goto out;
+ ret = finish_inode_if_needed(sctx, 1);
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = full_send_tree(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ free_recorded_refs(sctx);
+ return ret;
+}
+
+/*
+ * If orphan cleanup did remove any orphans from a root, it means the tree
+ * was modified and therefore the commit root is not the same as the current
+ * root anymore. This is a problem, because send uses the commit root and
+ * therefore can see inode items that don't exist in the current root anymore,
+ * and for example make calls to btrfs_iget, which will do tree lookups based
+ * on the current root and not on the commit root. Those lookups will fail,
+ * returning a -ESTALE error, and making send fail with that error. So make
+ * sure a send does not see any orphans we have just removed, and that it will
+ * see the same inodes regardless of whether a transaction commit happened
+ * before it started (meaning that the commit root will be the same as the
+ * current root) or not.
+ */
+static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
+{
+ int i;
+ struct btrfs_trans_handle *trans = NULL;
+
+again:
+ if (sctx->parent_root &&
+ sctx->parent_root->node != sctx->parent_root->commit_root)
+ goto commit_trans;
+
+ for (i = 0; i < sctx->clone_roots_cnt; i++)
+ if (sctx->clone_roots[i].root->node !=
+ sctx->clone_roots[i].root->commit_root)
+ goto commit_trans;
+
+ if (trans)
+ return btrfs_end_transaction(trans, sctx->send_root);
+
+ return 0;
+
+commit_trans:
+ /* Use any root, all fs roots will get their commit roots updated. */
+ if (!trans) {
+ trans = btrfs_join_transaction(sctx->send_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ goto again;
+ }
+
+ return btrfs_commit_transaction(trans, sctx->send_root);
+}
+
+static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
+{
+ spin_lock(&root->root_item_lock);
+ root->send_in_progress--;
+ /*
+ * Not much left to do, we don't know why it's unbalanced and
+ * can't blindly reset it to 0.
+ */
+ if (root->send_in_progress < 0)
+ btrfs_err(root->fs_info,
+ "send_in_progres unbalanced %d root %llu",
+ root->send_in_progress, root->root_key.objectid);
+ spin_unlock(&root->root_item_lock);
+}
+
+long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
+{
+ int ret = 0;
+ struct btrfs_root *send_root;
+ struct btrfs_root *clone_root;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_ioctl_send_args *arg = NULL;
+ struct btrfs_key key;
+ struct send_ctx *sctx = NULL;
+ u32 i;
+ u64 *clone_sources_tmp = NULL;
+ int clone_sources_to_rollback = 0;
+ int sort_clone_roots = 0;
+ int index;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ send_root = BTRFS_I(file_inode(mnt_file))->root;
+ fs_info = send_root->fs_info;
+
+ /*
+ * The subvolume must remain read-only during send, protect against
+ * making it RW. This also protects against deletion.
+ */
+ spin_lock(&send_root->root_item_lock);
+ send_root->send_in_progress++;
+ spin_unlock(&send_root->root_item_lock);
+
+ /*
+ * This is done when we lookup the root, it should already be complete
+ * by the time we get here.
+ */
+ WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
+
+ /*
+ * Userspace tools do the checks and warn the user if it's
+ * not RO.
+ */
+ if (!btrfs_root_readonly(send_root)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ arg = memdup_user(arg_, sizeof(*arg));
+ if (IS_ERR(arg)) {
+ ret = PTR_ERR(arg);
+ arg = NULL;
+ goto out;
+ }
+
+ if (!access_ok(VERIFY_READ, arg->clone_sources,
+ sizeof(*arg->clone_sources) *
+ arg->clone_sources_count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
+ if (!sctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&sctx->new_refs);
+ INIT_LIST_HEAD(&sctx->deleted_refs);
+ INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
+ INIT_LIST_HEAD(&sctx->name_cache_list);
+
+ sctx->flags = arg->flags;
+
+ sctx->send_filp = fget(arg->send_fd);
+ if (!sctx->send_filp) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ sctx->send_root = send_root;
+ /*
+ * Unlikely but possible, if the subvolume is marked for deletion but
+ * is slow to remove the directory entry, send can still be started
+ */
+ if (btrfs_root_dead(sctx->send_root)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ sctx->clone_roots_cnt = arg->clone_sources_count;
+
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
+ sctx->send_buf = vmalloc(sctx->send_max_size);
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
+ if (!sctx->read_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ sctx->pending_dir_moves = RB_ROOT;
+ sctx->waiting_dir_moves = RB_ROOT;
+ sctx->orphan_dirs = RB_ROOT;
+
+ sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
+ (arg->clone_sources_count + 1));
+ if (!sctx->clone_roots) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (arg->clone_sources_count) {
+ clone_sources_tmp = vmalloc(arg->clone_sources_count *
+ sizeof(*arg->clone_sources));
+ if (!clone_sources_tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
+ arg->clone_sources_count *
+ sizeof(*arg->clone_sources));
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ for (i = 0; i < arg->clone_sources_count; i++) {
+ key.objectid = clone_sources_tmp[i];
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(clone_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = PTR_ERR(clone_root);
+ goto out;
+ }
+ spin_lock(&clone_root->root_item_lock);
+ if (!btrfs_root_readonly(clone_root) ||
+ btrfs_root_dead(clone_root)) {
+ spin_unlock(&clone_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -EPERM;
+ goto out;
+ }
+ clone_root->send_in_progress++;
+ spin_unlock(&clone_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ sctx->clone_roots[i].root = clone_root;
+ clone_sources_to_rollback = i + 1;
+ }
+ vfree(clone_sources_tmp);
+ clone_sources_tmp = NULL;
+ }
+
+ if (arg->parent_root) {
+ key.objectid = arg->parent_root;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(sctx->parent_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = PTR_ERR(sctx->parent_root);
+ goto out;
+ }
+
+ spin_lock(&sctx->parent_root->root_item_lock);
+ sctx->parent_root->send_in_progress++;
+ if (!btrfs_root_readonly(sctx->parent_root) ||
+ btrfs_root_dead(sctx->parent_root)) {
+ spin_unlock(&sctx->parent_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -EPERM;
+ goto out;
+ }
+ spin_unlock(&sctx->parent_root->root_item_lock);
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ }
+
+ /*
+ * Clones from send_root are allowed, but only if the clone source
+ * is behind the current send position. This is checked while searching
+ * for possible clone sources.
+ */
+ sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
+
+ /* We do a bsearch later */
+ sort(sctx->clone_roots, sctx->clone_roots_cnt,
+ sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
+ NULL);
+ sort_clone_roots = 1;
+
+ ret = ensure_commit_roots_uptodate(sctx);
+ if (ret)
+ goto out;
+
+ current->journal_info = BTRFS_SEND_TRANS_STUB;
+ ret = send_subvol(sctx);
+ current->journal_info = NULL;
+ if (ret < 0)
+ goto out;
+
+ if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
+ ret = begin_cmd(sctx, BTRFS_SEND_C_END);
+ if (ret < 0)
+ goto out;
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
+ struct rb_node *n;
+ struct pending_dir_move *pm;
+
+ n = rb_first(&sctx->pending_dir_moves);
+ pm = rb_entry(n, struct pending_dir_move, node);
+ while (!list_empty(&pm->list)) {
+ struct pending_dir_move *pm2;
+
+ pm2 = list_first_entry(&pm->list,
+ struct pending_dir_move, list);
+ free_pending_move(sctx, pm2);
+ }
+ free_pending_move(sctx, pm);
+ }
+
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
+ struct rb_node *n;
+ struct waiting_dir_move *dm;
+
+ n = rb_first(&sctx->waiting_dir_moves);
+ dm = rb_entry(n, struct waiting_dir_move, node);
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+ }
+
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+ struct rb_node *n;
+ struct orphan_dir_info *odi;
+
+ n = rb_first(&sctx->orphan_dirs);
+ odi = rb_entry(n, struct orphan_dir_info, node);
+ free_orphan_dir_info(sctx, odi);
+ }
+
+ if (sort_clone_roots) {
+ for (i = 0; i < sctx->clone_roots_cnt; i++)
+ btrfs_root_dec_send_in_progress(
+ sctx->clone_roots[i].root);
+ } else {
+ for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+ btrfs_root_dec_send_in_progress(
+ sctx->clone_roots[i].root);
+
+ btrfs_root_dec_send_in_progress(send_root);
+ }
+ if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+ btrfs_root_dec_send_in_progress(sctx->parent_root);
+
+ kfree(arg);
+ vfree(clone_sources_tmp);
+
+ if (sctx) {
+ if (sctx->send_filp)
+ fput(sctx->send_filp);
+
+ vfree(sctx->clone_roots);
+ vfree(sctx->send_buf);
+ vfree(sctx->read_buf);
+
+ name_cache_free(sctx);
+
+ kfree(sctx);
+ }
+
+ return ret;
+}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
new file mode 100644
index 000000000..48d425aef
--- /dev/null
+++ b/fs/btrfs/send.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2012 Alexander Block. All rights reserved.
+ * Copyright (C) 2012 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+
+#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
+#define BTRFS_SEND_STREAM_VERSION 1
+
+#define BTRFS_SEND_BUF_SIZE (1024 * 64)
+#define BTRFS_SEND_READ_SIZE (1024 * 48)
+
+enum btrfs_tlv_type {
+ BTRFS_TLV_U8,
+ BTRFS_TLV_U16,
+ BTRFS_TLV_U32,
+ BTRFS_TLV_U64,
+ BTRFS_TLV_BINARY,
+ BTRFS_TLV_STRING,
+ BTRFS_TLV_UUID,
+ BTRFS_TLV_TIMESPEC,
+};
+
+struct btrfs_stream_header {
+ char magic[sizeof(BTRFS_SEND_STREAM_MAGIC)];
+ __le32 version;
+} __attribute__ ((__packed__));
+
+struct btrfs_cmd_header {
+ /* len excluding the header */
+ __le32 len;
+ __le16 cmd;
+ /* crc including the header with zero crc field */
+ __le32 crc;
+} __attribute__ ((__packed__));
+
+struct btrfs_tlv_header {
+ __le16 tlv_type;
+ /* len excluding the header */
+ __le16 tlv_len;
+} __attribute__ ((__packed__));
+
+/* commands */
+enum btrfs_send_cmd {
+ BTRFS_SEND_C_UNSPEC,
+
+ BTRFS_SEND_C_SUBVOL,
+ BTRFS_SEND_C_SNAPSHOT,
+
+ BTRFS_SEND_C_MKFILE,
+ BTRFS_SEND_C_MKDIR,
+ BTRFS_SEND_C_MKNOD,
+ BTRFS_SEND_C_MKFIFO,
+ BTRFS_SEND_C_MKSOCK,
+ BTRFS_SEND_C_SYMLINK,
+
+ BTRFS_SEND_C_RENAME,
+ BTRFS_SEND_C_LINK,
+ BTRFS_SEND_C_UNLINK,
+ BTRFS_SEND_C_RMDIR,
+
+ BTRFS_SEND_C_SET_XATTR,
+ BTRFS_SEND_C_REMOVE_XATTR,
+
+ BTRFS_SEND_C_WRITE,
+ BTRFS_SEND_C_CLONE,
+
+ BTRFS_SEND_C_TRUNCATE,
+ BTRFS_SEND_C_CHMOD,
+ BTRFS_SEND_C_CHOWN,
+ BTRFS_SEND_C_UTIMES,
+
+ BTRFS_SEND_C_END,
+ BTRFS_SEND_C_UPDATE_EXTENT,
+ __BTRFS_SEND_C_MAX,
+};
+#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
+
+/* attributes in send stream */
+enum {
+ BTRFS_SEND_A_UNSPEC,
+
+ BTRFS_SEND_A_UUID,
+ BTRFS_SEND_A_CTRANSID,
+
+ BTRFS_SEND_A_INO,
+ BTRFS_SEND_A_SIZE,
+ BTRFS_SEND_A_MODE,
+ BTRFS_SEND_A_UID,
+ BTRFS_SEND_A_GID,
+ BTRFS_SEND_A_RDEV,
+ BTRFS_SEND_A_CTIME,
+ BTRFS_SEND_A_MTIME,
+ BTRFS_SEND_A_ATIME,
+ BTRFS_SEND_A_OTIME,
+
+ BTRFS_SEND_A_XATTR_NAME,
+ BTRFS_SEND_A_XATTR_DATA,
+
+ BTRFS_SEND_A_PATH,
+ BTRFS_SEND_A_PATH_TO,
+ BTRFS_SEND_A_PATH_LINK,
+
+ BTRFS_SEND_A_FILE_OFFSET,
+ BTRFS_SEND_A_DATA,
+
+ BTRFS_SEND_A_CLONE_UUID,
+ BTRFS_SEND_A_CLONE_CTRANSID,
+ BTRFS_SEND_A_CLONE_PATH,
+ BTRFS_SEND_A_CLONE_OFFSET,
+ BTRFS_SEND_A_CLONE_LEN,
+
+ __BTRFS_SEND_A_MAX,
+};
+#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
+
+#ifdef __KERNEL__
+long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
+#endif
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000..b976597b0
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/highmem.h>
+#include <asm/unaligned.h>
+
+#include "ctree.h"
+
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
+/*
+ * this is some deeply nasty code.
+ *
+ * The end result is that anyone who #includes ctree.h gets a
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions,
+ * which are wappers of btrfs_set_token_#bits functions and
+ * btrfs_get_token_#bits functions, which are defined in this file.
+ *
+ * These setget functions do all the extent_buffer related mapping
+ * required to efficiently read and write specific fields in the extent
+ * buffers. Every pointer to metadata items in btrfs is really just
+ * an unsigned long offset into the extent buffer which has been
+ * cast to a specific type. This gives us all the gcc type checking.
+ *
+ * The extent buffer api is used to do the page spanning work required to
+ * have a metadata blocksize different from the page size.
+ */
+
+#define DEFINE_BTRFS_SETGET_BITS(bits) \
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
+ unsigned long off, \
+ struct btrfs_map_token *token) \
+{ \
+ unsigned long part_offset = (unsigned long)ptr; \
+ unsigned long offset = part_offset + off; \
+ void *p; \
+ int err; \
+ char *kaddr; \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ int size = sizeof(u##bits); \
+ u##bits res; \
+ \
+ if (token && token->kaddr && token->offset <= offset && \
+ token->eb == eb && \
+ (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
+ kaddr = token->kaddr; \
+ p = kaddr + part_offset - token->offset; \
+ res = get_unaligned_le##bits(p + off); \
+ return res; \
+ } \
+ err = map_private_extent_buffer(eb, offset, size, \
+ &kaddr, &map_start, &map_len); \
+ if (err) { \
+ __le##bits leres; \
+ \
+ read_extent_buffer(eb, &leres, offset, size); \
+ return le##bits##_to_cpu(leres); \
+ } \
+ p = kaddr + part_offset - map_start; \
+ res = get_unaligned_le##bits(p + off); \
+ if (token) { \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+ token->eb = eb; \
+ } \
+ return res; \
+} \
+void btrfs_set_token_##bits(struct extent_buffer *eb, \
+ void *ptr, unsigned long off, u##bits val, \
+ struct btrfs_map_token *token) \
+{ \
+ unsigned long part_offset = (unsigned long)ptr; \
+ unsigned long offset = part_offset + off; \
+ void *p; \
+ int err; \
+ char *kaddr; \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ int size = sizeof(u##bits); \
+ \
+ if (token && token->kaddr && token->offset <= offset && \
+ token->eb == eb && \
+ (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
+ kaddr = token->kaddr; \
+ p = kaddr + part_offset - token->offset; \
+ put_unaligned_le##bits(val, p + off); \
+ return; \
+ } \
+ err = map_private_extent_buffer(eb, offset, size, \
+ &kaddr, &map_start, &map_len); \
+ if (err) { \
+ __le##bits val2; \
+ \
+ val2 = cpu_to_le##bits(val); \
+ write_extent_buffer(eb, &val2, offset, size); \
+ return; \
+ } \
+ p = kaddr + part_offset - map_start; \
+ put_unaligned_le##bits(val, p + off); \
+ if (token) { \
+ token->kaddr = kaddr; \
+ token->offset = map_start; \
+ token->eb = eb; \
+ } \
+}
+
+DEFINE_BTRFS_SETGET_BITS(8)
+DEFINE_BTRFS_SETGET_BITS(16)
+DEFINE_BTRFS_SETGET_BITS(32)
+DEFINE_BTRFS_SETGET_BITS(64)
+
+void btrfs_node_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr = btrfs_node_key_ptr_offset(nr);
+ read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000..9e66f5e72
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,2220 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/parser.h>
+#include <linux/ctype.h>
+#include <linux/namei.h>
+#include <linux/miscdevice.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/cleancache.h>
+#include <linux/ratelimit.h>
+#include <linux/btrfs.h>
+#include "delayed-inode.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "hash.h"
+#include "props.h"
+#include "xattr.h"
+#include "volumes.h"
+#include "export.h"
+#include "compression.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "free-space-cache.h"
+#include "backref.h"
+#include "tests/btrfs-tests.h"
+
+#include "qgroup.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/btrfs.h>
+
+static const struct super_operations btrfs_super_ops;
+static struct file_system_type btrfs_fs_type;
+
+static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+
+static const char *btrfs_decode_error(int errno)
+{
+ char *errstr = "unknown";
+
+ switch (errno) {
+ case -EIO:
+ errstr = "IO failure";
+ break;
+ case -ENOMEM:
+ errstr = "Out of memory";
+ break;
+ case -EROFS:
+ errstr = "Readonly filesystem";
+ break;
+ case -EEXIST:
+ errstr = "Object already exists";
+ break;
+ case -ENOSPC:
+ errstr = "No space left";
+ break;
+ case -ENOENT:
+ errstr = "No such entry";
+ break;
+ }
+
+ return errstr;
+}
+
+static void save_error_info(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * today we only save the error info into ram. Long term we'll
+ * also send it down to the disk
+ */
+ set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+}
+
+/* btrfs handle error by forcing the filesystem readonly */
+static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
+{
+ struct super_block *sb = fs_info->sb;
+
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ sb->s_flags |= MS_RDONLY;
+ btrfs_info(fs_info, "forced readonly");
+ /*
+ * Note that a running device replace operation is not
+ * canceled here although there is no way to update
+ * the progress. It would add the risk of a deadlock,
+ * therefore the canceling is ommited. The only penalty
+ * is that some I/O remains active until the procedure
+ * completes. The next time when the filesystem is
+ * mounted writeable again, the device replace
+ * operation continues.
+ */
+ }
+}
+
+#ifdef CONFIG_PRINTK
+/*
+ * __btrfs_std_error decodes expected errors from the caller and
+ * invokes the approciate error response.
+ */
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ struct super_block *sb = fs_info->sb;
+ const char *errstr;
+
+ /*
+ * Special case: if the error is EROFS, and we're already
+ * under MS_RDONLY, then it is safe here.
+ */
+ if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+ return;
+
+ errstr = btrfs_decode_error(errno);
+ if (fmt) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_CRIT
+ "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, function, line, errno, errstr, &vaf);
+ va_end(args);
+ } else {
+ printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
+ sb->s_id, function, line, errno, errstr);
+ }
+
+ /* Don't go through full error handling during mount */
+ save_error_info(fs_info);
+ if (sb->s_flags & MS_BORN)
+ btrfs_handle_error(fs_info);
+}
+
+static const char * const logtypes[] = {
+ "emergency",
+ "alert",
+ "critical",
+ "error",
+ "warning",
+ "notice",
+ "info",
+ "debug",
+};
+
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+ struct super_block *sb = fs_info->sb;
+ char lvl[4];
+ struct va_format vaf;
+ va_list args;
+ const char *type = logtypes[4];
+ int kern_level;
+
+ va_start(args, fmt);
+
+ kern_level = printk_get_level(fmt);
+ if (kern_level) {
+ size_t size = printk_skip_level(fmt) - fmt;
+ memcpy(lvl, fmt, size);
+ lvl[size] = '\0';
+ fmt += size;
+ type = logtypes[kern_level - '0'];
+ } else
+ *lvl = '\0';
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
+
+ va_end(args);
+}
+
+#else
+
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ struct super_block *sb = fs_info->sb;
+
+ /*
+ * Special case: if the error is EROFS, and we're already
+ * under MS_RDONLY, then it is safe here.
+ */
+ if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+ return;
+
+ /* Don't go through full error handling during mount */
+ if (sb->s_flags & MS_BORN) {
+ save_error_info(fs_info);
+ btrfs_handle_error(fs_info);
+ }
+}
+#endif
+
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *function,
+ unsigned int line, int errno)
+{
+ /*
+ * Report first abort since mount
+ */
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
+ &root->fs_info->fs_state)) {
+ WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
+ errno);
+ }
+ trans->aborted = errno;
+ /* Nothing used. The other threads that have joined this
+ * transaction may be able to continue. */
+ if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
+ const char *errstr;
+
+ errstr = btrfs_decode_error(errno);
+ btrfs_warn(root->fs_info,
+ "%s:%d: Aborting unused transaction(%s).",
+ function, line, errstr);
+ return;
+ }
+ ACCESS_ONCE(trans->transaction->aborted) = errno;
+ /* Wake up anybody who may be waiting on this transaction */
+ wake_up(&root->fs_info->transaction_wait);
+ wake_up(&root->fs_info->transaction_blocked_wait);
+ __btrfs_std_error(root->fs_info, function, line, errno, NULL);
+}
+/*
+ * __btrfs_panic decodes unexpected, fatal errors from the caller,
+ * issues an alert, and either panics or BUGs, depending on mount options.
+ */
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno, const char *fmt, ...)
+{
+ char *s_id = "<unknown>";
+ const char *errstr;
+ struct va_format vaf = { .fmt = fmt };
+ va_list args;
+
+ if (fs_info)
+ s_id = fs_info->sb->s_id;
+
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ errstr = btrfs_decode_error(errno);
+ if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
+ panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+ s_id, function, line, &vaf, errno, errstr);
+
+ btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
+ function, line, &vaf, errno, errstr);
+ va_end(args);
+ /* Caller calls BUG() */
+}
+
+static void btrfs_put_super(struct super_block *sb)
+{
+ close_ctree(btrfs_sb(sb)->tree_root);
+}
+
+enum {
+ Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
+ Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
+ Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
+ Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
+ Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
+ Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
+ Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+ Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+ Opt_check_integrity, Opt_check_integrity_including_extent_data,
+ Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
+ Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
+ Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
+ Opt_datasum, Opt_treelog, Opt_noinode_cache,
+ Opt_err,
+};
+
+static match_table_t tokens = {
+ {Opt_degraded, "degraded"},
+ {Opt_subvol, "subvol=%s"},
+ {Opt_subvolid, "subvolid=%s"},
+ {Opt_device, "device=%s"},
+ {Opt_nodatasum, "nodatasum"},
+ {Opt_datasum, "datasum"},
+ {Opt_nodatacow, "nodatacow"},
+ {Opt_datacow, "datacow"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_barrier, "barrier"},
+ {Opt_max_inline, "max_inline=%s"},
+ {Opt_alloc_start, "alloc_start=%s"},
+ {Opt_thread_pool, "thread_pool=%d"},
+ {Opt_compress, "compress"},
+ {Opt_compress_type, "compress=%s"},
+ {Opt_compress_force, "compress-force"},
+ {Opt_compress_force_type, "compress-force=%s"},
+ {Opt_ssd, "ssd"},
+ {Opt_ssd_spread, "ssd_spread"},
+ {Opt_nossd, "nossd"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_notreelog, "notreelog"},
+ {Opt_treelog, "treelog"},
+ {Opt_flushoncommit, "flushoncommit"},
+ {Opt_noflushoncommit, "noflushoncommit"},
+ {Opt_ratio, "metadata_ratio=%d"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
+ {Opt_space_cache, "space_cache"},
+ {Opt_clear_cache, "clear_cache"},
+ {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+ {Opt_enospc_debug, "enospc_debug"},
+ {Opt_noenospc_debug, "noenospc_debug"},
+ {Opt_subvolrootid, "subvolrootid=%d"},
+ {Opt_defrag, "autodefrag"},
+ {Opt_nodefrag, "noautodefrag"},
+ {Opt_inode_cache, "inode_cache"},
+ {Opt_noinode_cache, "noinode_cache"},
+ {Opt_no_space_cache, "nospace_cache"},
+ {Opt_recovery, "recovery"},
+ {Opt_skip_balance, "skip_balance"},
+ {Opt_check_integrity, "check_int"},
+ {Opt_check_integrity_including_extent_data, "check_int_data"},
+ {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
+ {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
+ {Opt_fatal_errors, "fatal_errors=%s"},
+ {Opt_commit_interval, "commit=%d"},
+ {Opt_err, NULL},
+};
+
+/*
+ * Regular mount options parser. Everything that is needed only when
+ * reading in a new superblock is parsed here.
+ * XXX JDM: This needs to be cleaned up for remount.
+ */
+int btrfs_parse_options(struct btrfs_root *root, char *options)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ substring_t args[MAX_OPT_ARGS];
+ char *p, *num, *orig = NULL;
+ u64 cache_gen;
+ int intarg;
+ int ret = 0;
+ char *compress_type;
+ bool compress_force = false;
+
+ cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
+ if (cache_gen)
+ btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+
+ if (!options)
+ goto out;
+
+ /*
+ * strsep changes the string, duplicate it because parse_options
+ * gets called twice
+ */
+ options = kstrdup(options, GFP_NOFS);
+ if (!options)
+ return -ENOMEM;
+
+ orig = options;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_degraded:
+ btrfs_info(root->fs_info, "allowing degraded mounts");
+ btrfs_set_opt(info->mount_opt, DEGRADED);
+ break;
+ case Opt_subvol:
+ case Opt_subvolid:
+ case Opt_subvolrootid:
+ case Opt_device:
+ /*
+ * These are parsed by btrfs_parse_early_options
+ * and can be happily ignored here.
+ */
+ break;
+ case Opt_nodatasum:
+ btrfs_set_and_info(root, NODATASUM,
+ "setting nodatasum");
+ break;
+ case Opt_datasum:
+ if (btrfs_test_opt(root, NODATASUM)) {
+ if (btrfs_test_opt(root, NODATACOW))
+ btrfs_info(root->fs_info, "setting datasum, datacow enabled");
+ else
+ btrfs_info(root->fs_info, "setting datasum");
+ }
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ break;
+ case Opt_nodatacow:
+ if (!btrfs_test_opt(root, NODATACOW)) {
+ if (!btrfs_test_opt(root, COMPRESS) ||
+ !btrfs_test_opt(root, FORCE_COMPRESS)) {
+ btrfs_info(root->fs_info,
+ "setting nodatacow, compression disabled");
+ } else {
+ btrfs_info(root->fs_info, "setting nodatacow");
+ }
+ }
+ btrfs_clear_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+ btrfs_set_opt(info->mount_opt, NODATACOW);
+ btrfs_set_opt(info->mount_opt, NODATASUM);
+ break;
+ case Opt_datacow:
+ btrfs_clear_and_info(root, NODATACOW,
+ "setting datacow");
+ break;
+ case Opt_compress_force:
+ case Opt_compress_force_type:
+ compress_force = true;
+ /* Fallthrough */
+ case Opt_compress:
+ case Opt_compress_type:
+ if (token == Opt_compress ||
+ token == Opt_compress_force ||
+ strcmp(args[0].from, "zlib") == 0) {
+ compress_type = "zlib";
+ info->compress_type = BTRFS_COMPRESS_ZLIB;
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ } else if (strcmp(args[0].from, "lzo") == 0) {
+ compress_type = "lzo";
+ info->compress_type = BTRFS_COMPRESS_LZO;
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
+ btrfs_set_fs_incompat(info, COMPRESS_LZO);
+ } else if (strncmp(args[0].from, "no", 2) == 0) {
+ compress_type = "no";
+ btrfs_clear_opt(info->mount_opt, COMPRESS);
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+ compress_force = false;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (compress_force) {
+ btrfs_set_and_info(root, FORCE_COMPRESS,
+ "force %s compression",
+ compress_type);
+ } else {
+ if (!btrfs_test_opt(root, COMPRESS))
+ btrfs_info(root->fs_info,
+ "btrfs: use %s compression",
+ compress_type);
+ /*
+ * If we remount from compress-force=xxx to
+ * compress=xxx, we need clear FORCE_COMPRESS
+ * flag, otherwise, there is no way for users
+ * to disable forcible compression separately.
+ */
+ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+ }
+ break;
+ case Opt_ssd:
+ btrfs_set_and_info(root, SSD,
+ "use ssd allocation scheme");
+ break;
+ case Opt_ssd_spread:
+ btrfs_set_and_info(root, SSD_SPREAD,
+ "use spread ssd allocation scheme");
+ btrfs_set_opt(info->mount_opt, SSD);
+ break;
+ case Opt_nossd:
+ btrfs_set_and_info(root, NOSSD,
+ "not using ssd allocation scheme");
+ btrfs_clear_opt(info->mount_opt, SSD);
+ break;
+ case Opt_barrier:
+ btrfs_clear_and_info(root, NOBARRIER,
+ "turning on barriers");
+ break;
+ case Opt_nobarrier:
+ btrfs_set_and_info(root, NOBARRIER,
+ "turning off barriers");
+ break;
+ case Opt_thread_pool:
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ goto out;
+ } else if (intarg > 0) {
+ info->thread_pool_size = intarg;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case Opt_max_inline:
+ num = match_strdup(&args[0]);
+ if (num) {
+ info->max_inline = memparse(num, NULL);
+ kfree(num);
+
+ if (info->max_inline) {
+ info->max_inline = min_t(u64,
+ info->max_inline,
+ root->sectorsize);
+ }
+ btrfs_info(root->fs_info, "max_inline at %llu",
+ info->max_inline);
+ } else {
+ ret = -ENOMEM;
+ goto out;
+ }
+ break;
+ case Opt_alloc_start:
+ num = match_strdup(&args[0]);
+ if (num) {
+ mutex_lock(&info->chunk_mutex);
+ info->alloc_start = memparse(num, NULL);
+ mutex_unlock(&info->chunk_mutex);
+ kfree(num);
+ btrfs_info(root->fs_info, "allocations start at %llu",
+ info->alloc_start);
+ } else {
+ ret = -ENOMEM;
+ goto out;
+ }
+ break;
+ case Opt_acl:
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ root->fs_info->sb->s_flags |= MS_POSIXACL;
+ break;
+#else
+ btrfs_err(root->fs_info,
+ "support for ACL not compiled in!");
+ ret = -EINVAL;
+ goto out;
+#endif
+ case Opt_noacl:
+ root->fs_info->sb->s_flags &= ~MS_POSIXACL;
+ break;
+ case Opt_notreelog:
+ btrfs_set_and_info(root, NOTREELOG,
+ "disabling tree log");
+ break;
+ case Opt_treelog:
+ btrfs_clear_and_info(root, NOTREELOG,
+ "enabling tree log");
+ break;
+ case Opt_flushoncommit:
+ btrfs_set_and_info(root, FLUSHONCOMMIT,
+ "turning on flush-on-commit");
+ break;
+ case Opt_noflushoncommit:
+ btrfs_clear_and_info(root, FLUSHONCOMMIT,
+ "turning off flush-on-commit");
+ break;
+ case Opt_ratio:
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ goto out;
+ } else if (intarg >= 0) {
+ info->metadata_ratio = intarg;
+ btrfs_info(root->fs_info, "metadata ratio %d",
+ info->metadata_ratio);
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case Opt_discard:
+ btrfs_set_and_info(root, DISCARD,
+ "turning on discard");
+ break;
+ case Opt_nodiscard:
+ btrfs_clear_and_info(root, DISCARD,
+ "turning off discard");
+ break;
+ case Opt_space_cache:
+ btrfs_set_and_info(root, SPACE_CACHE,
+ "enabling disk space caching");
+ break;
+ case Opt_rescan_uuid_tree:
+ btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
+ break;
+ case Opt_no_space_cache:
+ btrfs_clear_and_info(root, SPACE_CACHE,
+ "disabling disk space caching");
+ break;
+ case Opt_inode_cache:
+ btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
+ "enabling inode map caching");
+ break;
+ case Opt_noinode_cache:
+ btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
+ "disabling inode map caching");
+ break;
+ case Opt_clear_cache:
+ btrfs_set_and_info(root, CLEAR_CACHE,
+ "force clearing of disk cache");
+ break;
+ case Opt_user_subvol_rm_allowed:
+ btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ break;
+ case Opt_enospc_debug:
+ btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_noenospc_debug:
+ btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_defrag:
+ btrfs_set_and_info(root, AUTO_DEFRAG,
+ "enabling auto defrag");
+ break;
+ case Opt_nodefrag:
+ btrfs_clear_and_info(root, AUTO_DEFRAG,
+ "disabling auto defrag");
+ break;
+ case Opt_recovery:
+ btrfs_info(root->fs_info, "enabling auto recovery");
+ btrfs_set_opt(info->mount_opt, RECOVERY);
+ break;
+ case Opt_skip_balance:
+ btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ case Opt_check_integrity_including_extent_data:
+ btrfs_info(root->fs_info,
+ "enabling check integrity including extent data");
+ btrfs_set_opt(info->mount_opt,
+ CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity:
+ btrfs_info(root->fs_info, "enabling check integrity");
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity_print_mask:
+ ret = match_int(&args[0], &intarg);
+ if (ret) {
+ goto out;
+ } else if (intarg >= 0) {
+ info->check_integrity_print_mask = intarg;
+ btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x",
+ info->check_integrity_print_mask);
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+#else
+ case Opt_check_integrity_including_extent_data:
+ case Opt_check_integrity:
+ case Opt_check_integrity_print_mask:
+ btrfs_err(root->fs_info,
+ "support for check_integrity* not compiled in!");
+ ret = -EINVAL;
+ goto out;
+#endif
+ case Opt_fatal_errors:
+ if (strcmp(args[0].from, "panic") == 0)
+ btrfs_set_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ else if (strcmp(args[0].from, "bug") == 0)
+ btrfs_clear_opt(info->mount_opt,
+ PANIC_ON_FATAL_ERROR);
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case Opt_commit_interval:
+ intarg = 0;
+ ret = match_int(&args[0], &intarg);
+ if (ret < 0) {
+ btrfs_err(root->fs_info, "invalid commit interval");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (intarg > 0) {
+ if (intarg > 300) {
+ btrfs_warn(root->fs_info, "excessive commit interval %d",
+ intarg);
+ }
+ info->commit_interval = intarg;
+ } else {
+ btrfs_info(root->fs_info, "using default commit interval %ds",
+ BTRFS_DEFAULT_COMMIT_INTERVAL);
+ info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ }
+ break;
+ case Opt_err:
+ btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
+ ret = -EINVAL;
+ goto out;
+ default:
+ break;
+ }
+ }
+out:
+ if (!ret && btrfs_test_opt(root, SPACE_CACHE))
+ btrfs_info(root->fs_info, "disk space caching is enabled");
+ kfree(orig);
+ return ret;
+}
+
+/*
+ * Parse mount options that are required early in the mount process.
+ *
+ * All other options will be parsed on much later in the mount process and
+ * only when we need to allocate a new super block.
+ */
+static int btrfs_parse_early_options(const char *options, fmode_t flags,
+ void *holder, char **subvol_name, u64 *subvol_objectid,
+ struct btrfs_fs_devices **fs_devices)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *device_name, *opts, *orig, *p;
+ char *num = NULL;
+ int error = 0;
+
+ if (!options)
+ return 0;
+
+ /*
+ * strsep changes the string, duplicate it because parse_options
+ * gets called twice
+ */
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_subvol:
+ kfree(*subvol_name);
+ *subvol_name = match_strdup(&args[0]);
+ if (!*subvol_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+ break;
+ case Opt_subvolid:
+ num = match_strdup(&args[0]);
+ if (num) {
+ *subvol_objectid = memparse(num, NULL);
+ kfree(num);
+ /* we want the original fs_tree */
+ if (!*subvol_objectid)
+ *subvol_objectid =
+ BTRFS_FS_TREE_OBJECTID;
+ } else {
+ error = -EINVAL;
+ goto out;
+ }
+ break;
+ case Opt_subvolrootid:
+ printk(KERN_WARNING
+ "BTRFS: 'subvolrootid' mount option is deprecated and has "
+ "no effect\n");
+ break;
+ case Opt_device:
+ device_name = match_strdup(&args[0]);
+ if (!device_name) {
+ error = -ENOMEM;
+ goto out;
+ }
+ error = btrfs_scan_one_device(device_name,
+ flags, holder, fs_devices);
+ kfree(device_name);
+ if (error)
+ goto out;
+ break;
+ default:
+ break;
+ }
+ }
+
+out:
+ kfree(orig);
+ return error;
+}
+
+static struct dentry *get_default_root(struct super_block *sb,
+ u64 subvol_objectid)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_root *new_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_key location;
+ struct inode *inode;
+ u64 dir_id;
+ int new = 0;
+
+ /*
+ * We have a specific subvol we want to mount, just setup location and
+ * go look up the root.
+ */
+ if (subvol_objectid) {
+ location.objectid = subvol_objectid;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = (u64)-1;
+ goto find_root;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ path->leave_spinning = 1;
+
+ /*
+ * Find the "default" dir item which points to the root item that we
+ * will mount by default if we haven't been given a specific subvolume
+ * to mount.
+ */
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
+ di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+ if (IS_ERR(di)) {
+ btrfs_free_path(path);
+ return ERR_CAST(di);
+ }
+ if (!di) {
+ /*
+ * Ok the default dir item isn't there. This is weird since
+ * it's always been there, but don't freak out, just try and
+ * mount to root most subvolume.
+ */
+ btrfs_free_path(path);
+ dir_id = BTRFS_FIRST_FREE_OBJECTID;
+ new_root = fs_info->fs_root;
+ goto setup_root;
+ }
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ btrfs_free_path(path);
+
+find_root:
+ new_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ if (IS_ERR(new_root))
+ return ERR_CAST(new_root);
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ int ret;
+ down_read(&fs_info->cleanup_work_sem);
+ ret = btrfs_orphan_cleanup(new_root);
+ up_read(&fs_info->cleanup_work_sem);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ dir_id = btrfs_root_dirid(&new_root->root_item);
+setup_root:
+ location.objectid = dir_id;
+ location.type = BTRFS_INODE_ITEM_KEY;
+ location.offset = 0;
+
+ inode = btrfs_iget(sb, &location, new_root, &new);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ /*
+ * If we're just mounting the root most subvol put the inode and return
+ * a reference to the dentry. We will have already gotten a reference
+ * to the inode in btrfs_fill_super so we're good to go.
+ */
+ if (!new && d_inode(sb->s_root) == inode) {
+ iput(inode);
+ return dget(sb->s_root);
+ }
+
+ return d_obtain_root(inode);
+}
+
+static int btrfs_fill_super(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ void *data, int silent)
+{
+ struct inode *inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_key key;
+ int err;
+
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_magic = BTRFS_SUPER_MAGIC;
+ sb->s_op = &btrfs_super_ops;
+ sb->s_d_op = &btrfs_dentry_operations;
+ sb->s_export_op = &btrfs_export_ops;
+ sb->s_xattr = btrfs_xattr_handlers;
+ sb->s_time_gran = 1;
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ sb->s_flags |= MS_POSIXACL;
+#endif
+ sb->s_flags |= MS_I_VERSION;
+ err = open_ctree(sb, fs_devices, (char *)data);
+ if (err) {
+ printk(KERN_ERR "BTRFS: open_ctree failed\n");
+ return err;
+ }
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto fail_close;
+ }
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
+ err = -ENOMEM;
+ goto fail_close;
+ }
+
+ save_mount_options(sb, data);
+ cleancache_init_fs(sb);
+ sb->s_flags |= MS_ACTIVE;
+ return 0;
+
+fail_close:
+ close_ctree(fs_info->tree_root);
+ return err;
+}
+
+int btrfs_sync_fs(struct super_block *sb, int wait)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
+
+ trace_btrfs_sync_fs(wait);
+
+ if (!wait) {
+ filemap_flush(fs_info->btree_inode->i_mapping);
+ return 0;
+ }
+
+ btrfs_wait_ordered_roots(fs_info, -1);
+
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ /* no transaction, don't bother */
+ if (PTR_ERR(trans) == -ENOENT) {
+ /*
+ * Exit unless we have some pending changes
+ * that need to go through commit
+ */
+ if (fs_info->pending_changes == 0)
+ return 0;
+ /*
+ * A non-blocking test if the fs is frozen. We must not
+ * start a new transaction here otherwise a deadlock
+ * happens. The pending operations are delayed to the
+ * next commit after thawing.
+ */
+ if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
+ __sb_end_write(sb, SB_FREEZE_WRITE);
+ else
+ return 0;
+ trans = btrfs_start_transaction(root, 0);
+ }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ }
+ return btrfs_commit_transaction(trans, root);
+}
+
+static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
+{
+ struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+ struct btrfs_root *root = info->tree_root;
+ char *compress_type;
+
+ if (btrfs_test_opt(root, DEGRADED))
+ seq_puts(seq, ",degraded");
+ if (btrfs_test_opt(root, NODATASUM))
+ seq_puts(seq, ",nodatasum");
+ if (btrfs_test_opt(root, NODATACOW))
+ seq_puts(seq, ",nodatacow");
+ if (btrfs_test_opt(root, NOBARRIER))
+ seq_puts(seq, ",nobarrier");
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
+ seq_printf(seq, ",max_inline=%llu", info->max_inline);
+ if (info->alloc_start != 0)
+ seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
+ if (info->thread_pool_size != min_t(unsigned long,
+ num_online_cpus() + 2, 8))
+ seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
+ if (btrfs_test_opt(root, COMPRESS)) {
+ if (info->compress_type == BTRFS_COMPRESS_ZLIB)
+ compress_type = "zlib";
+ else
+ compress_type = "lzo";
+ if (btrfs_test_opt(root, FORCE_COMPRESS))
+ seq_printf(seq, ",compress-force=%s", compress_type);
+ else
+ seq_printf(seq, ",compress=%s", compress_type);
+ }
+ if (btrfs_test_opt(root, NOSSD))
+ seq_puts(seq, ",nossd");
+ if (btrfs_test_opt(root, SSD_SPREAD))
+ seq_puts(seq, ",ssd_spread");
+ else if (btrfs_test_opt(root, SSD))
+ seq_puts(seq, ",ssd");
+ if (btrfs_test_opt(root, NOTREELOG))
+ seq_puts(seq, ",notreelog");
+ if (btrfs_test_opt(root, FLUSHONCOMMIT))
+ seq_puts(seq, ",flushoncommit");
+ if (btrfs_test_opt(root, DISCARD))
+ seq_puts(seq, ",discard");
+ if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
+ seq_puts(seq, ",noacl");
+ if (btrfs_test_opt(root, SPACE_CACHE))
+ seq_puts(seq, ",space_cache");
+ else
+ seq_puts(seq, ",nospace_cache");
+ if (btrfs_test_opt(root, RESCAN_UUID_TREE))
+ seq_puts(seq, ",rescan_uuid_tree");
+ if (btrfs_test_opt(root, CLEAR_CACHE))
+ seq_puts(seq, ",clear_cache");
+ if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+ seq_puts(seq, ",user_subvol_rm_allowed");
+ if (btrfs_test_opt(root, ENOSPC_DEBUG))
+ seq_puts(seq, ",enospc_debug");
+ if (btrfs_test_opt(root, AUTO_DEFRAG))
+ seq_puts(seq, ",autodefrag");
+ if (btrfs_test_opt(root, INODE_MAP_CACHE))
+ seq_puts(seq, ",inode_cache");
+ if (btrfs_test_opt(root, SKIP_BALANCE))
+ seq_puts(seq, ",skip_balance");
+ if (btrfs_test_opt(root, RECOVERY))
+ seq_puts(seq, ",recovery");
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+ seq_puts(seq, ",check_int_data");
+ else if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ seq_puts(seq, ",check_int");
+ if (info->check_integrity_print_mask)
+ seq_printf(seq, ",check_int_print_mask=%d",
+ info->check_integrity_print_mask);
+#endif
+ if (info->metadata_ratio)
+ seq_printf(seq, ",metadata_ratio=%d",
+ info->metadata_ratio);
+ if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+ seq_puts(seq, ",fatal_errors=panic");
+ if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
+ seq_printf(seq, ",commit=%d", info->commit_interval);
+ return 0;
+}
+
+static int btrfs_test_super(struct super_block *s, void *data)
+{
+ struct btrfs_fs_info *p = data;
+ struct btrfs_fs_info *fs_info = btrfs_sb(s);
+
+ return fs_info->fs_devices == p->fs_devices;
+}
+
+static int btrfs_set_super(struct super_block *s, void *data)
+{
+ int err = set_anon_super(s, data);
+ if (!err)
+ s->s_fs_info = data;
+ return err;
+}
+
+/*
+ * subvolumes are identified by ino 256
+ */
+static inline int is_subvolume_inode(struct inode *inode)
+{
+ if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+/*
+ * This will strip out the subvol=%s argument for an argument string and add
+ * subvolid=0 to make sure we get the actual tree root for path walking to the
+ * subvol we want.
+ */
+static char *setup_root_args(char *args)
+{
+ unsigned len = strlen(args) + 2 + 1;
+ char *src, *dst, *buf;
+
+ /*
+ * We need the same args as before, but with this substitution:
+ * s!subvol=[^,]+!subvolid=0!
+ *
+ * Since the replacement string is up to 2 bytes longer than the
+ * original, allocate strlen(args) + 2 + 1 bytes.
+ */
+
+ src = strstr(args, "subvol=");
+ /* This shouldn't happen, but just in case.. */
+ if (!src)
+ return NULL;
+
+ buf = dst = kmalloc(len, GFP_NOFS);
+ if (!buf)
+ return NULL;
+
+ /*
+ * If the subvol= arg is not at the start of the string,
+ * copy whatever precedes it into buf.
+ */
+ if (src != args) {
+ *src++ = '\0';
+ strcpy(buf, args);
+ dst += strlen(args);
+ }
+
+ strcpy(dst, "subvolid=0");
+ dst += strlen("subvolid=0");
+
+ /*
+ * If there is a "," after the original subvol=... string,
+ * copy that suffix into our buffer. Otherwise, we're done.
+ */
+ src = strchr(src, ',');
+ if (src)
+ strcpy(dst, src);
+
+ return buf;
+}
+
+static struct dentry *mount_subvol(const char *subvol_name, int flags,
+ const char *device_name, char *data)
+{
+ struct dentry *root;
+ struct vfsmount *mnt;
+ char *newargs;
+
+ newargs = setup_root_args(data);
+ if (!newargs)
+ return ERR_PTR(-ENOMEM);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
+ newargs);
+
+ if (PTR_RET(mnt) == -EBUSY) {
+ if (flags & MS_RDONLY) {
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
+ newargs);
+ } else {
+ int r;
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
+ newargs);
+ if (IS_ERR(mnt)) {
+ kfree(newargs);
+ return ERR_CAST(mnt);
+ }
+
+ r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+ if (r < 0) {
+ /* FIXME: release vfsmount mnt ??*/
+ kfree(newargs);
+ return ERR_PTR(r);
+ }
+ }
+ }
+
+ kfree(newargs);
+
+ if (IS_ERR(mnt))
+ return ERR_CAST(mnt);
+
+ root = mount_subtree(mnt, subvol_name);
+
+ if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
+ struct super_block *s = root->d_sb;
+ dput(root);
+ root = ERR_PTR(-EINVAL);
+ deactivate_locked_super(s);
+ printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
+ subvol_name);
+ }
+
+ return root;
+}
+
+static int parse_security_options(char *orig_opts,
+ struct security_mnt_opts *sec_opts)
+{
+ char *secdata = NULL;
+ int ret = 0;
+
+ secdata = alloc_secdata();
+ if (!secdata)
+ return -ENOMEM;
+ ret = security_sb_copy_data(orig_opts, secdata);
+ if (ret) {
+ free_secdata(secdata);
+ return ret;
+ }
+ ret = security_sb_parse_opts_str(secdata, sec_opts);
+ free_secdata(secdata);
+ return ret;
+}
+
+static int setup_security_options(struct btrfs_fs_info *fs_info,
+ struct super_block *sb,
+ struct security_mnt_opts *sec_opts)
+{
+ int ret = 0;
+
+ /*
+ * Call security_sb_set_mnt_opts() to check whether new sec_opts
+ * is valid.
+ */
+ ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_SECURITY
+ if (!fs_info->security_opts.num_mnt_opts) {
+ /* first time security setup, copy sec_opts to fs_info */
+ memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
+ } else {
+ /*
+ * Since SELinux(the only one supports security_mnt_opts) does
+ * NOT support changing context during remount/mount same sb,
+ * This must be the same or part of the same security options,
+ * just free it.
+ */
+ security_free_mnt_opts(sec_opts);
+ }
+#endif
+ return ret;
+}
+
+/*
+ * Find a superblock for the given device / mount point.
+ *
+ * Note: This is based on get_sb_bdev from fs/super.c with a few additions
+ * for multiple device setup. Make sure to keep it in sync.
+ */
+static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+ const char *device_name, void *data)
+{
+ struct block_device *bdev = NULL;
+ struct super_block *s;
+ struct dentry *root;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ struct btrfs_fs_info *fs_info = NULL;
+ struct security_mnt_opts new_sec_opts;
+ fmode_t mode = FMODE_READ;
+ char *subvol_name = NULL;
+ u64 subvol_objectid = 0;
+ int error = 0;
+
+ if (!(flags & MS_RDONLY))
+ mode |= FMODE_WRITE;
+
+ error = btrfs_parse_early_options(data, mode, fs_type,
+ &subvol_name, &subvol_objectid,
+ &fs_devices);
+ if (error) {
+ kfree(subvol_name);
+ return ERR_PTR(error);
+ }
+
+ if (subvol_name) {
+ root = mount_subvol(subvol_name, flags, device_name, data);
+ kfree(subvol_name);
+ return root;
+ }
+
+ security_init_mnt_opts(&new_sec_opts);
+ if (data) {
+ error = parse_security_options(data, &new_sec_opts);
+ if (error)
+ return ERR_PTR(error);
+ }
+
+ error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
+ if (error)
+ goto error_sec_opts;
+
+ /*
+ * Setup a dummy root and fs_info for test/set super. This is because
+ * we don't actually fill this stuff out until open_ctree, but we need
+ * it for searching for existing supers, so this lets us do that and
+ * then open_ctree will properly initialize everything later.
+ */
+ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
+ if (!fs_info) {
+ error = -ENOMEM;
+ goto error_sec_opts;
+ }
+
+ fs_info->fs_devices = fs_devices;
+
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+ security_init_mnt_opts(&fs_info->security_opts);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
+ error = -ENOMEM;
+ goto error_fs_info;
+ }
+
+ error = btrfs_open_devices(fs_devices, mode, fs_type);
+ if (error)
+ goto error_fs_info;
+
+ if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+ error = -EACCES;
+ goto error_close_devices;
+ }
+
+ bdev = fs_devices->latest_bdev;
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC,
+ fs_info);
+ if (IS_ERR(s)) {
+ error = PTR_ERR(s);
+ goto error_close_devices;
+ }
+
+ if (s->s_root) {
+ btrfs_close_devices(fs_devices);
+ free_fs_info(fs_info);
+ if ((flags ^ s->s_flags) & MS_RDONLY)
+ error = -EBUSY;
+ } else {
+ char b[BDEVNAME_SIZE];
+
+ strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ btrfs_sb(s)->bdev_holder = fs_type;
+ error = btrfs_fill_super(s, fs_devices, data,
+ flags & MS_SILENT ? 1 : 0);
+ }
+
+ root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+ if (IS_ERR(root)) {
+ deactivate_locked_super(s);
+ error = PTR_ERR(root);
+ goto error_sec_opts;
+ }
+
+ fs_info = btrfs_sb(s);
+ error = setup_security_options(fs_info, s, &new_sec_opts);
+ if (error) {
+ dput(root);
+ deactivate_locked_super(s);
+ goto error_sec_opts;
+ }
+
+ return root;
+
+error_close_devices:
+ btrfs_close_devices(fs_devices);
+error_fs_info:
+ free_fs_info(fs_info);
+error_sec_opts:
+ security_free_mnt_opts(&new_sec_opts);
+ return ERR_PTR(error);
+}
+
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+ int new_pool_size, int old_pool_size)
+{
+ if (new_pool_size == old_pool_size)
+ return;
+
+ fs_info->thread_pool_size = new_pool_size;
+
+ btrfs_info(fs_info, "resize thread pool %d -> %d",
+ old_pool_size, new_pool_size);
+
+ btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
+ new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
+ new_pool_size);
+}
+
+static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
+{
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
+static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts, int flags)
+{
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+ (flags & MS_RDONLY))) {
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+ if (flags & MS_RDONLY)
+ sync_filesystem(fs_info->sb);
+ }
+}
+
+static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
+ unsigned long old_opts)
+{
+ /*
+ * We need cleanup all defragable inodes if the autodefragment is
+ * close or the fs is R/O.
+ */
+ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+ (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+ (fs_info->sb->s_flags & MS_RDONLY))) {
+ btrfs_cleanup_defrag_inodes(fs_info);
+ }
+
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
+static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
+ unsigned old_flags = sb->s_flags;
+ unsigned long old_opts = fs_info->mount_opt;
+ unsigned long old_compress_type = fs_info->compress_type;
+ u64 old_max_inline = fs_info->max_inline;
+ u64 old_alloc_start = fs_info->alloc_start;
+ int old_thread_pool_size = fs_info->thread_pool_size;
+ unsigned int old_metadata_ratio = fs_info->metadata_ratio;
+ int ret;
+
+ sync_filesystem(sb);
+ btrfs_remount_prepare(fs_info);
+
+ if (data) {
+ struct security_mnt_opts new_sec_opts;
+
+ security_init_mnt_opts(&new_sec_opts);
+ ret = parse_security_options(data, &new_sec_opts);
+ if (ret)
+ goto restore;
+ ret = setup_security_options(fs_info, sb,
+ &new_sec_opts);
+ if (ret) {
+ security_free_mnt_opts(&new_sec_opts);
+ goto restore;
+ }
+ }
+
+ ret = btrfs_parse_options(root, data);
+ if (ret) {
+ ret = -EINVAL;
+ goto restore;
+ }
+
+ btrfs_remount_begin(fs_info, old_opts, *flags);
+ btrfs_resize_thread_pool(fs_info,
+ fs_info->thread_pool_size, old_thread_pool_size);
+
+ if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+ goto out;
+
+ if (*flags & MS_RDONLY) {
+ /*
+ * this also happens on 'umount -rf' or on shutdown, when
+ * the filesystem is busy.
+ */
+ cancel_work_sync(&fs_info->async_reclaim_work);
+
+ /* wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* avoid complains from lockdep et al. */
+ up(&fs_info->uuid_tree_rescan_sem);
+
+ sb->s_flags |= MS_RDONLY;
+
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+ btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
+
+ ret = btrfs_commit_super(root);
+ if (ret)
+ goto restore;
+ } else {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ btrfs_err(fs_info,
+ "Remounting read-write after error is not allowed");
+ ret = -EINVAL;
+ goto restore;
+ }
+ if (fs_info->fs_devices->rw_devices == 0) {
+ ret = -EACCES;
+ goto restore;
+ }
+
+ if (fs_info->fs_devices->missing_devices >
+ fs_info->num_tolerated_disk_barrier_failures &&
+ !(*flags & MS_RDONLY)) {
+ btrfs_warn(fs_info,
+ "too many missing devices, writeable remount is not allowed");
+ ret = -EACCES;
+ goto restore;
+ }
+
+ if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ ret = -EINVAL;
+ goto restore;
+ }
+
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto restore;
+
+ /* recover relocation */
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = btrfs_recover_relocation(root);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret)
+ goto restore;
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret)
+ goto restore;
+
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info, "failed to resume dev_replace");
+ goto restore;
+ }
+
+ if (!fs_info->uuid_root) {
+ btrfs_info(fs_info, "creating UUID tree");
+ ret = btrfs_create_uuid_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info, "failed to create the UUID tree %d", ret);
+ goto restore;
+ }
+ }
+ sb->s_flags &= ~MS_RDONLY;
+ }
+out:
+ wake_up_process(fs_info->transaction_kthread);
+ btrfs_remount_cleanup(fs_info, old_opts);
+ return 0;
+
+restore:
+ /* We've hit an error - don't reset MS_RDONLY */
+ if (sb->s_flags & MS_RDONLY)
+ old_flags |= MS_RDONLY;
+ sb->s_flags = old_flags;
+ fs_info->mount_opt = old_opts;
+ fs_info->compress_type = old_compress_type;
+ fs_info->max_inline = old_max_inline;
+ mutex_lock(&fs_info->chunk_mutex);
+ fs_info->alloc_start = old_alloc_start;
+ mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_resize_thread_pool(fs_info,
+ old_thread_pool_size, fs_info->thread_pool_size);
+ fs_info->metadata_ratio = old_metadata_ratio;
+ btrfs_remount_cleanup(fs_info, old_opts);
+ return ret;
+}
+
+/* Used to sort the devices by max_avail(descending sort) */
+static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+ const void *dev_info2)
+{
+ if (((struct btrfs_device_info *)dev_info1)->max_avail >
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return -1;
+ else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return 1;
+ else
+ return 0;
+}
+
+/*
+ * sort the devices by max_avail, in which max free extent size of each device
+ * is stored.(Descending Sort)
+ */
+static inline void btrfs_descending_sort_devices(
+ struct btrfs_device_info *devices,
+ size_t nr_devices)
+{
+ sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_free_bytes, NULL);
+}
+
+/*
+ * The helper to calc the free space on the devices that can be used to store
+ * file data.
+ */
+static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_device_info *devices_info;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 skip_space;
+ u64 type;
+ u64 avail_space;
+ u64 used_space;
+ u64 min_stripe_size;
+ int min_stripes = 1, num_stripes = 1;
+ int i = 0, nr_devices;
+ int ret;
+
+ /*
+ * We aren't under the device list lock, so this is racey-ish, but good
+ * enough for our purposes.
+ */
+ nr_devices = fs_info->fs_devices->open_devices;
+ if (!nr_devices) {
+ smp_mb();
+ nr_devices = fs_info->fs_devices->open_devices;
+ ASSERT(nr_devices);
+ if (!nr_devices) {
+ *free_bytes = 0;
+ return 0;
+ }
+ }
+
+ devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+ /* calc min stripe number for data space alloction */
+ type = btrfs_get_alloc_profile(root, 1);
+ if (type & BTRFS_BLOCK_GROUP_RAID0) {
+ min_stripes = 2;
+ num_stripes = nr_devices;
+ } else if (type & BTRFS_BLOCK_GROUP_RAID1) {
+ min_stripes = 2;
+ num_stripes = 2;
+ } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
+ min_stripes = 4;
+ num_stripes = 4;
+ }
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_stripe_size = 2 * BTRFS_STRIPE_LEN;
+ else
+ min_stripe_size = BTRFS_STRIPE_LEN;
+
+ if (fs_info->alloc_start)
+ mutex_lock(&fs_devices->device_list_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
+ if (!device->in_fs_metadata || !device->bdev ||
+ device->is_tgtdev_for_dev_replace)
+ continue;
+
+ if (i >= nr_devices)
+ break;
+
+ avail_space = device->total_bytes - device->bytes_used;
+
+ /* align with stripe_len */
+ avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
+ avail_space *= BTRFS_STRIPE_LEN;
+
+ /*
+ * In order to avoid overwritting the superblock on the drive,
+ * btrfs starts at an offset of at least 1MB when doing chunk
+ * allocation.
+ */
+ skip_space = 1024 * 1024;
+
+ /* user can set the offset in fs_info->alloc_start. */
+ if (fs_info->alloc_start &&
+ fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+ device->total_bytes) {
+ rcu_read_unlock();
+ skip_space = max(fs_info->alloc_start, skip_space);
+
+ /*
+ * btrfs can not use the free space in
+ * [0, skip_space - 1], we must subtract it from the
+ * total. In order to implement it, we account the used
+ * space in this range first.
+ */
+ ret = btrfs_account_dev_extents_size(device, 0,
+ skip_space - 1,
+ &used_space);
+ if (ret) {
+ kfree(devices_info);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ret;
+ }
+
+ rcu_read_lock();
+
+ /* calc the free space in [0, skip_space - 1] */
+ skip_space -= used_space;
+ }
+
+ /*
+ * we can use the free space in [0, skip_space - 1], subtract
+ * it from the total.
+ */
+ if (avail_space && avail_space >= skip_space)
+ avail_space -= skip_space;
+ else
+ avail_space = 0;
+
+ if (avail_space < min_stripe_size)
+ continue;
+
+ devices_info[i].dev = device;
+ devices_info[i].max_avail = avail_space;
+
+ i++;
+ }
+ rcu_read_unlock();
+ if (fs_info->alloc_start)
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ nr_devices = i;
+
+ btrfs_descending_sort_devices(devices_info, nr_devices);
+
+ i = nr_devices - 1;
+ avail_space = 0;
+ while (nr_devices >= min_stripes) {
+ if (num_stripes > nr_devices)
+ num_stripes = nr_devices;
+
+ if (devices_info[i].max_avail >= min_stripe_size) {
+ int j;
+ u64 alloc_size;
+
+ avail_space += devices_info[i].max_avail * num_stripes;
+ alloc_size = devices_info[i].max_avail;
+ for (j = i + 1 - num_stripes; j <= i; j++)
+ devices_info[j].max_avail -= alloc_size;
+ }
+ i--;
+ nr_devices--;
+ }
+
+ kfree(devices_info);
+ *free_bytes = avail_space;
+ return 0;
+}
+
+/*
+ * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
+ *
+ * If there's a redundant raid level at DATA block groups, use the respective
+ * multiplier to scale the sizes.
+ *
+ * Unused device space usage is based on simulating the chunk allocator
+ * algorithm that respects the device sizes, order of allocations and the
+ * 'alloc_start' value, this is a close approximation of the actual use but
+ * there are other factors that may change the result (like a new metadata
+ * chunk).
+ *
+ * FIXME: not accurate for mixed block groups, total and free/used are ok,
+ * available appears slightly larger.
+ */
+static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct list_head *head = &fs_info->space_info;
+ struct btrfs_space_info *found;
+ u64 total_used = 0;
+ u64 total_free_data = 0;
+ int bits = dentry->d_sb->s_blocksize_bits;
+ __be32 *fsid = (__be32 *)fs_info->fsid;
+ unsigned factor = 1;
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ int ret;
+
+ /*
+ * holding chunk_muext to avoid allocating new chunks, holding
+ * device_list_mutex to avoid the device being removed
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int i;
+
+ total_free_data += found->disk_total - found->disk_used;
+ total_free_data -=
+ btrfs_account_ro_block_groups_free_space(found);
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (!list_empty(&found->block_groups[i])) {
+ switch (i) {
+ case BTRFS_RAID_DUP:
+ case BTRFS_RAID_RAID1:
+ case BTRFS_RAID_RAID10:
+ factor = 2;
+ }
+ }
+ }
+ }
+
+ total_used += found->disk_used;
+ }
+
+ rcu_read_unlock();
+
+ buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
+ buf->f_blocks >>= bits;
+ buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
+
+ /* Account global block reserve as used, it's in logical size already */
+ spin_lock(&block_rsv->lock);
+ buf->f_bfree -= block_rsv->size >> bits;
+ spin_unlock(&block_rsv->lock);
+
+ buf->f_bavail = div_u64(total_free_data, factor);
+ ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
+ if (ret)
+ return ret;
+ buf->f_bavail += div_u64(total_free_data, factor);
+ buf->f_bavail = buf->f_bavail >> bits;
+
+ buf->f_type = BTRFS_SUPER_MAGIC;
+ buf->f_bsize = dentry->d_sb->s_blocksize;
+ buf->f_namelen = BTRFS_NAME_LEN;
+
+ /* We treat it as constant endianness (it doesn't matter _which_)
+ because we want the fsid to come out the same whether mounted
+ on a big-endian or little-endian host */
+ buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
+ buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+ /* Mask in the root object ID too, to disambiguate subvols */
+ buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
+ buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
+
+ return 0;
+}
+
+static void btrfs_kill_super(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ kill_anon_super(sb);
+ free_fs_info(fs_info);
+}
+
+static struct file_system_type btrfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .mount = btrfs_mount,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("btrfs");
+
+static int btrfs_control_open(struct inode *inode, struct file *file)
+{
+ /*
+ * The control file's private_data is used to hold the
+ * transaction when it is started and is used to keep
+ * track of whether a transaction is already in progress.
+ */
+ file->private_data = NULL;
+ return 0;
+}
+
+/*
+ * used by btrfsctl to scan devices when no FS is mounted
+ */
+static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct btrfs_ioctl_vol_args *vol;
+ struct btrfs_fs_devices *fs_devices;
+ int ret = -ENOTTY;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol = memdup_user((void __user *)arg, sizeof(*vol));
+ if (IS_ERR(vol))
+ return PTR_ERR(vol);
+
+ switch (cmd) {
+ case BTRFS_IOC_SCAN_DEV:
+ ret = btrfs_scan_one_device(vol->name, FMODE_READ,
+ &btrfs_fs_type, &fs_devices);
+ break;
+ case BTRFS_IOC_DEVICES_READY:
+ ret = btrfs_scan_one_device(vol->name, FMODE_READ,
+ &btrfs_fs_type, &fs_devices);
+ if (ret)
+ break;
+ ret = !(fs_devices->num_devices == fs_devices->total_devices);
+ break;
+ }
+
+ kfree(vol);
+ return ret;
+}
+
+static int btrfs_freeze(struct super_block *sb)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ /* no transaction, don't bother */
+ if (PTR_ERR(trans) == -ENOENT)
+ return 0;
+ return PTR_ERR(trans);
+ }
+ return btrfs_commit_transaction(trans, root);
+}
+
+static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
+ struct btrfs_fs_devices *cur_devices;
+ struct btrfs_device *dev, *first_dev = NULL;
+ struct list_head *head;
+ struct rcu_string *name;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ cur_devices = fs_info->fs_devices;
+ while (cur_devices) {
+ head = &cur_devices->devices;
+ list_for_each_entry(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
+ if (!dev->name)
+ continue;
+ if (!first_dev || dev->devid < first_dev->devid)
+ first_dev = dev;
+ }
+ cur_devices = cur_devices->seed;
+ }
+
+ if (first_dev) {
+ rcu_read_lock();
+ name = rcu_dereference(first_dev->name);
+ seq_escape(m, name->str, " \t\n\\");
+ rcu_read_unlock();
+ } else {
+ WARN_ON(1);
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return 0;
+}
+
+static const struct super_operations btrfs_super_ops = {
+ .drop_inode = btrfs_drop_inode,
+ .evict_inode = btrfs_evict_inode,
+ .put_super = btrfs_put_super,
+ .sync_fs = btrfs_sync_fs,
+ .show_options = btrfs_show_options,
+ .show_devname = btrfs_show_devname,
+ .write_inode = btrfs_write_inode,
+ .alloc_inode = btrfs_alloc_inode,
+ .destroy_inode = btrfs_destroy_inode,
+ .statfs = btrfs_statfs,
+ .remount_fs = btrfs_remount,
+ .freeze_fs = btrfs_freeze,
+};
+
+static const struct file_operations btrfs_ctl_fops = {
+ .open = btrfs_control_open,
+ .unlocked_ioctl = btrfs_control_ioctl,
+ .compat_ioctl = btrfs_control_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice btrfs_misc = {
+ .minor = BTRFS_MINOR,
+ .name = "btrfs-control",
+ .fops = &btrfs_ctl_fops
+};
+
+MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
+MODULE_ALIAS("devname:btrfs-control");
+
+static int btrfs_interface_init(void)
+{
+ return misc_register(&btrfs_misc);
+}
+
+static void btrfs_interface_exit(void)
+{
+ if (misc_deregister(&btrfs_misc) < 0)
+ printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
+}
+
+static void btrfs_print_info(void)
+{
+ printk(KERN_INFO "Btrfs loaded"
+#ifdef CONFIG_BTRFS_DEBUG
+ ", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_ASSERT
+ ", assert=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ ", integrity-checker=on"
+#endif
+ "\n");
+}
+
+static int btrfs_run_sanity_tests(void)
+{
+ int ret;
+
+ ret = btrfs_init_test_fs();
+ if (ret)
+ return ret;
+
+ ret = btrfs_test_free_space_cache();
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_buffer_operations();
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_io();
+ if (ret)
+ goto out;
+ ret = btrfs_test_inodes();
+ if (ret)
+ goto out;
+ ret = btrfs_test_qgroups();
+out:
+ btrfs_destroy_test_fs();
+ return ret;
+}
+
+static int __init init_btrfs_fs(void)
+{
+ int err;
+
+ err = btrfs_hash_init();
+ if (err)
+ return err;
+
+ btrfs_props_init();
+
+ err = btrfs_init_sysfs();
+ if (err)
+ goto free_hash;
+
+ btrfs_init_compress();
+
+ err = btrfs_init_cachep();
+ if (err)
+ goto free_compress;
+
+ err = extent_io_init();
+ if (err)
+ goto free_cachep;
+
+ err = extent_map_init();
+ if (err)
+ goto free_extent_io;
+
+ err = ordered_data_init();
+ if (err)
+ goto free_extent_map;
+
+ err = btrfs_delayed_inode_init();
+ if (err)
+ goto free_ordered_data;
+
+ err = btrfs_auto_defrag_init();
+ if (err)
+ goto free_delayed_inode;
+
+ err = btrfs_delayed_ref_init();
+ if (err)
+ goto free_auto_defrag;
+
+ err = btrfs_prelim_ref_init();
+ if (err)
+ goto free_delayed_ref;
+
+ err = btrfs_end_io_wq_init();
+ if (err)
+ goto free_prelim_ref;
+
+ err = btrfs_interface_init();
+ if (err)
+ goto free_end_io_wq;
+
+ btrfs_init_lockdep();
+
+ btrfs_print_info();
+
+ err = btrfs_run_sanity_tests();
+ if (err)
+ goto unregister_ioctl;
+
+ err = register_filesystem(&btrfs_fs_type);
+ if (err)
+ goto unregister_ioctl;
+
+ return 0;
+
+unregister_ioctl:
+ btrfs_interface_exit();
+free_end_io_wq:
+ btrfs_end_io_wq_exit();
+free_prelim_ref:
+ btrfs_prelim_ref_exit();
+free_delayed_ref:
+ btrfs_delayed_ref_exit();
+free_auto_defrag:
+ btrfs_auto_defrag_exit();
+free_delayed_inode:
+ btrfs_delayed_inode_exit();
+free_ordered_data:
+ ordered_data_exit();
+free_extent_map:
+ extent_map_exit();
+free_extent_io:
+ extent_io_exit();
+free_cachep:
+ btrfs_destroy_cachep();
+free_compress:
+ btrfs_exit_compress();
+ btrfs_exit_sysfs();
+free_hash:
+ btrfs_hash_exit();
+ return err;
+}
+
+static void __exit exit_btrfs_fs(void)
+{
+ btrfs_destroy_cachep();
+ btrfs_delayed_ref_exit();
+ btrfs_auto_defrag_exit();
+ btrfs_delayed_inode_exit();
+ btrfs_prelim_ref_exit();
+ ordered_data_exit();
+ extent_map_exit();
+ extent_io_exit();
+ btrfs_interface_exit();
+ btrfs_end_io_wq_exit();
+ unregister_filesystem(&btrfs_fs_type);
+ btrfs_exit_sysfs();
+ btrfs_cleanup_fs_uuids();
+ btrfs_exit_compress();
+ btrfs_hash_exit();
+}
+
+late_initcall(init_btrfs_fs);
+module_exit(exit_btrfs_fs)
+
+MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000..e8a4c86d2
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,758 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/kobject.h>
+#include <linux/bug.h>
+#include <linux/genhd.h>
+#include <linux/debugfs.h>
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "sysfs.h"
+#include "volumes.h"
+
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+
+static u64 get_features(struct btrfs_fs_info *fs_info,
+ enum btrfs_feature_set set)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ if (set == FEAT_COMPAT)
+ return btrfs_super_compat_flags(disk_super);
+ else if (set == FEAT_COMPAT_RO)
+ return btrfs_super_compat_ro_flags(disk_super);
+ else
+ return btrfs_super_incompat_flags(disk_super);
+}
+
+static void set_features(struct btrfs_fs_info *fs_info,
+ enum btrfs_feature_set set, u64 features)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ if (set == FEAT_COMPAT)
+ btrfs_set_super_compat_flags(disk_super, features);
+ else if (set == FEAT_COMPAT_RO)
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ else
+ btrfs_set_super_incompat_flags(disk_super, features);
+}
+
+static int can_modify_feature(struct btrfs_feature_attr *fa)
+{
+ int val = 0;
+ u64 set, clear;
+ switch (fa->feature_set) {
+ case FEAT_COMPAT:
+ set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+ break;
+ case FEAT_COMPAT_RO:
+ set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+ break;
+ case FEAT_INCOMPAT:
+ set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+ break;
+ default:
+ printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n",
+ fa->feature_set);
+ return 0;
+ }
+
+ if (set & fa->feature_bit)
+ val |= 1;
+ if (clear & fa->feature_bit)
+ val |= 2;
+
+ return val;
+}
+
+static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ int val = 0;
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+ if (fs_info) {
+ u64 features = get_features(fs_info, fa->feature_set);
+ if (features & fa->feature_bit)
+ val = 1;
+ } else
+ val = can_modify_feature(fa);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t count)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+ u64 features, set, clear;
+ unsigned long val;
+ int ret;
+
+ fs_info = to_fs_info(kobj);
+ if (!fs_info)
+ return -EPERM;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &val);
+ if (ret)
+ return ret;
+
+ if (fa->feature_set == FEAT_COMPAT) {
+ set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+ } else if (fa->feature_set == FEAT_COMPAT_RO) {
+ set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+ } else {
+ set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+ }
+
+ features = get_features(fs_info, fa->feature_set);
+
+ /* Nothing to do */
+ if ((val && (features & fa->feature_bit)) ||
+ (!val && !(features & fa->feature_bit)))
+ return count;
+
+ if ((val && !(set & fa->feature_bit)) ||
+ (!val && !(clear & fa->feature_bit))) {
+ btrfs_info(fs_info,
+ "%sabling feature %s on mounted fs is not supported.",
+ val ? "En" : "Dis", fa->kobj_attr.attr.name);
+ return -EPERM;
+ }
+
+ btrfs_info(fs_info, "%s %s feature flag",
+ val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
+
+ spin_lock(&fs_info->super_lock);
+ features = get_features(fs_info, fa->feature_set);
+ if (val)
+ features |= fa->feature_bit;
+ else
+ features &= ~fa->feature_bit;
+ set_features(fs_info, fa->feature_set, features);
+ spin_unlock(&fs_info->super_lock);
+
+ /*
+ * We don't want to do full transaction commit from inside sysfs
+ */
+ btrfs_set_pending(fs_info, COMMIT);
+ wake_up_process(fs_info->transaction_kthread);
+
+ return count;
+}
+
+static umode_t btrfs_feature_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ umode_t mode = attr->mode;
+
+ if (fs_info) {
+ struct btrfs_feature_attr *fa;
+ u64 features;
+
+ fa = attr_to_btrfs_feature_attr(attr);
+ features = get_features(fs_info, fa->feature_set);
+
+ if (can_modify_feature(fa))
+ mode |= S_IWUSR;
+ else if (!(features & fa->feature_bit))
+ mode = 0;
+ }
+
+ return mode;
+}
+
+BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
+BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
+BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
+BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
+BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
+BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
+BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
+BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
+BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+
+static struct attribute *btrfs_supported_feature_attrs[] = {
+ BTRFS_FEAT_ATTR_PTR(mixed_backref),
+ BTRFS_FEAT_ATTR_PTR(default_subvol),
+ BTRFS_FEAT_ATTR_PTR(mixed_groups),
+ BTRFS_FEAT_ATTR_PTR(compress_lzo),
+ BTRFS_FEAT_ATTR_PTR(big_metadata),
+ BTRFS_FEAT_ATTR_PTR(extended_iref),
+ BTRFS_FEAT_ATTR_PTR(raid56),
+ BTRFS_FEAT_ATTR_PTR(skinny_metadata),
+ BTRFS_FEAT_ATTR_PTR(no_holes),
+ NULL
+};
+
+static const struct attribute_group btrfs_feature_attr_group = {
+ .name = "features",
+ .is_visible = btrfs_feature_visible,
+ .attrs = btrfs_supported_feature_attrs,
+};
+
+static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
+{
+ u64 val;
+ if (lock)
+ spin_lock(lock);
+ val = *value_ptr;
+ if (lock)
+ spin_unlock(lock);
+ return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
+
+static ssize_t global_rsv_size_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
+
+static ssize_t global_rsv_reserved_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
+
+#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
+#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf);
+BTRFS_RAID_ATTR(total_bytes, raid_bytes_show);
+BTRFS_RAID_ATTR(used_bytes, raid_bytes_show);
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
+ struct btrfs_block_group_cache *block_group;
+ int index = to_raid_kobj(kobj)->raid_type;
+ u64 val = 0;
+
+ down_read(&sinfo->groups_sem);
+ list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
+ if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes))
+ val += block_group->key.offset;
+ else
+ val += btrfs_block_group_used(&block_group->item);
+ }
+ up_read(&sinfo->groups_sem);
+ return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
+
+static struct attribute *raid_attributes[] = {
+ BTRFS_RAID_ATTR_PTR(total_bytes),
+ BTRFS_RAID_ATTR_PTR(used_bytes),
+ NULL
+};
+
+static void release_raid_kobj(struct kobject *kobj)
+{
+ kfree(to_raid_kobj(kobj));
+}
+
+struct kobj_type btrfs_raid_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = release_raid_kobj,
+ .default_attrs = raid_attributes,
+};
+
+#define SPACE_INFO_ATTR(field) \
+static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_space_info *sinfo = to_space_info(kobj); \
+ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
+} \
+BTRFS_ATTR(field, btrfs_space_info_show_##field)
+
+static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+ s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
+ return snprintf(buf, PAGE_SIZE, "%lld\n", val);
+}
+
+SPACE_INFO_ATTR(flags);
+SPACE_INFO_ATTR(total_bytes);
+SPACE_INFO_ATTR(bytes_used);
+SPACE_INFO_ATTR(bytes_pinned);
+SPACE_INFO_ATTR(bytes_reserved);
+SPACE_INFO_ATTR(bytes_may_use);
+SPACE_INFO_ATTR(disk_used);
+SPACE_INFO_ATTR(disk_total);
+BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
+
+static struct attribute *space_info_attrs[] = {
+ BTRFS_ATTR_PTR(flags),
+ BTRFS_ATTR_PTR(total_bytes),
+ BTRFS_ATTR_PTR(bytes_used),
+ BTRFS_ATTR_PTR(bytes_pinned),
+ BTRFS_ATTR_PTR(bytes_reserved),
+ BTRFS_ATTR_PTR(bytes_may_use),
+ BTRFS_ATTR_PTR(disk_used),
+ BTRFS_ATTR_PTR(disk_total),
+ BTRFS_ATTR_PTR(total_bytes_pinned),
+ NULL,
+};
+
+static void space_info_release(struct kobject *kobj)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+ percpu_counter_destroy(&sinfo->total_bytes_pinned);
+ kfree(sinfo);
+}
+
+struct kobj_type space_info_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = space_info_release,
+ .default_attrs = space_info_attrs,
+};
+
+static const struct attribute *allocation_attrs[] = {
+ BTRFS_ATTR_PTR(global_rsv_reserved),
+ BTRFS_ATTR_PTR(global_rsv_size),
+ NULL,
+};
+
+static ssize_t btrfs_label_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ char *label = fs_info->super_copy->label;
+ return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+}
+
+static ssize_t btrfs_label_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ size_t p_len;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ /*
+ * p_len is the len until the first occurrence of either
+ * '\n' or '\0'
+ */
+ p_len = strcspn(buf, "\n");
+
+ if (p_len >= BTRFS_LABEL_SIZE)
+ return -EINVAL;
+
+ spin_lock(&fs_info->super_lock);
+ memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
+ memcpy(fs_info->super_copy->label, buf, p_len);
+ spin_unlock(&fs_info->super_lock);
+
+ /*
+ * We don't want to do full transaction commit from inside sysfs
+ */
+ btrfs_set_pending(fs_info, COMMIT);
+ wake_up_process(fs_info->transaction_kthread);
+
+ return len;
+}
+BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
+
+static ssize_t btrfs_nodesize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+}
+
+BTRFS_ATTR(nodesize, btrfs_nodesize_show);
+
+static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
+
+static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
+
+static struct attribute *btrfs_attrs[] = {
+ BTRFS_ATTR_PTR(label),
+ BTRFS_ATTR_PTR(nodesize),
+ BTRFS_ATTR_PTR(sectorsize),
+ BTRFS_ATTR_PTR(clone_alignment),
+ NULL,
+};
+
+static void btrfs_release_super_kobj(struct kobject *kobj)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ complete(&fs_info->kobj_unregister);
+}
+
+static struct kobj_type btrfs_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = btrfs_release_super_kobj,
+ .default_attrs = btrfs_attrs,
+};
+
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
+{
+ if (kobj->ktype != &btrfs_ktype)
+ return NULL;
+ return container_of(kobj, struct btrfs_fs_info, super_kobj);
+}
+
+#define NUM_FEATURE_BITS 64
+static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
+static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
+
+static const u64 supported_feature_masks[3] = {
+ [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
+ [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
+ [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
+};
+
+static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
+{
+ int set;
+
+ for (set = 0; set < FEAT_MAX; set++) {
+ int i;
+ struct attribute *attrs[2];
+ struct attribute_group agroup = {
+ .name = "features",
+ .attrs = attrs,
+ };
+ u64 features = get_features(fs_info, set);
+ features &= ~supported_feature_masks[set];
+
+ if (!features)
+ continue;
+
+ attrs[1] = NULL;
+ for (i = 0; i < NUM_FEATURE_BITS; i++) {
+ struct btrfs_feature_attr *fa;
+
+ if (!(features & (1ULL << i)))
+ continue;
+
+ fa = &btrfs_feature_attrs[set][i];
+ attrs[0] = &fa->kobj_attr.attr;
+ if (add) {
+ int ret;
+ ret = sysfs_merge_group(&fs_info->super_kobj,
+ &agroup);
+ if (ret)
+ return ret;
+ } else
+ sysfs_unmerge_group(&fs_info->super_kobj,
+ &agroup);
+ }
+
+ }
+ return 0;
+}
+
+static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+{
+ kobject_del(&fs_info->super_kobj);
+ kobject_put(&fs_info->super_kobj);
+ wait_for_completion(&fs_info->kobj_unregister);
+}
+
+void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->space_info_kobj) {
+ sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
+ kobject_del(fs_info->space_info_kobj);
+ kobject_put(fs_info->space_info_kobj);
+ }
+ kobject_del(fs_info->device_dir_kobj);
+ kobject_put(fs_info->device_dir_kobj);
+ addrm_unknown_feature_attrs(fs_info, false);
+ sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
+ __btrfs_sysfs_remove_one(fs_info);
+}
+
+const char * const btrfs_feature_set_names[3] = {
+ [FEAT_COMPAT] = "compat",
+ [FEAT_COMPAT_RO] = "compat_ro",
+ [FEAT_INCOMPAT] = "incompat",
+};
+
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
+{
+ size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */
+ int len = 0;
+ int i;
+ char *str;
+
+ str = kmalloc(bufsize, GFP_KERNEL);
+ if (!str)
+ return str;
+
+ for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+ const char *name;
+
+ if (!(flags & (1ULL << i)))
+ continue;
+
+ name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
+ len += snprintf(str + len, bufsize - len, "%s%s",
+ len ? "," : "", name);
+ }
+
+ return str;
+}
+
+static void init_feature_attrs(void)
+{
+ struct btrfs_feature_attr *fa;
+ int set, i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
+ ARRAY_SIZE(btrfs_feature_attrs));
+ BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
+ ARRAY_SIZE(btrfs_feature_attrs[0]));
+
+ memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
+ memset(btrfs_unknown_feature_names, 0,
+ sizeof(btrfs_unknown_feature_names));
+
+ for (i = 0; btrfs_supported_feature_attrs[i]; i++) {
+ struct btrfs_feature_attr *sfa;
+ struct attribute *a = btrfs_supported_feature_attrs[i];
+ int bit;
+ sfa = attr_to_btrfs_feature_attr(a);
+ bit = ilog2(sfa->feature_bit);
+ fa = &btrfs_feature_attrs[sfa->feature_set][bit];
+
+ fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name;
+ }
+
+ for (set = 0; set < FEAT_MAX; set++) {
+ for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+ char *name = btrfs_unknown_feature_names[set][i];
+ fa = &btrfs_feature_attrs[set][i];
+
+ if (fa->kobj_attr.attr.name)
+ continue;
+
+ snprintf(name, 13, "%s:%u",
+ btrfs_feature_set_names[set], i);
+
+ fa->kobj_attr.attr.name = name;
+ fa->kobj_attr.attr.mode = S_IRUGO;
+ fa->feature_set = set;
+ fa->feature_bit = 1ULL << i;
+ }
+ }
+}
+
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
+{
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!fs_info->device_dir_kobj)
+ return -EINVAL;
+
+ if (one_device && one_device->bdev) {
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_info->device_dir_kobj,
+ disk_kobj->name);
+ }
+
+ return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
+{
+ int error = 0;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *dev;
+
+ if (!fs_info->device_dir_kobj)
+ fs_info->device_dir_kobj = kobject_create_and_add("devices",
+ &fs_info->super_kobj);
+
+ if (!fs_info->device_dir_kobj)
+ return -ENOMEM;
+
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!dev->bdev)
+ continue;
+
+ if (one_device && one_device != dev)
+ continue;
+
+ disk = dev->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ error = sysfs_create_link(fs_info->device_dir_kobj,
+ disk_kobj, disk_kobj->name);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/* /sys/fs/btrfs/ entry */
+static struct kset *btrfs_kset;
+
+/* /sys/kernel/debug/btrfs */
+static struct dentry *btrfs_debugfs_root_dentry;
+
+/* Debugging tunables and exported data */
+u64 btrfs_debugfs_test;
+
+int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+{
+ int error;
+
+ init_completion(&fs_info->kobj_unregister);
+ fs_info->super_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_info->fsid);
+ if (error)
+ return error;
+
+ error = sysfs_create_group(&fs_info->super_kobj,
+ &btrfs_feature_attr_group);
+ if (error) {
+ __btrfs_sysfs_remove_one(fs_info);
+ return error;
+ }
+
+ error = addrm_unknown_feature_attrs(fs_info, true);
+ if (error)
+ goto failure;
+
+ error = btrfs_kobj_add_device(fs_info, NULL);
+ if (error)
+ goto failure;
+
+ fs_info->space_info_kobj = kobject_create_and_add("allocation",
+ &fs_info->super_kobj);
+ if (!fs_info->space_info_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
+ if (error)
+ goto failure;
+
+ return 0;
+failure:
+ btrfs_sysfs_remove_one(fs_info);
+ return error;
+}
+
+static int btrfs_init_debugfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
+ if (!btrfs_debugfs_root_dentry)
+ return -ENOMEM;
+
+ debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
+ &btrfs_debugfs_test);
+#endif
+ return 0;
+}
+
+int btrfs_init_sysfs(void)
+{
+ int ret;
+
+ btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
+ if (!btrfs_kset)
+ return -ENOMEM;
+
+ ret = btrfs_init_debugfs();
+ if (ret)
+ goto out1;
+
+ init_feature_attrs();
+ ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+ if (ret)
+ goto out2;
+
+ return 0;
+out2:
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
+out1:
+ kset_unregister(btrfs_kset);
+
+ return ret;
+}
+
+void btrfs_exit_sysfs(void)
+{
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+ kset_unregister(btrfs_kset);
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
+}
+
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
new file mode 100644
index 000000000..3a4bbed72
--- /dev/null
+++ b/fs/btrfs/sysfs.h
@@ -0,0 +1,89 @@
+#ifndef _BTRFS_SYSFS_H_
+#define _BTRFS_SYSFS_H_
+
+/*
+ * Data exported through sysfs
+ */
+extern u64 btrfs_debugfs_test;
+
+enum btrfs_feature_set {
+ FEAT_COMPAT,
+ FEAT_COMPAT_RO,
+ FEAT_INCOMPAT,
+ FEAT_MAX
+};
+
+#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
+{ \
+ .attr = { .name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
+
+#define BTRFS_ATTR_RW(_name, _show, _store) \
+ static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define BTRFS_ATTR(_name, _show) \
+ static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
+#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
+
+#define BTRFS_RAID_ATTR(_name, _show) \
+ static struct kobj_attribute btrfs_raid_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
+#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
+
+
+struct btrfs_feature_attr {
+ struct kobj_attribute kobj_attr;
+ enum btrfs_feature_set feature_set;
+ u64 feature_bit;
+};
+
+#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \
+static struct btrfs_feature_attr btrfs_attr_##_name = { \
+ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
+ btrfs_feature_attr_show, \
+ btrfs_feature_attr_store), \
+ .feature_set = _feature_set, \
+ .feature_bit = _prefix ##_## _feature_bit, \
+}
+#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr)
+
+#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
+
+/* convert from attribute */
+static inline struct btrfs_feature_attr *
+to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+ return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+ return container_of(attr, struct kobj_attribute, attr);
+}
+
+static inline struct btrfs_feature_attr *
+attr_to_btrfs_feature_attr(struct attribute *attr)
+{
+ return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
+extern const char * const btrfs_feature_set_names[3];
+extern struct kobj_type space_info_ktype;
+extern struct kobj_type btrfs_raid_ktype;
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
+#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
new file mode 100644
index 000000000..9626252ee
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
+static struct vfsmount *test_mnt = NULL;
+
+static const struct super_operations btrfs_test_super_ops = {
+ .alloc_inode = btrfs_alloc_inode,
+ .destroy_inode = btrfs_test_destroy_inode,
+};
+
+static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
+{
+ return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
+ NULL, BTRFS_TEST_MAGIC);
+}
+
+static struct file_system_type test_type = {
+ .name = "btrfs_test_fs",
+ .mount = btrfs_test_mount,
+ .kill_sb = kill_anon_super,
+};
+
+struct inode *btrfs_new_test_inode(void)
+{
+ return new_inode(test_mnt->mnt_sb);
+}
+
+int btrfs_init_test_fs(void)
+{
+ int ret;
+
+ ret = register_filesystem(&test_type);
+ if (ret) {
+ printk(KERN_ERR "btrfs: cannot register test file system\n");
+ return ret;
+ }
+
+ test_mnt = kern_mount(&test_type);
+ if (IS_ERR(test_mnt)) {
+ printk(KERN_ERR "btrfs: cannot mount test file system\n");
+ unregister_filesystem(&test_type);
+ return ret;
+ }
+ return 0;
+}
+
+void btrfs_destroy_test_fs(void)
+{
+ kern_unmount(test_mnt);
+ unregister_filesystem(&test_type);
+}
+
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
+{
+ struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+ GFP_NOFS);
+
+ if (!fs_info)
+ return fs_info;
+ fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+ GFP_NOFS);
+ if (!fs_info->fs_devices) {
+ kfree(fs_info);
+ return NULL;
+ }
+ fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
+ GFP_NOFS);
+ if (!fs_info->super_copy) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ if (init_srcu_struct(&fs_info->subvol_srcu)) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info->super_copy);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->qgroup_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ mutex_init(&fs_info->qgroup_rescan_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
+ fs_info->running_transaction = NULL;
+ fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_ulist = NULL;
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ return fs_info;
+}
+
+static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+
+ spin_lock(&fs_info->buffer_lock);
+restart:
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+ struct extent_buffer *eb;
+
+ eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+ if (!eb)
+ continue;
+ /* Shouldn't happen but that kind of thinking creates CVE's */
+ if (radix_tree_exception(eb)) {
+ if (radix_tree_deref_retry(eb))
+ goto restart;
+ continue;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+ free_extent_buffer_stale(eb);
+ spin_lock(&fs_info->buffer_lock);
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ btrfs_free_qgroup_config(fs_info);
+ btrfs_free_fs_roots(fs_info);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+}
+
+void btrfs_free_dummy_root(struct btrfs_root *root)
+{
+ if (!root)
+ return;
+ if (root->node)
+ free_extent_buffer(root->node);
+ if (root->fs_info)
+ btrfs_free_dummy_fs_info(root->fs_info);
+ kfree(root);
+}
+
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
new file mode 100644
index 000000000..fd3954224
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_TESTS
+#define __BTRFS_TESTS
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+
+#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
+
+struct btrfs_root;
+
+int btrfs_test_free_space_cache(void);
+int btrfs_test_extent_buffer_operations(void);
+int btrfs_test_extent_io(void);
+int btrfs_test_inodes(void);
+int btrfs_test_qgroups(void);
+int btrfs_init_test_fs(void);
+void btrfs_destroy_test_fs(void);
+struct inode *btrfs_new_test_inode(void);
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+void btrfs_free_dummy_root(struct btrfs_root *root);
+#else
+static inline int btrfs_test_free_space_cache(void)
+{
+ return 0;
+}
+static inline int btrfs_test_extent_buffer_operations(void)
+{
+ return 0;
+}
+static inline int btrfs_init_test_fs(void)
+{
+ return 0;
+}
+static inline void btrfs_destroy_test_fs(void)
+{
+}
+static inline int btrfs_test_extent_io(void)
+{
+ return 0;
+}
+static inline int btrfs_test_inodes(void)
+{
+ return 0;
+}
+static inline int btrfs_test_qgroups(void)
+{
+ return 0;
+}
+#endif
+
+#endif
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
new file mode 100644
index 000000000..f51963a8f
--- /dev/null
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../extent_io.h"
+#include "../disk-io.h"
+
+static int test_btrfs_split_item(void)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct extent_buffer *eb;
+ struct btrfs_item *item;
+ char *value = "mary had a little lamb";
+ char *split1 = "mary had a little";
+ char *split2 = " lamb";
+ char *split3 = "mary";
+ char *split4 = " had a little";
+ char buf[32];
+ struct btrfs_key key;
+ u32 value_len = strlen(value);
+ int ret = 0;
+
+ test_msg("Running btrfs_split_item tests\n");
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Could not allocate root\n");
+ return PTR_ERR(root);
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Could not allocate path\n");
+ kfree(root);
+ return -ENOMEM;
+ }
+
+ path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
+ if (!eb) {
+ test_msg("Could not allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ path->slots[0] = 0;
+
+ key.objectid = 0;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = 0;
+
+ setup_items_for_insert(root, path, &key, &value_len, value_len,
+ value_len + sizeof(struct btrfs_item), 1);
+ item = btrfs_item_nr(0);
+ write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
+ value_len);
+
+ key.offset = 3;
+
+ /*
+ * Passing NULL trans here should be safe because we have plenty of
+ * space in this leaf to split the item without having to split the
+ * leaf.
+ */
+ ret = btrfs_split_item(NULL, root, path, &key, 17);
+ if (ret) {
+ test_msg("Split item failed %d\n", ret);
+ goto out;
+ }
+
+ /*
+ * Read the first slot, it should have the original key and contain only
+ * 'mary had a little'
+ */
+ btrfs_item_key_to_cpu(eb, &key, 0);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 0) {
+ test_msg("Invalid key at slot 0\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(0);
+ if (btrfs_item_size(eb, item) != strlen(split1)) {
+ test_msg("Invalid len in the first split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+ strlen(split1));
+ if (memcmp(buf, split1, strlen(split1))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the first split have='%.*s' want '%s'\n",
+ (int)strlen(split1), buf, split1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 1);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 3) {
+ test_msg("Invalid key at slot 1\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(1);
+ if (btrfs_item_size(eb, item) != strlen(split2)) {
+ test_msg("Invalid len in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+ strlen(split2));
+ if (memcmp(buf, split2, strlen(split2))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ key.offset = 1;
+ /* Do it again so we test memmoving the other items in the leaf */
+ ret = btrfs_split_item(NULL, root, path, &key, 4);
+ if (ret) {
+ test_msg("Second split item failed %d\n", ret);
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 0);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 0) {
+ test_msg("Invalid key at slot 0\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(0);
+ if (btrfs_item_size(eb, item) != strlen(split3)) {
+ test_msg("Invalid len in the first split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+ strlen(split3));
+ if (memcmp(buf, split3, strlen(split3))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the third split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 1);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 1) {
+ test_msg("Invalid key at slot 1\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(1);
+ if (btrfs_item_size(eb, item) != strlen(split4)) {
+ test_msg("Invalid len in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+ strlen(split4));
+ if (memcmp(buf, split4, strlen(split4))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the fourth split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 2);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 3) {
+ test_msg("Invalid key at slot 2\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(2);
+ if (btrfs_item_size(eb, item) != strlen(split2)) {
+ test_msg("Invalid len in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
+ strlen(split2));
+ if (memcmp(buf, split2, strlen(split2))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the last chunk\n");
+ ret = -EINVAL;
+ goto out;
+ }
+out:
+ btrfs_free_path(path);
+ kfree(root);
+ return ret;
+}
+
+int btrfs_test_extent_buffer_operations(void)
+{
+ test_msg("Running extent buffer operation tests");
+ return test_btrfs_split_item();
+}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
new file mode 100644
index 000000000..9e9f23681
--- /dev/null
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include "btrfs-tests.h"
+#include "../extent_io.h"
+
+#define PROCESS_UNLOCK (1 << 0)
+#define PROCESS_RELEASE (1 << 1)
+#define PROCESS_TEST_LOCKED (1 << 2)
+
+static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
+ unsigned long flags)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int count = 0;
+ int loops = 0;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long, nr_pages,
+ ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+ if (flags & PROCESS_TEST_LOCKED &&
+ !PageLocked(pages[i]))
+ count++;
+ if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ if (flags & PROCESS_RELEASE)
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ loops++;
+ if (loops > 100000) {
+ printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+ break;
+ }
+ }
+ return count;
+}
+
+static int test_find_delalloc(void)
+{
+ struct inode *inode;
+ struct extent_io_tree tmp;
+ struct page *page;
+ struct page *locked_page = NULL;
+ unsigned long index = 0;
+ u64 total_dirty = 256 * 1024 * 1024;
+ u64 max_bytes = 128 * 1024 * 1024;
+ u64 start, end, test_start;
+ u64 found;
+ int ret = -EINVAL;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Failed to allocate test inode\n");
+ return -ENOMEM;
+ }
+
+ extent_io_tree_init(&tmp, &inode->i_data);
+
+ /*
+ * First go through and create and mark all of our pages dirty, we pin
+ * everything to make sure our pages don't get evicted and screw up our
+ * test.
+ */
+ for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ test_msg("Failed to allocate test page\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ SetPageDirty(page);
+ if (index) {
+ unlock_page(page);
+ } else {
+ page_cache_get(page);
+ locked_page = page;
+ }
+ }
+
+ /* Test this scenario
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+ start = 0;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Should have found at least one delalloc\n");
+ goto out_bits;
+ }
+ if (start != 0 || end != 4095) {
+ test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
+ start, end);
+ goto out_bits;
+ }
+ unlock_extent(&tmp, start, end);
+ unlock_page(locked_page);
+ page_cache_release(locked_page);
+
+ /*
+ * Test this scenario
+ *
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ test_start = 64 * 1024 * 1024;
+ locked_page = find_lock_page(inode->i_mapping,
+ test_start >> PAGE_CACHE_SHIFT);
+ if (!locked_page) {
+ test_msg("Couldn't find the locked page\n");
+ goto out_bits;
+ }
+ set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+ start = test_start;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Couldn't find delalloc in our range\n");
+ goto out_bits;
+ }
+ if (start != test_start || end != max_bytes - 1) {
+ test_msg("Expected start %Lu end %Lu, got start %Lu, end "
+ "%Lu\n", test_start, max_bytes - 1, start, end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end,
+ PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+ test_msg("There were unlocked pages in the range\n");
+ goto out_bits;
+ }
+ unlock_extent(&tmp, start, end);
+ /* locked_page was unlocked above */
+ page_cache_release(locked_page);
+
+ /*
+ * Test this scenario
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ test_start = max_bytes + 4096;
+ locked_page = find_lock_page(inode->i_mapping, test_start >>
+ PAGE_CACHE_SHIFT);
+ if (!locked_page) {
+ test_msg("Could'nt find the locked page\n");
+ goto out_bits;
+ }
+ start = test_start;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (found) {
+ test_msg("Found range when we shouldn't have\n");
+ goto out_bits;
+ }
+ if (end != (u64)-1) {
+ test_msg("Did not return the proper end offset\n");
+ goto out_bits;
+ }
+
+ /*
+ * Test this scenario
+ * [------- delalloc -------|
+ * [max_bytes]|-- search--|
+ *
+ * We are re-using our test_start from above since it works out well.
+ */
+ set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+ start = test_start;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Didn't find our range\n");
+ goto out_bits;
+ }
+ if (start != test_start || end != total_dirty - 1) {
+ test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+ test_start, total_dirty - 1, start, end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end,
+ PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+ test_msg("Pages in range were not all locked\n");
+ goto out_bits;
+ }
+ unlock_extent(&tmp, start, end);
+
+ /*
+ * Now to test where we run into a page that is no longer dirty in the
+ * range we want to find.
+ */
+ page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
+ >> PAGE_CACHE_SHIFT);
+ if (!page) {
+ test_msg("Couldn't find our page\n");
+ goto out_bits;
+ }
+ ClearPageDirty(page);
+ page_cache_release(page);
+
+ /* We unlocked it in the previous test */
+ lock_page(locked_page);
+ start = test_start;
+ end = 0;
+ /*
+ * Currently if we fail to find dirty pages in the delalloc range we
+ * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If
+ * this changes at any point in the future we will need to fix this
+ * tests expected behavior.
+ */
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Didn't find our range\n");
+ goto out_bits;
+ }
+ if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) {
+ test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+ test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+ end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
+ PROCESS_UNLOCK)) {
+ test_msg("Pages in range were not all locked\n");
+ goto out_bits;
+ }
+ ret = 0;
+out_bits:
+ clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
+out:
+ if (locked_page)
+ page_cache_release(locked_page);
+ process_page_range(inode, 0, total_dirty - 1,
+ PROCESS_UNLOCK | PROCESS_RELEASE);
+ iput(inode);
+ return ret;
+}
+
+int btrfs_test_extent_io(void)
+{
+ test_msg("Running find delalloc tests\n");
+ return test_find_delalloc();
+}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
new file mode 100644
index 000000000..2299bfde3
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -0,0 +1,909 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../free-space-cache.h"
+
+#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+static struct btrfs_block_group_cache *init_test_block_group(void)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->key.objectid = 0;
+ cache->key.offset = 1024 * 1024 * 1024;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->sectorsize = 4096;
+ cache->full_stripe_len = 4096;
+
+ spin_lock_init(&cache->lock);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+
+ btrfs_init_free_space_ctl(cache);
+
+ return cache;
+}
+
+/*
+ * This test just does basic sanity checking, making sure we can add an exten
+ * entry and remove space from either end and the middle, and make sure we can
+ * remove space that covers adjacent extent entries.
+ */
+static int test_extents(struct btrfs_block_group_cache *cache)
+{
+ int ret = 0;
+
+ test_msg("Running extent only tests\n");
+
+ /* First just make sure we can remove an entire entry */
+ ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error adding initial extents %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing extent %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ test_msg("Full remove left some lingering space\n");
+ return -1;
+ }
+
+ /* Ok edge and middle cases now */
+ ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error adding half extent %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing tail end %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing front end %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+ if (ret) {
+ test_msg("Error removing middle piece %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ test_msg("Still have space at the front\n");
+ return -1;
+ }
+
+ if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) {
+ test_msg("Still have space in the middle\n");
+ return -1;
+ }
+
+ if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+ test_msg("Still have space at the end\n");
+ return -1;
+ }
+
+ /* Cleanup */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ return 0;
+}
+
+static int test_bitmaps(struct btrfs_block_group_cache *cache)
+{
+ u64 next_bitmap_offset;
+ int ret;
+
+ test_msg("Running bitmap only tests\n");
+
+ ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't create a bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing bitmap full range %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ test_msg("Left some space in bitmap\n");
+ return -1;
+ }
+
+ ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add to our bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove middle chunk %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * The first bitmap we have starts at offset 0 so the next one is just
+ * at the end of the first bitmap.
+ */
+ next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+
+ /* Test a bit straddling two bitmaps */
+ ret = test_add_free_space_entry(cache, next_bitmap_offset -
+ (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add space that straddles two bitmaps %d\n",
+ ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, next_bitmap_offset -
+ (1 * 1024 * 1024), 2 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove overlapping space %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
+ 2 * 1024 * 1024)) {
+ test_msg("Left some space when removing overlapping\n");
+ return -1;
+ }
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ return 0;
+}
+
+/* This is the high grade jackassery */
+static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
+{
+ u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+ int ret;
+
+ test_msg("Running bitmap and extent tests\n");
+
+ /*
+ * First let's do something simple, an extent at the same offset as the
+ * bitmap, but the free space completely in the extent and then
+ * completely in the bitmap.
+ */
+ ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't create bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove extent entry %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ test_msg("Left remnants after our remove\n");
+ return -1;
+ }
+
+ /* Now to add back the extent entry and remove from the bitmap */
+ ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't re-add extent entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove from bitmap %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+ test_msg("Left remnants in the bitmap\n");
+ return -1;
+ }
+
+ /*
+ * Ok so a little more evil, extent entry and bitmap at the same offset,
+ * removing an overlapping chunk.
+ */
+ ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add to a bitmap %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+ if (ret) {
+ test_msg("Couldn't remove overlapping space %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+ test_msg("Left over pieces after removing overlapping\n");
+ return -1;
+ }
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ /* Now with the extent entry offset into the bitmap */
+ ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add space to the bitmap %d\n", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent to the cache %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+ if (ret) {
+ test_msg("Problem removing overlapping space %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+ test_msg("Left something behind when removing space");
+ return -1;
+ }
+
+ /*
+ * This has blown up in the past, the extent entry starts before the
+ * bitmap entry, but we're trying to remove an offset that falls
+ * completely within the bitmap range and is in both the extent entry
+ * and the bitmap entry, looks like this
+ *
+ * [ extent ]
+ * [ bitmap ]
+ * [ del ]
+ */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
+ 4 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap %d\n", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
+ 5 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
+ 5 * 1024 * 1024);
+ if (ret) {
+ test_msg("Failed to free our space %d\n", ret);
+ return ret;
+ }
+
+ if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
+ 5 * 1024 * 1024)) {
+ test_msg("Left stuff over\n");
+ return -1;
+ }
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ /*
+ * This blew up before, we have part of the free space in a bitmap and
+ * then the entirety of the rest of the space in an extent. This used
+ * to return -EAGAIN back from btrfs_remove_extent, make sure this
+ * doesn't happen.
+ */
+ ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+ if (ret) {
+ test_msg("Error removing bitmap and extent overlapping %d\n", ret);
+ return ret;
+ }
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ return 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ return ctl->free_extents > 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int
+check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
+ const int num_extents,
+ const int num_bitmaps)
+{
+ if (cache->free_space_ctl->free_extents != num_extents) {
+ test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ cache->free_space_ctl->free_extents, num_extents);
+ return -EINVAL;
+ }
+ if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
+ test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
+ cache->free_space_ctl->total_bitmaps, num_bitmaps);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Used by test_steal_space_from_bitmap_to_extent(). */
+static int check_cache_empty(struct btrfs_block_group_cache *cache)
+{
+ u64 offset;
+ u64 max_extent_size;
+
+ /*
+ * Now lets confirm that there's absolutely no free space left to
+ * allocate.
+ */
+ if (cache->free_space_ctl->free_space != 0) {
+ test_msg("Cache free space is not 0\n");
+ return -EINVAL;
+ }
+
+ /* And any allocation request, no matter how small, should fail now. */
+ offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
+ &max_extent_size);
+ if (offset != 0) {
+ test_msg("Space allocation did not fail, returned offset: %llu",
+ offset);
+ return -EINVAL;
+ }
+
+ /* And no extent nor bitmap entries in the cache anymore. */
+ return check_num_extents_and_bitmaps(cache, 0, 0);
+}
+
+/*
+ * Before we were able to steal free space from a bitmap entry to an extent
+ * entry, we could end up with 2 entries representing a contiguous free space.
+ * One would be an extent entry and the other a bitmap entry. Since in order
+ * to allocate space to a caller we use only 1 entry, we couldn't return that
+ * whole range to the caller if it was requested. This forced the caller to
+ * either assume ENOSPC or perform several smaller space allocations, which
+ * wasn't optimal as they could be spread all over the block group while under
+ * concurrency (extra overhead and fragmentation).
+ *
+ * This stealing approach is benefical, since we always prefer to allocate from
+ * extent entries, both for clustered and non-clustered allocation requests.
+ */
+static int
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
+{
+ int ret;
+ u64 offset;
+ u64 max_extent_size;
+
+ bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
+ struct btrfs_free_space *);
+
+ test_msg("Running space stealing from bitmap to extent\n");
+
+ /*
+ * For this test, we want to ensure we end up with an extent entry
+ * immediately adjacent to a bitmap entry, where the bitmap starts
+ * at an offset where the extent entry ends. We keep adding and
+ * removing free space to reach into this state, but to get there
+ * we need to reach a point where marking new free space doesn't
+ * result in adding new extent entries or merging the new space
+ * with existing extent entries - the space ends up being marked
+ * in an existing bitmap that covers the new free space range.
+ *
+ * To get there, we need to reach the threshold defined set at
+ * cache->free_space_ctl->extents_thresh, which currently is
+ * 256 extents on a x86_64 system at least, and a few other
+ * conditions (check free_space_cache.c). Instead of making the
+ * test much longer and complicated, use a "use_bitmap" operation
+ * that forces use of bitmaps as soon as we have at least 1
+ * extent entry.
+ */
+ use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
+ cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+
+ /*
+ * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
+ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 128 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
+ 128 * 1024 * 1024 - 512 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the first 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb - 256Kb, 128Mb - 128Kb[
+ * [128Mb + 512Kb, 128Mb + 768Kb[
+ */
+ ret = btrfs_remove_free_space(cache,
+ 128 * 1024 * 1024 + 768 * 1024,
+ 128 * 1024 * 1024 - 768 * 1024);
+ if (ret) {
+ test_msg("Failed to free part of bitmap space %d\n", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 128 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
+ 256 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
+ 128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Bitmap region not removed from space cache\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
+ 256 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
+ * by the bitmap too, isn't marked as free either.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024,
+ 256 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+ test_msg("Bitmap region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the right of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
+ 4096);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
+ 128 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
+ 128 * 1024)) {
+ test_msg("Extent region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 4Kb free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb - 256Kb, 128Mb[
+ * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
+ 1 * 1024 * 1024)) {
+ test_msg("Expected region not marked as free\n");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+ test_msg("Cache free space is not 1Mb + 4Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 1 * 1024 * 1024, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+ test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != 4096) {
+ test_msg("Cache free space is not 4Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 4096, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+ test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ /*
+ * Now test a similar scenario, but where our extent entry is located
+ * to the right of the bitmap entry, so that we can check that stealing
+ * space from a bitmap to the front of an extent entry works.
+ */
+
+ /*
+ * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
+ */
+ ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
+ 128 * 1024, 0);
+ if (ret) {
+ test_msg("Couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+
+ /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
+ ret = test_add_free_space_entry(cache, 0,
+ 128 * 1024 * 1024 - 512 * 1024, 1);
+ if (ret) {
+ test_msg("Couldn't add bitmap entry %d\n", ret);
+ return ret;
+ }
+
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now make only the last 256Kb of the bitmap marked as free, so that
+ * we end up with only the following ranges marked as free space:
+ *
+ * [128Mb + 128b, 128Mb + 256Kb[
+ * [128Mb - 768Kb, 128Mb - 512Kb[
+ */
+ ret = btrfs_remove_free_space(cache,
+ 0,
+ 128 * 1024 * 1024 - 768 * 1024);
+ if (ret) {
+ test_msg("Failed to free part of bitmap space %d\n", ret);
+ return ret;
+ }
+
+ /* Confirm that only those 2 ranges are marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
+ 128 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+ 256 * 1024)) {
+ test_msg("Free space range missing\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
+ * as free anymore.
+ */
+ if (test_check_exists(cache, 0,
+ 128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Bitmap region not removed from space cache\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
+ * covered by the bitmap, isn't marked as free.
+ */
+ if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024)) {
+ test_msg("Invalid bitmap region marked as free\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But,
+ * lets make sure the free space cache marks it as free in the bitmap,
+ * and doesn't insert a new extent entry to represent this region.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
+ 512 * 1024)) {
+ test_msg("Bitmap region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that no new extent entries or bitmap entries were added to
+ * the cache after adding that free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Now lets add a small free space region to the left of the previous
+ * one, which is not contiguous with it and is part of the bitmap too.
+ * The goal is to test that the bitmap entry space stealing doesn't
+ * steal this space region.
+ */
+ ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
+ * expand the range covered by the existing extent entry that represents
+ * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
+ */
+ ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+ if (ret) {
+ test_msg("Error adding free space: %d\n", ret);
+ return ret;
+ }
+ /* Confirm the region is marked as free. */
+ if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+ test_msg("Extent region not marked as free\n");
+ return -ENOENT;
+ }
+
+ /*
+ * Confirm that our extent entry didn't stole all free space from the
+ * bitmap, because of the small 8Kb free space region.
+ */
+ ret = check_num_extents_and_bitmaps(cache, 2, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
+ * space. Without stealing bitmap free space into extent entry space,
+ * we would have all this free space represented by 2 entries in the
+ * cache:
+ *
+ * extent entry covering range: [128Mb, 128Mb + 256Kb[
+ * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
+ *
+ * Attempting to allocate the whole free space (1Mb) would fail, because
+ * we can't allocate from multiple entries.
+ * With the bitmap free space stealing, we get a single extent entry
+ * that represents the 1Mb free space, and therefore we're able to
+ * allocate the whole free space at once.
+ */
+ if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
+ 1 * 1024 * 1024)) {
+ test_msg("Expected region not marked as free\n");
+ return -ENOENT;
+ }
+
+ if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+ test_msg("Cache free space is not 1Mb + 8Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 1 * 1024 * 1024, 0,
+ &max_extent_size);
+ if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+ test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ /* All that remains is a 8Kb free space region in a bitmap. Confirm. */
+ ret = check_num_extents_and_bitmaps(cache, 1, 1);
+ if (ret)
+ return ret;
+
+ if (cache->free_space_ctl->free_space != 8192) {
+ test_msg("Cache free space is not 8Kb\n");
+ return -EINVAL;
+ }
+
+ offset = btrfs_find_space_for_alloc(cache,
+ 0, 8192, 0,
+ &max_extent_size);
+ if (offset != (32 * 1024 * 1024)) {
+ test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
+ offset);
+ return -EINVAL;
+ }
+
+ ret = check_cache_empty(cache);
+ if (ret)
+ return ret;
+
+ cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+ return 0;
+}
+
+int btrfs_test_free_space_cache(void)
+{
+ struct btrfs_block_group_cache *cache;
+ int ret;
+
+ test_msg("Running btrfs free space cache tests\n");
+
+ cache = init_test_block_group();
+ if (!cache) {
+ test_msg("Couldn't run the tests\n");
+ return 0;
+ }
+
+ ret = test_extents(cache);
+ if (ret)
+ goto out;
+ ret = test_bitmaps(cache);
+ if (ret)
+ goto out;
+ ret = test_bitmaps_and_extents(cache);
+ if (ret)
+ goto out;
+
+ ret = test_steal_space_from_bitmap_to_extent(cache);
+out:
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+ test_msg("Free space cache tests finished\n");
+ return ret;
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
new file mode 100644
index 000000000..054fc0d97
--- /dev/null
+++ b/fs/btrfs/tests/inode-tests.c
@@ -0,0 +1,1123 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../btrfs_inode.h"
+#include "../disk-io.h"
+#include "../extent_io.h"
+#include "../volumes.h"
+
+static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
+ u64 ram_bytes, u64 offset, u64 disk_bytenr,
+ u64 disk_len, u32 type, u8 compression, int slot)
+{
+ struct btrfs_path path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf = root->node;
+ struct btrfs_key key;
+ u32 value_len = sizeof(struct btrfs_file_extent_item);
+
+ if (type == BTRFS_FILE_EXTENT_INLINE)
+ value_len += len;
+ memset(&path, 0, sizeof(path));
+
+ path.nodes[0] = leaf;
+ path.slots[0] = slot;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ setup_items_for_insert(root, &path, &key, &value_len, value_len,
+ value_len + sizeof(struct btrfs_item), 1);
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, 1);
+ btrfs_set_file_extent_type(leaf, fi, type);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len);
+ btrfs_set_file_extent_offset(leaf, fi, offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi, len);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+ btrfs_set_file_extent_compression(leaf, fi, compression);
+ btrfs_set_file_extent_encryption(leaf, fi, 0);
+ btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+}
+
+static void insert_inode_item_key(struct btrfs_root *root)
+{
+ struct btrfs_path path;
+ struct extent_buffer *leaf = root->node;
+ struct btrfs_key key;
+ u32 value_len = 0;
+
+ memset(&path, 0, sizeof(path));
+
+ path.nodes[0] = leaf;
+ path.slots[0] = 0;
+
+ key.objectid = BTRFS_INODE_ITEM_KEY;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ setup_items_for_insert(root, &path, &key, &value_len, value_len,
+ value_len + sizeof(struct btrfs_item), 1);
+}
+
+/*
+ * Build the most complicated map of extents the earth has ever seen. We want
+ * this so we can test all of the corner cases of btrfs_get_extent. Here is a
+ * diagram of how the extents will look though this may not be possible we still
+ * want to make sure everything acts normally (the last number is not inclusive)
+ *
+ * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288]
+ * [hole ][inline][ hole ][ regular ][regular1 split][ hole ]
+ *
+ * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056]
+ * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ]
+ *
+ * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ]
+ * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent]
+ *
+ * [81920-86016]
+ * [ regular ]
+ */
+static void setup_file_extents(struct btrfs_root *root)
+{
+ int slot = 0;
+ u64 disk_bytenr = 1 * 1024 * 1024;
+ u64 offset = 0;
+
+ /* First we want a hole */
+ insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
+ slot++;
+ offset += 5;
+
+ /*
+ * Now we want an inline extent, I don't think this is possible but hey
+ * why not? Also keep in mind if we have an inline extent it counts as
+ * the whole first page. If we were to expand it we would have to cow
+ * and we wouldn't have an inline extent anymore.
+ */
+ insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+ slot);
+ slot++;
+ offset = 4096;
+
+ /* Now another hole */
+ insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
+ slot++;
+ offset += 4;
+
+ /* Now for a regular extent */
+ insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ disk_bytenr += 4096;
+ offset += 4095;
+
+ /*
+ * Now for 3 extents that were split from a hole punch so we test
+ * offsets properly.
+ */
+ insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG,
+ 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 8192;
+ disk_bytenr += 16384;
+
+ /* Now for a unwritten prealloc extent */
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 4096;
+
+ /*
+ * We want to jack up disk_bytenr a little more so the em stuff doesn't
+ * merge our records.
+ */
+ disk_bytenr += 8192;
+
+ /*
+ * Now for a partially written prealloc extent, basically the same as
+ * the hole punch example above. Ram_bytes never changes when you mark
+ * extents written btw.
+ */
+ insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 8192;
+ disk_bytenr += 16384;
+
+ /* Now a normal compressed extent */
+ insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 8192;
+ /* No merges */
+ disk_bytenr += 8192;
+
+ /* Now a split compressed extent */
+ insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 8192;
+ disk_bytenr += 8192;
+
+ /* Now extents that have a hole but no hole extent */
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 16384;
+ disk_bytenr += 4096;
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+}
+
+static unsigned long prealloc_only = 0;
+static unsigned long compressed_only = 0;
+static unsigned long vacancy_only = 0;
+
+static noinline int test_btrfs_get_extent(void)
+{
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_map *em = NULL;
+ u64 orig_start;
+ u64 disk_bytenr;
+ u64 offset;
+ int ret = -ENOMEM;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Couldn't allocate inode\n");
+ return ret;
+ }
+
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.offset = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ goto out;
+ }
+
+ /*
+ * We do this since btrfs_get_extent wants to assign em->bdev to
+ * root->fs_info->fs_devices->latest_bdev.
+ */
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ goto out;
+ }
+
+ root->node = alloc_dummy_extent_buffer(NULL, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ goto out;
+ }
+
+ /*
+ * We will just free a dummy node if it's ref count is 2 so we need an
+ * extra ref so our searches don't accidently release our page.
+ */
+ extent_buffer_get(root->node);
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ ret = -EINVAL;
+
+ /* First with no extents */
+ BTRFS_I(inode)->root = root;
+ em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0);
+ if (IS_ERR(em)) {
+ em = NULL;
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+ test_msg("Vacancy flag wasn't set properly\n");
+ goto out;
+ }
+ free_extent_map(em);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+
+ /*
+ * All of the magic numbers are based on the mapping setup in
+ * setup_file_extents, so if you change anything there you need to
+ * update the comment and update the expected values below.
+ */
+ setup_file_extents(root);
+
+ em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != 0 || em->len != 5) {
+ test_msg("Unexpected extent wanted start 0 len 5, got start "
+ "%llu len %llu\n", em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_INLINE) {
+ test_msg("Expected an inline, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4091) {
+ test_msg("Unexpected extent wanted start %llu len 1, got start "
+ "%llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ /*
+ * We don't test anything else for inline since it doesn't get set
+ * unless we have a page for it to write into. Maybe we should change
+ * this?
+ */
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4) {
+ test_msg("Unexpected extent wanted start %llu len 4, got start "
+ "%llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Regular extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4095) {
+ test_msg("Unexpected extent wanted start %llu len 4095, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* The next 3 are split extents */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ orig_start, em->orig_start);
+ goto out;
+ }
+ disk_bytenr += (em->start - orig_start);
+ if (em->block_start != disk_bytenr) {
+ test_msg("Wrong block start, want %llu, have %llu\n",
+ disk_bytenr, em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Prealloc extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* The next 3 are a half written prealloc extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_HOLE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+ orig_start, em->orig_start);
+ goto out;
+ }
+ if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ disk_bytenr + (em->start - em->orig_start),
+ em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+ em->orig_start);
+ goto out;
+ }
+ if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ disk_bytenr + (em->start - em->orig_start),
+ em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Now for the compressed extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ em->start, em->orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_msg("Unexpected compress type, wanted %d, got %d\n",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Split compressed extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ em->start, em->orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_msg("Unexpected compress type, wanted %d, got %d\n",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != disk_bytenr) {
+ test_msg("Block start does not match, want %llu got %llu\n",
+ disk_bytenr, em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ em->start, orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_msg("Unexpected compress type, wanted %d, got %d\n",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* A hole between regular extents but no hole extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ /*
+ * Currently we just return a length that we requested rather than the
+ * length of the actual hole, if this changes we'll have to change this
+ * test.
+ */
+ if (em->start != offset || em->len != 12288) {
+ test_msg("Unexpected extent wanted start %llu len 12288, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != vacancy_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ vacancy_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR(em))
+ free_extent_map(em);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
+static int test_hole_first(void)
+{
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_map *em = NULL;
+ int ret = -ENOMEM;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Couldn't allocate inode\n");
+ return ret;
+ }
+
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.offset = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ goto out;
+ }
+
+ root->node = alloc_dummy_extent_buffer(NULL, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ goto out;
+ }
+
+ extent_buffer_get(root->node);
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ BTRFS_I(inode)->root = root;
+ ret = -EINVAL;
+
+ /*
+ * Need a blank inode item here just so we don't confuse
+ * btrfs_get_extent.
+ */
+ insert_inode_item_key(root);
+ insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, 1);
+ em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != 0 || em->len != 4096) {
+ test_msg("Unexpected extent wanted start 0 len 4096, got start "
+ "%llu len %llu\n", em->start, em->len);
+ goto out;
+ }
+ if (em->flags != vacancy_only) {
+ test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+ em->flags);
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != 4096) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != 4096 || em->len != 4096) {
+ test_msg("Unexpected extent wanted start 4096 len 4096, got "
+ "start %llu len %llu\n", em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, wanted 0 got %lu\n",
+ em->flags);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR(em))
+ free_extent_map(em);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
+static int test_extent_accounting(void)
+{
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ int ret = -ENOMEM;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Couldn't allocate inode\n");
+ return ret;
+ }
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ goto out;
+ }
+
+ BTRFS_I(inode)->root = root;
+ btrfs_test_inode_set_ops(inode);
+
+ /* [BTRFS_MAX_EXTENT_SIZE] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
+ NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 1) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 1, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE][4k] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+ BTRFS_MAX_EXTENT_SIZE + 4095, NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 2, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ EXTENT_DELALLOC | EXTENT_DIRTY |
+ EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
+ NULL, GFP_NOFS);
+ if (ret) {
+ test_msg("clear_extent_bit returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 2, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE][4K] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 2, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /*
+ * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K]
+ *
+ * I'm artificially adding 2 to outstanding_extents because in the
+ * buffered IO case we'd add things up as we go, but I don't feel like
+ * doing that here, this isn't the interesting case we want to test.
+ */
+ BTRFS_I(inode)->outstanding_extents += 2;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192,
+ (BTRFS_MAX_EXTENT_SIZE << 1) + 12287,
+ NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 4) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 4, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+ BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 3) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 3, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE+4096,
+ BTRFS_MAX_EXTENT_SIZE+8191,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ NULL, GFP_NOFS);
+ if (ret) {
+ test_msg("clear_extent_bit returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 4) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 4, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /*
+ * Refill the hole again just for good measure, because I thought it
+ * might fail and I'd rather satisfy my paranoia at this point.
+ */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+ BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 3) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 3, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* Empty */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ NULL, GFP_NOFS);
+ if (ret) {
+ test_msg("clear_extent_bit returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 0, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (ret)
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ NULL, GFP_NOFS);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
+int btrfs_test_inodes(void)
+{
+ int ret;
+
+ set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+ set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
+ set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
+ test_msg("Running btrfs_get_extent tests\n");
+ ret = test_btrfs_get_extent();
+ if (ret)
+ return ret;
+ test_msg("Running hole first btrfs_get_extent test\n");
+ ret = test_hole_first();
+ if (ret)
+ return ret;
+ test_msg("Running outstanding_extents tests\n");
+ return test_extent_accounting();
+}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
new file mode 100644
index 000000000..c32a7ba76
--- /dev/null
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright (C) 2013 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../transaction.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
+static void init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->transid = 1;
+ INIT_LIST_HEAD(&trans->qgroup_ref_list);
+ trans->type = __TRANS_DUMMY;
+}
+
+static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_tree_block_info *block_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ ins.objectid = bytenr;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
+ if (ret) {
+ test_msg("Couldn't insert ref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, item, 1);
+ btrfs_set_extent_generation(leaf, item, 1);
+ btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ block_info = (struct btrfs_tree_block_info *)(item + 1);
+ btrfs_set_tree_block_level(leaf, block_info, 1);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ if (parent > 0) {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_SHARED_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs + 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
+ if (ret)
+ test_msg("Failed to insert backref\n");
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Didn't find our key %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs - 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Couldn't find backref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int test_no_shared_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup basic add\n");
+ ret = btrfs_create_qgroup(NULL, fs_info, 5);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_item(root, 4096, 4096);
+ if (ret)
+ return -EINVAL;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't remove space from the qgroup %d\n", ret);
+ return -EINVAL;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Add a ref for two different roots to make sure the shared value comes out
+ * right, also remove one of the roots and make sure the exclusive count is
+ * adjusted properly.
+ */
+static int test_multiple_refs(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup multiple refs test\n");
+
+ /* We have 5 created already from the previous test */
+ ret = btrfs_create_qgroup(NULL, fs_info, 256);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = add_tree_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int btrfs_test_qgroups(void)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *tmp_root;
+ int ret = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ return PTR_ERR(root);
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* We are using this root as our extent root */
+ root->fs_info->extent_root = root;
+
+ /*
+ * Some of the paths we test assume we have a filled out fs_info, so we
+ * just need to add the root in there so we don't panic.
+ */
+ root->fs_info->tree_root = root;
+ root->fs_info->quota_root = root;
+ root->fs_info->quota_enabled = 1;
+
+ /*
+ * Can't use bytenr 0, some things freak out
+ * *cough*backref walking code*cough*
+ */
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 8192;
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 5;
+ root->fs_info->fs_root = tmp_root;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 256;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ test_msg("Running qgroup tests\n");
+ ret = test_no_shared_qgroup(root);
+ if (ret)
+ goto out;
+ ret = test_multiple_refs(root);
+out:
+ btrfs_free_dummy_root(root);
+ return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000..94e909c5a
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,2204 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/uuid.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "locking.h"
+#include "tree-log.h"
+#include "inode-map.h"
+#include "volumes.h"
+#include "dev-replace.h"
+#include "qgroup.h"
+
+#define BTRFS_ROOT_TRANS_TAG 0
+
+static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+ [TRANS_STATE_RUNNING] = 0U,
+ [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START),
+ [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH),
+ [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN),
+ [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+ [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
+ __TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK),
+};
+
+void btrfs_put_transaction(struct btrfs_transaction *transaction)
+{
+ WARN_ON(atomic_read(&transaction->use_count) == 0);
+ if (atomic_dec_and_test(&transaction->use_count)) {
+ BUG_ON(!list_empty(&transaction->list));
+ WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+ if (transaction->delayed_refs.pending_csums)
+ printk(KERN_ERR "pending csums is %llu\n",
+ transaction->delayed_refs.pending_csums);
+ while (!list_empty(&transaction->pending_chunks)) {
+ struct extent_map *em;
+
+ em = list_first_entry(&transaction->pending_chunks,
+ struct extent_map, list);
+ list_del_init(&em->list);
+ free_extent_map(em);
+ }
+ kmem_cache_free(btrfs_transaction_cachep, transaction);
+ }
+}
+
+static void clear_btree_io_tree(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+
+ cond_resched_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
+}
+
+static noinline void switch_commit_roots(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root, *tmp;
+
+ down_write(&fs_info->commit_root_sem);
+ list_for_each_entry_safe(root, tmp, &trans->switch_commits,
+ dirty_list) {
+ list_del_init(&root->dirty_list);
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+ if (is_fstree(root->objectid))
+ btrfs_unpin_free_ino(root);
+ clear_btree_io_tree(&root->dirty_log_pages);
+ }
+ up_write(&fs_info->commit_root_sem);
+}
+
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ if (type & TRANS_EXTWRITERS)
+ atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+ atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
+{
+ return atomic_read(&trans->num_extwriters);
+}
+
+/*
+ * either allocate a new transaction or hop into the existing one
+ */
+static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
+{
+ struct btrfs_transaction *cur_trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ spin_lock(&fs_info->trans_lock);
+loop:
+ /* The file system has been taken offline. No new transactions. */
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ spin_unlock(&fs_info->trans_lock);
+ return -EROFS;
+ }
+
+ cur_trans = fs_info->running_transaction;
+ if (cur_trans) {
+ if (cur_trans->aborted) {
+ spin_unlock(&fs_info->trans_lock);
+ return cur_trans->aborted;
+ }
+ if (btrfs_blocked_trans_types[cur_trans->state] & type) {
+ spin_unlock(&fs_info->trans_lock);
+ return -EBUSY;
+ }
+ atomic_inc(&cur_trans->use_count);
+ atomic_inc(&cur_trans->num_writers);
+ extwriter_counter_inc(cur_trans, type);
+ spin_unlock(&fs_info->trans_lock);
+ return 0;
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * If we are ATTACH, we just want to catch the current transaction,
+ * and commit it. If there is no transaction, just return ENOENT.
+ */
+ if (type == TRANS_ATTACH)
+ return -ENOENT;
+
+ /*
+ * JOIN_NOLOCK only happens during the transaction commit, so
+ * it is impossible that ->running_transaction is NULL
+ */
+ BUG_ON(type == TRANS_JOIN_NOLOCK);
+
+ cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+ if (!cur_trans)
+ return -ENOMEM;
+
+ spin_lock(&fs_info->trans_lock);
+ if (fs_info->running_transaction) {
+ /*
+ * someone started a transaction after we unlocked. Make sure
+ * to redo the checks above
+ */
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ goto loop;
+ } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ spin_unlock(&fs_info->trans_lock);
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ return -EROFS;
+ }
+
+ atomic_set(&cur_trans->num_writers, 1);
+ extwriter_counter_init(cur_trans, type);
+ init_waitqueue_head(&cur_trans->writer_wait);
+ init_waitqueue_head(&cur_trans->commit_wait);
+ cur_trans->state = TRANS_STATE_RUNNING;
+ /*
+ * One for this trans handle, one so it will live on until we
+ * commit the transaction.
+ */
+ atomic_set(&cur_trans->use_count, 2);
+ cur_trans->have_free_bgs = 0;
+ cur_trans->start_time = get_seconds();
+ cur_trans->dirty_bg_run = 0;
+
+ cur_trans->delayed_refs.href_root = RB_ROOT;
+ atomic_set(&cur_trans->delayed_refs.num_entries, 0);
+ cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.pending_csums = 0;
+ cur_trans->delayed_refs.num_heads = 0;
+ cur_trans->delayed_refs.flushing = 0;
+ cur_trans->delayed_refs.run_delayed_start = 0;
+
+ /*
+ * although the tree mod log is per file system and not per transaction,
+ * the log must never go across transaction boundaries.
+ */
+ smp_mb();
+ if (!list_empty(&fs_info->tree_mod_seq_list))
+ WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when "
+ "creating a fresh transaction\n");
+ if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
+ WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when "
+ "creating a fresh transaction\n");
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+
+ spin_lock_init(&cur_trans->delayed_refs.lock);
+
+ INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+ INIT_LIST_HEAD(&cur_trans->pending_chunks);
+ INIT_LIST_HEAD(&cur_trans->switch_commits);
+ INIT_LIST_HEAD(&cur_trans->pending_ordered);
+ INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+ INIT_LIST_HEAD(&cur_trans->io_bgs);
+ mutex_init(&cur_trans->cache_write_mutex);
+ cur_trans->num_dirty_bgs = 0;
+ spin_lock_init(&cur_trans->dirty_bgs_lock);
+ list_add_tail(&cur_trans->list, &fs_info->trans_list);
+ extent_io_tree_init(&cur_trans->dirty_pages,
+ fs_info->btree_inode->i_mapping);
+ fs_info->generation++;
+ cur_trans->transid = fs_info->generation;
+ fs_info->running_transaction = cur_trans;
+ cur_trans->aborted = 0;
+ spin_unlock(&fs_info->trans_lock);
+
+ return 0;
+}
+
+/*
+ * this does all the record keeping required to make sure that a reference
+ * counted root is properly recorded in a given transaction. This is required
+ * to make sure the old root from before we joined the transaction is deleted
+ * when the transaction commits
+ */
+static int record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ root->last_trans < trans->transid) {
+ WARN_ON(root == root->fs_info->extent_root);
+ WARN_ON(root->commit_root != root->node);
+
+ /*
+ * see below for IN_TRANS_SETUP usage rules
+ * we have the reloc mutex held now, so there
+ * is only one writer in this function
+ */
+ set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
+
+ /* make sure readers find IN_TRANS_SETUP before
+ * they find our root->last_trans update
+ */
+ smp_wmb();
+
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
+ if (root->last_trans == trans->transid) {
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ return 0;
+ }
+ radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ root->last_trans = trans->transid;
+
+ /* this is pretty tricky. We don't want to
+ * take the relocation lock in btrfs_record_root_in_trans
+ * unless we're really doing the first setup for this root in
+ * this transaction.
+ *
+ * Normally we'd use root->last_trans as a flag to decide
+ * if we want to take the expensive mutex.
+ *
+ * But, we have to set root->last_trans before we
+ * init the relocation root, otherwise, we trip over warnings
+ * in ctree.c. The solution used here is to flag ourselves
+ * with root IN_TRANS_SETUP. When this is 1, we're still
+ * fixing up the reloc trees and everyone must wait.
+ *
+ * When this is zero, they can trust root->last_trans and fly
+ * through btrfs_record_root_in_trans without having to take the
+ * lock. smp_wmb() makes sure that all the writes above are
+ * done before we pop in the zero below
+ */
+ btrfs_init_reloc_root(trans, root);
+ smp_mb__before_atomic();
+ clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
+ }
+ return 0;
+}
+
+
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ return 0;
+
+ /*
+ * see record_root_in_trans for comments about IN_TRANS_SETUP usage
+ * and barriers
+ */
+ smp_rmb();
+ if (root->last_trans == trans->transid &&
+ !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
+ return 0;
+
+ mutex_lock(&root->fs_info->reloc_mutex);
+ record_root_in_trans(trans, root);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+
+ return 0;
+}
+
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+ return (trans->state >= TRANS_STATE_BLOCKED &&
+ trans->state < TRANS_STATE_UNBLOCKED &&
+ !trans->aborted);
+}
+
+/* wait for commit against the current transaction to become unblocked
+ * when this is done, it is safe to start a new transaction, but the current
+ * transaction might not be fully on disk.
+ */
+static void wait_current_trans(struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans;
+
+ spin_lock(&root->fs_info->trans_lock);
+ cur_trans = root->fs_info->running_transaction;
+ if (cur_trans && is_transaction_blocked(cur_trans)) {
+ atomic_inc(&cur_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ wait_event(root->fs_info->transaction_wait,
+ cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+ cur_trans->aborted);
+ btrfs_put_transaction(cur_trans);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
+ }
+}
+
+static int may_wait_transaction(struct btrfs_root *root, int type)
+{
+ if (root->fs_info->log_root_recovering)
+ return 0;
+
+ if (type == TRANS_USERSPACE)
+ return 1;
+
+ if (type == TRANS_START &&
+ !atomic_read(&root->fs_info->open_ioctl_trans))
+ return 1;
+
+ return 0;
+}
+
+static inline bool need_reserve_reloc_root(struct btrfs_root *root)
+{
+ if (!root->fs_info->reloc_ctl ||
+ !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ root->reloc_root)
+ return false;
+
+ return true;
+}
+
+static struct btrfs_trans_handle *
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_trans_handle *h;
+ struct btrfs_transaction *cur_trans;
+ u64 num_bytes = 0;
+ u64 qgroup_reserved = 0;
+ bool reloc_reserved = false;
+ int ret;
+
+ /* Send isn't supposed to start transactions. */
+ ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ return ERR_PTR(-EROFS);
+
+ if (current->journal_info) {
+ WARN_ON(type & TRANS_EXTWRITERS);
+ h = current->journal_info;
+ h->use_count++;
+ WARN_ON(h->use_count > 2);
+ h->orig_rsv = h->block_rsv;
+ h->block_rsv = NULL;
+ goto got_it;
+ }
+
+ /*
+ * Do the reservation before we join the transaction so we can do all
+ * the appropriate flushing if need be.
+ */
+ if (num_items > 0 && root != root->fs_info->chunk_root) {
+ if (root->fs_info->quota_enabled &&
+ is_fstree(root->root_key.objectid)) {
+ qgroup_reserved = num_items * root->nodesize;
+ ret = btrfs_qgroup_reserve(root, qgroup_reserved);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ /*
+ * Do the reservation for the relocation root creation
+ */
+ if (need_reserve_reloc_root(root)) {
+ num_bytes += root->nodesize;
+ reloc_reserved = true;
+ }
+
+ ret = btrfs_block_rsv_add(root,
+ &root->fs_info->trans_block_rsv,
+ num_bytes, flush);
+ if (ret)
+ goto reserve_fail;
+ }
+again:
+ h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ if (!h) {
+ ret = -ENOMEM;
+ goto alloc_fail;
+ }
+
+ /*
+ * If we are JOIN_NOLOCK we're already committing a transaction and
+ * waiting on this guy, so we don't need to do the sb_start_intwrite
+ * because we're already holding a ref. We need this because we could
+ * have raced in and did an fsync() on a file which can kick a commit
+ * and then we deadlock with somebody doing a freeze.
+ *
+ * If we are ATTACH, it means we just want to catch the current
+ * transaction and commit it, so we needn't do sb_start_intwrite().
+ */
+ if (type & __TRANS_FREEZABLE)
+ sb_start_intwrite(root->fs_info->sb);
+
+ if (may_wait_transaction(root, type))
+ wait_current_trans(root);
+
+ do {
+ ret = join_transaction(root, type);
+ if (ret == -EBUSY) {
+ wait_current_trans(root);
+ if (unlikely(type == TRANS_ATTACH))
+ ret = -ENOENT;
+ }
+ } while (ret == -EBUSY);
+
+ if (ret < 0) {
+ /* We must get the transaction if we are JOIN_NOLOCK. */
+ BUG_ON(type == TRANS_JOIN_NOLOCK);
+ goto join_fail;
+ }
+
+ cur_trans = root->fs_info->running_transaction;
+
+ h->transid = cur_trans->transid;
+ h->transaction = cur_trans;
+ h->blocks_used = 0;
+ h->bytes_reserved = 0;
+ h->root = root;
+ h->delayed_ref_updates = 0;
+ h->use_count = 1;
+ h->adding_csums = 0;
+ h->block_rsv = NULL;
+ h->orig_rsv = NULL;
+ h->aborted = 0;
+ h->qgroup_reserved = 0;
+ h->delayed_ref_elem.seq = 0;
+ h->type = type;
+ h->allocating_chunk = false;
+ h->reloc_reserved = false;
+ h->sync = false;
+ INIT_LIST_HEAD(&h->qgroup_ref_list);
+ INIT_LIST_HEAD(&h->new_bgs);
+ INIT_LIST_HEAD(&h->ordered);
+
+ smp_mb();
+ if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+ may_wait_transaction(root, type)) {
+ current->journal_info = h;
+ btrfs_commit_transaction(h, root);
+ goto again;
+ }
+
+ if (num_bytes) {
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ h->transid, num_bytes, 1);
+ h->block_rsv = &root->fs_info->trans_block_rsv;
+ h->bytes_reserved = num_bytes;
+ h->reloc_reserved = reloc_reserved;
+ }
+ h->qgroup_reserved = qgroup_reserved;
+
+got_it:
+ btrfs_record_root_in_trans(h, root);
+
+ if (!current->journal_info && type != TRANS_USERSPACE)
+ current->journal_info = h;
+ return h;
+
+join_fail:
+ if (type & __TRANS_FREEZABLE)
+ sb_end_intwrite(root->fs_info->sb);
+ kmem_cache_free(btrfs_trans_handle_cachep, h);
+alloc_fail:
+ if (num_bytes)
+ btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+ num_bytes);
+reserve_fail:
+ if (qgroup_reserved)
+ btrfs_qgroup_free(root, qgroup_reserved);
+ return ERR_PTR(ret);
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+ int num_items)
+{
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_ALL);
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
+ struct btrfs_root *root, int num_items)
+{
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_LIMIT);
+}
+
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_JOIN, 0);
+}
+
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
+}
+
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_USERSPACE, 0);
+}
+
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current the transaction, but
+ * don't want to start a new one.
+ *
+ * Note: If this function return -ENOENT, it just means there is no
+ * running transaction. But it is possible that the inactive transaction
+ * is still in the memory, not fully on disk. If you hope there is no
+ * inactive transaction in the fs when -ENOENT is returned, you should
+ * invoke
+ * btrfs_attach_transaction_barrier()
+ */
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
+{
+ return start_transaction(root, 0, TRANS_ATTACH, 0);
+}
+
+/*
+ * btrfs_attach_transaction_barrier() - catch the running transaction
+ *
+ * It is similar to the above function, the differentia is this one
+ * will wait for all the inactive transactions until they fully
+ * complete.
+ */
+struct btrfs_trans_handle *
+btrfs_attach_transaction_barrier(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+
+ trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+ if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
+ btrfs_wait_for_commit(root, 0);
+
+ return trans;
+}
+
+/* wait for a transaction commit to be fully complete */
+static noinline void wait_for_commit(struct btrfs_root *root,
+ struct btrfs_transaction *commit)
+{
+ wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
+}
+
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
+{
+ struct btrfs_transaction *cur_trans = NULL, *t;
+ int ret = 0;
+
+ if (transid) {
+ if (transid <= root->fs_info->last_trans_committed)
+ goto out;
+
+ /* find specified transaction */
+ spin_lock(&root->fs_info->trans_lock);
+ list_for_each_entry(t, &root->fs_info->trans_list, list) {
+ if (t->transid == transid) {
+ cur_trans = t;
+ atomic_inc(&cur_trans->use_count);
+ ret = 0;
+ break;
+ }
+ if (t->transid > transid) {
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock(&root->fs_info->trans_lock);
+
+ /*
+ * The specified transaction doesn't exist, or we
+ * raced with btrfs_commit_transaction
+ */
+ if (!cur_trans) {
+ if (transid > root->fs_info->last_trans_committed)
+ ret = -EINVAL;
+ goto out;
+ }
+ } else {
+ /* find newest transaction that is committing | committed */
+ spin_lock(&root->fs_info->trans_lock);
+ list_for_each_entry_reverse(t, &root->fs_info->trans_list,
+ list) {
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ if (t->state == TRANS_STATE_COMPLETED)
+ break;
+ cur_trans = t;
+ atomic_inc(&cur_trans->use_count);
+ break;
+ }
+ }
+ spin_unlock(&root->fs_info->trans_lock);
+ if (!cur_trans)
+ goto out; /* nothing committing|committed */
+ }
+
+ wait_for_commit(root, cur_trans);
+ btrfs_put_transaction(cur_trans);
+out:
+ return ret;
+}
+
+void btrfs_throttle(struct btrfs_root *root)
+{
+ if (!atomic_read(&root->fs_info->open_ioctl_trans))
+ wait_current_trans(root);
+}
+
+static int should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (root->fs_info->global_block_rsv.space_info->full &&
+ btrfs_check_space_for_delayed_refs(trans, root))
+ return 1;
+
+ return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
+}
+
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int updates;
+ int err;
+
+ smp_mb();
+ if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+ cur_trans->delayed_refs.flushing)
+ return 1;
+
+ updates = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (updates) {
+ err = btrfs_run_delayed_refs(trans, root, updates * 2);
+ if (err) /* Error code will also eval true */
+ return err;
+ }
+
+ return should_end_transaction(trans, root);
+}
+
+static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int throttle)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_fs_info *info = root->fs_info;
+ unsigned long cur = trans->delayed_ref_updates;
+ int lock = (trans->type != TRANS_JOIN_NOLOCK);
+ int err = 0;
+ int must_run_delayed_refs = 0;
+
+ if (trans->use_count > 1) {
+ trans->use_count--;
+ trans->block_rsv = trans->orig_rsv;
+ return 0;
+ }
+
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
+ if (!list_empty(&trans->ordered)) {
+ spin_lock(&info->trans_lock);
+ list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
+ spin_unlock(&info->trans_lock);
+ }
+
+ trans->delayed_ref_updates = 0;
+ if (!trans->sync) {
+ must_run_delayed_refs =
+ btrfs_should_throttle_delayed_refs(trans, root);
+ cur = max_t(unsigned long, cur, 32);
+
+ /*
+ * don't make the caller wait if they are from a NOLOCK
+ * or ATTACH transaction, it will deadlock with commit
+ */
+ if (must_run_delayed_refs == 1 &&
+ (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
+ must_run_delayed_refs = 2;
+ }
+
+ if (trans->qgroup_reserved) {
+ /*
+ * the same root has to be passed here between start_transaction
+ * and end_transaction. Subvolume quota depends on this.
+ */
+ btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
+
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
+ if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
+ should_end_transaction(trans, root) &&
+ ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
+ spin_lock(&info->trans_lock);
+ if (cur_trans->state == TRANS_STATE_RUNNING)
+ cur_trans->state = TRANS_STATE_BLOCKED;
+ spin_unlock(&info->trans_lock);
+ }
+
+ if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
+ if (throttle)
+ return btrfs_commit_transaction(trans, root);
+ else
+ wake_up_process(info->transaction_kthread);
+ }
+
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(root->fs_info->sb);
+
+ WARN_ON(cur_trans != info->running_transaction);
+ WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
+ atomic_dec(&cur_trans->num_writers);
+ extwriter_counter_dec(cur_trans, trans->type);
+
+ smp_mb();
+ if (waitqueue_active(&cur_trans->writer_wait))
+ wake_up(&cur_trans->writer_wait);
+ btrfs_put_transaction(cur_trans);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ if (throttle)
+ btrfs_run_delayed_iputs(root);
+
+ if (trans->aborted ||
+ test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ wake_up_process(info->transaction_kthread);
+ err = -EIO;
+ }
+ assert_qgroups_uptodate(trans);
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ if (must_run_delayed_refs) {
+ btrfs_async_run_delayed_refs(root, cur,
+ must_run_delayed_refs == 1);
+ }
+ return err;
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_end_transaction(trans, root, 0);
+}
+
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_end_transaction(trans, root, 1);
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are sent to disk but does not wait on them
+ */
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark)
+{
+ int err = 0;
+ int werr = 0;
+ struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 end;
+
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, &cached_state)) {
+ bool wait_writeback = false;
+
+ err = convert_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ mark, &cached_state, GFP_NOFS);
+ /*
+ * convert_extent_bit can return -ENOMEM, which is most of the
+ * time a temporary error. So when it happens, ignore the error
+ * and wait for writeback of this range to finish - because we
+ * failed to set the bit EXTENT_NEED_WAIT for the range, a call
+ * to btrfs_wait_marked_extents() would not know that writeback
+ * for this range started and therefore wouldn't wait for it to
+ * finish - we don't want to commit a superblock that points to
+ * btree nodes/leafs for which writeback hasn't finished yet
+ * (and without errors).
+ * We cleanup any entries left in the io tree when committing
+ * the transaction (through clear_btree_io_tree()).
+ */
+ if (err == -ENOMEM) {
+ err = 0;
+ wait_writeback = true;
+ }
+ if (!err)
+ err = filemap_fdatawrite_range(mapping, start, end);
+ if (err)
+ werr = err;
+ else if (wait_writeback)
+ werr = filemap_fdatawait_range(mapping, start, end);
+ free_extent_state(cached_state);
+ cached_state = NULL;
+ cond_resched();
+ start = end + 1;
+ }
+ return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit. We wait
+ * on all the pages and clear them from the dirty pages state tree
+ */
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark)
+{
+ int err = 0;
+ int werr = 0;
+ struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 end;
+ struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+ bool errors = false;
+
+ while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_NEED_WAIT, &cached_state)) {
+ /*
+ * Ignore -ENOMEM errors returned by clear_extent_bit().
+ * When committing the transaction, we'll remove any entries
+ * left in the io tree. For a log commit, we don't remove them
+ * after committing the log because the tree can be accessed
+ * concurrently - we do it only at transaction commit time when
+ * it's safe to do it (through clear_btree_io_tree()).
+ */
+ err = clear_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ 0, 0, &cached_state, GFP_NOFS);
+ if (err == -ENOMEM)
+ err = 0;
+ if (!err)
+ err = filemap_fdatawait_range(mapping, start, end);
+ if (err)
+ werr = err;
+ free_extent_state(cached_state);
+ cached_state = NULL;
+ cond_resched();
+ start = end + 1;
+ }
+ if (err)
+ werr = err;
+
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if ((mark & EXTENT_DIRTY) &&
+ test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+
+ if ((mark & EXTENT_NEW) &&
+ test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+ } else {
+ if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
+ &btree_ino->runtime_flags))
+ errors = true;
+ }
+
+ if (errors && !werr)
+ werr = -EIO;
+
+ return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
+static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark)
+{
+ int ret;
+ int ret2;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+ blk_finish_plug(&plug);
+ ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
+
+ if (ret)
+ return ret;
+ if (ret2)
+ return ret2;
+ return 0;
+}
+
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ ret = btrfs_write_and_wait_marked_extents(root,
+ &trans->transaction->dirty_pages,
+ EXTENT_DIRTY);
+ clear_btree_io_tree(&trans->transaction->dirty_pages);
+
+ return ret;
+}
+
+/*
+ * this is used to update the root pointer in the tree of tree roots.
+ *
+ * But, in the case of the extent allocation tree, updating the root
+ * pointer may allocate blocks which may change the root of the extent
+ * allocation tree.
+ *
+ * So, this loops and repeats and makes sure the cowonly root didn't
+ * change while the root pointer was being updated in the metadata.
+ */
+static int update_cowonly_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+ u64 old_root_bytenr;
+ u64 old_root_used;
+ struct btrfs_root *tree_root = root->fs_info->tree_root;
+
+ old_root_used = btrfs_root_used(&root->root_item);
+
+ while (1) {
+ old_root_bytenr = btrfs_root_bytenr(&root->root_item);
+ if (old_root_bytenr == root->node->start &&
+ old_root_used == btrfs_root_used(&root->root_item))
+ break;
+
+ btrfs_set_root_node(&root->root_item, root->node);
+ ret = btrfs_update_root(trans, tree_root,
+ &root->root_key,
+ &root->root_item);
+ if (ret)
+ return ret;
+
+ old_root_used = btrfs_root_used(&root->root_item);
+ }
+
+ return 0;
+}
+
+/*
+ * update all the cowonly tree roots on disk
+ *
+ * The error handling in this function may not be obvious. Any of the
+ * failures will cause the file system to go offline. We still need
+ * to clean up the delayed refs.
+ */
+static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+ struct list_head *io_bgs = &trans->transaction->io_bgs;
+ struct list_head *next;
+ struct extent_buffer *eb;
+ int ret;
+
+ eb = btrfs_lock_root_node(fs_info->tree_root);
+ ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
+ 0, &eb);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ if (ret)
+ return ret;
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
+
+ ret = btrfs_run_dev_stats(trans, root->fs_info);
+ if (ret)
+ return ret;
+ ret = btrfs_run_dev_replace(trans, root->fs_info);
+ if (ret)
+ return ret;
+ ret = btrfs_run_qgroups(trans, root->fs_info);
+ if (ret)
+ return ret;
+
+ ret = btrfs_setup_space_cache(trans, root);
+ if (ret)
+ return ret;
+
+ /* run_qgroups might have added some more refs */
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
+again:
+ while (!list_empty(&fs_info->dirty_cowonly_roots)) {
+ next = fs_info->dirty_cowonly_roots.next;
+ list_del_init(next);
+ root = list_entry(next, struct btrfs_root, dirty_list);
+ clear_bit(BTRFS_ROOT_DIRTY, &root->state);
+
+ if (root != fs_info->extent_root)
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ ret = update_cowonly_root(trans, root);
+ if (ret)
+ return ret;
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
+ }
+
+ while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
+ ret = btrfs_write_dirty_block_groups(trans, root);
+ if (ret)
+ return ret;
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
+ }
+
+ if (!list_empty(&fs_info->dirty_cowonly_roots))
+ goto again;
+
+ list_add_tail(&fs_info->extent_root->dirty_list,
+ &trans->transaction->switch_commits);
+ btrfs_after_dev_replace_commit(fs_info);
+
+ return 0;
+}
+
+/*
+ * dead roots are old snapshots that need to be deleted. This allocates
+ * a dirty root struct and adds it into the list of dead roots that need to
+ * be deleted
+ */
+void btrfs_add_dead_root(struct btrfs_root *root)
+{
+ spin_lock(&root->fs_info->trans_lock);
+ if (list_empty(&root->root_list))
+ list_add_tail(&root->root_list, &root->fs_info->dead_roots);
+ spin_unlock(&root->fs_info->trans_lock);
+}
+
+/*
+ * update all the cowonly tree roots on disk
+ */
+static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *gang[8];
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int i;
+ int ret;
+ int err = 0;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ root = gang[i];
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ btrfs_free_log(trans, root);
+ btrfs_update_reloc_root(trans, root);
+ btrfs_orphan_commit_root(trans, root);
+
+ btrfs_save_ino_cache(root, trans);
+
+ /* see comments in should_cow_block() */
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
+
+ if (root->commit_root != root->node) {
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ btrfs_set_root_node(&root->root_item,
+ root->node);
+ }
+
+ err = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key,
+ &root->root_item);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ if (err)
+ break;
+ }
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ return err;
+}
+
+/*
+ * defrag a given btree.
+ * Every leaf in the btree is read and defragged.
+ */
+int btrfs_defrag_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
+ return 0;
+
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_defrag_leaves(trans, root);
+
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(info->tree_root);
+ cond_resched();
+
+ if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
+ break;
+
+ if (btrfs_defrag_cancelled(root->fs_info)) {
+ pr_debug("BTRFS: defrag_root cancelled\n");
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
+ return ret;
+}
+
+/*
+ * new snapshots need to be created at a very specific time in the
+ * transaction commit. This does the actual creation.
+ *
+ * Note:
+ * If the error which may affect the commitment of the current transaction
+ * happens, we should return the error number. If the error which just affect
+ * the creation of the pending snapshots, just return 0.
+ */
+static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_key key;
+ struct btrfs_root_item *new_root_item;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *parent_root;
+ struct btrfs_block_rsv *rsv;
+ struct inode *parent_inode;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *dir_item;
+ struct dentry *dentry;
+ struct extent_buffer *tmp;
+ struct extent_buffer *old;
+ struct timespec cur_time = CURRENT_TIME;
+ int ret = 0;
+ u64 to_reserve = 0;
+ u64 index = 0;
+ u64 objectid;
+ u64 root_flags;
+ uuid_le new_uuid;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ pending->error = -ENOMEM;
+ return 0;
+ }
+
+ new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
+ if (!new_root_item) {
+ pending->error = -ENOMEM;
+ goto root_item_alloc_fail;
+ }
+
+ pending->error = btrfs_find_free_objectid(tree_root, &objectid);
+ if (pending->error)
+ goto no_free_objectid;
+
+ btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+
+ if (to_reserve > 0) {
+ pending->error = btrfs_block_rsv_add(root,
+ &pending->block_rsv,
+ to_reserve,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (pending->error)
+ goto no_free_objectid;
+ }
+
+ key.objectid = objectid;
+ key.offset = (u64)-1;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+
+ rsv = trans->block_rsv;
+ trans->block_rsv = &pending->block_rsv;
+ trans->bytes_reserved = trans->block_rsv->reserved;
+
+ dentry = pending->dentry;
+ parent_inode = pending->dir;
+ parent_root = BTRFS_I(parent_inode)->root;
+ record_root_in_trans(trans, parent_root);
+
+ /*
+ * insert the directory item
+ */
+ ret = btrfs_set_inode_index(parent_inode, &index);
+ BUG_ON(ret); /* -ENOMEM */
+
+ /* check if there is a file/dir which has the same name. */
+ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
+ btrfs_ino(parent_inode),
+ dentry->d_name.name,
+ dentry->d_name.len, 0);
+ if (dir_item != NULL && !IS_ERR(dir_item)) {
+ pending->error = -EEXIST;
+ goto dir_item_existed;
+ } else if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * pull in the delayed directory update
+ * and the delayed inode item
+ * otherwise we corrupt the FS during
+ * snapshot
+ */
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret) { /* Transaction aborted */
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ record_root_in_trans(trans, root);
+ btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
+ memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+ btrfs_check_and_init_root_item(new_root_item);
+
+ root_flags = btrfs_root_flags(new_root_item);
+ if (pending->readonly)
+ root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
+ else
+ root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
+ btrfs_set_root_flags(new_root_item, root_flags);
+
+ btrfs_set_root_generation_v2(new_root_item,
+ trans->transid);
+ uuid_le_gen(&new_uuid);
+ memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ memcpy(new_root_item->parent_uuid, root->root_item.uuid,
+ BTRFS_UUID_SIZE);
+ if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
+ memset(new_root_item->received_uuid, 0,
+ sizeof(new_root_item->received_uuid));
+ memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
+ memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
+ btrfs_set_root_stransid(new_root_item, 0);
+ btrfs_set_root_rtransid(new_root_item, 0);
+ }
+ btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
+ btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
+ btrfs_set_root_otransid(new_root_item, trans->transid);
+
+ old = btrfs_lock_root_node(root);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ if (ret) {
+ btrfs_tree_unlock(old);
+ free_extent_buffer(old);
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ btrfs_set_lock_blocking(old);
+
+ ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
+ /* clean up in any case */
+ btrfs_tree_unlock(old);
+ free_extent_buffer(old);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /*
+ * We need to flush delayed refs in order to make sure all of our quota
+ * operations have been done before we call btrfs_qgroup_inherit.
+ */
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /* see comments in should_cow_block() */
+ set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_wmb();
+
+ btrfs_set_root_node(new_root_item, tmp);
+ /* record when the snapshot was created in key.offset */
+ key.offset = trans->transid;
+ ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
+ btrfs_tree_unlock(tmp);
+ free_extent_buffer(tmp);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /*
+ * insert root back/forward references
+ */
+ ret = btrfs_add_root_ref(trans, tree_root, objectid,
+ parent_root->root_key.objectid,
+ btrfs_ino(parent_inode), index,
+ dentry->d_name.name, dentry->d_name.len);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ key.offset = (u64)-1;
+ pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ if (IS_ERR(pending->snap)) {
+ ret = PTR_ERR(pending->snap);
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_reloc_post_snapshot(trans, pending);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_insert_dir_item(trans, parent_root,
+ dentry->d_name.name, dentry->d_name.len,
+ parent_inode, &key,
+ BTRFS_FT_DIR, index);
+ /* We have check then name at the beginning, so it is impossible. */
+ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ dentry->d_name.len * 2);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+ ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b,
+ BTRFS_UUID_KEY_SUBVOL, objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+ if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
+ ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+ new_root_item->received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ objectid);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+ }
+fail:
+ pending->error = ret;
+dir_item_existed:
+ trans->block_rsv = rsv;
+ trans->bytes_reserved = 0;
+no_free_objectid:
+ kfree(new_root_item);
+root_item_alloc_fail:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * create all the snapshots we've scheduled for creation
+ */
+static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_pending_snapshot *pending, *next;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+ int ret = 0;
+
+ list_for_each_entry_safe(pending, next, head, list) {
+ list_del(&pending->list);
+ ret = create_pending_snapshot(trans, fs_info, pending);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static void update_super_roots(struct btrfs_root *root)
+{
+ struct btrfs_root_item *root_item;
+ struct btrfs_super_block *super;
+
+ super = root->fs_info->super_copy;
+
+ root_item = &root->fs_info->chunk_root->root_item;
+ super->chunk_root = root_item->bytenr;
+ super->chunk_root_generation = root_item->generation;
+ super->chunk_root_level = root_item->level;
+
+ root_item = &root->fs_info->tree_root->root_item;
+ super->root = root_item->bytenr;
+ super->generation = root_item->generation;
+ super->root_level = root_item->level;
+ if (btrfs_test_opt(root, SPACE_CACHE))
+ super->cache_generation = root_item->generation;
+ if (root->fs_info->update_uuid_tree_gen)
+ super->uuid_tree_generation = root_item->generation;
+}
+
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
+{
+ struct btrfs_transaction *trans;
+ int ret = 0;
+
+ spin_lock(&info->trans_lock);
+ trans = info->running_transaction;
+ if (trans)
+ ret = (trans->state >= TRANS_STATE_COMMIT_START);
+ spin_unlock(&info->trans_lock);
+ return ret;
+}
+
+int btrfs_transaction_blocked(struct btrfs_fs_info *info)
+{
+ struct btrfs_transaction *trans;
+ int ret = 0;
+
+ spin_lock(&info->trans_lock);
+ trans = info->running_transaction;
+ if (trans)
+ ret = is_transaction_blocked(trans);
+ spin_unlock(&info->trans_lock);
+ return ret;
+}
+
+/*
+ * wait for the current transaction commit to start and block subsequent
+ * transaction joins
+ */
+static void wait_current_trans_commit_start(struct btrfs_root *root,
+ struct btrfs_transaction *trans)
+{
+ wait_event(root->fs_info->transaction_blocked_wait,
+ trans->state >= TRANS_STATE_COMMIT_START ||
+ trans->aborted);
+}
+
+/*
+ * wait for the current transaction to start and then become unblocked.
+ * caller holds ref.
+ */
+static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
+ struct btrfs_transaction *trans)
+{
+ wait_event(root->fs_info->transaction_wait,
+ trans->state >= TRANS_STATE_UNBLOCKED ||
+ trans->aborted);
+}
+
+/*
+ * commit transactions asynchronously. once btrfs_commit_transaction_async
+ * returns, any subsequent transaction will not be allowed to join.
+ */
+struct btrfs_async_commit {
+ struct btrfs_trans_handle *newtrans;
+ struct btrfs_root *root;
+ struct work_struct work;
+};
+
+static void do_async_commit(struct work_struct *work)
+{
+ struct btrfs_async_commit *ac =
+ container_of(work, struct btrfs_async_commit, work);
+
+ /*
+ * We've got freeze protection passed with the transaction.
+ * Tell lockdep about it.
+ */
+ if (ac->newtrans->type & __TRANS_FREEZABLE)
+ rwsem_acquire_read(
+ &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
+
+ current->journal_info = ac->newtrans;
+
+ btrfs_commit_transaction(ac->newtrans, ac->root);
+ kfree(ac);
+}
+
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int wait_for_unblock)
+{
+ struct btrfs_async_commit *ac;
+ struct btrfs_transaction *cur_trans;
+
+ ac = kmalloc(sizeof(*ac), GFP_NOFS);
+ if (!ac)
+ return -ENOMEM;
+
+ INIT_WORK(&ac->work, do_async_commit);
+ ac->root = root;
+ ac->newtrans = btrfs_join_transaction(root);
+ if (IS_ERR(ac->newtrans)) {
+ int err = PTR_ERR(ac->newtrans);
+ kfree(ac);
+ return err;
+ }
+
+ /* take transaction reference */
+ cur_trans = trans->transaction;
+ atomic_inc(&cur_trans->use_count);
+
+ btrfs_end_transaction(trans, root);
+
+ /*
+ * Tell lockdep we've released the freeze rwsem, since the
+ * async commit thread will be the one to unlock it.
+ */
+ if (ac->newtrans->type & __TRANS_FREEZABLE)
+ rwsem_release(
+ &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 1, _THIS_IP_);
+
+ schedule_work(&ac->work);
+
+ /* wait for transaction to start and unblock */
+ if (wait_for_unblock)
+ wait_current_trans_commit_start_and_unblock(root, cur_trans);
+ else
+ wait_current_trans_commit_start(root, cur_trans);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ btrfs_put_transaction(cur_trans);
+ return 0;
+}
+
+
+static void cleanup_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int err)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ DEFINE_WAIT(wait);
+
+ WARN_ON(trans->use_count > 1);
+
+ btrfs_abort_transaction(trans, root, err);
+
+ spin_lock(&root->fs_info->trans_lock);
+
+ /*
+ * If the transaction is removed from the list, it means this
+ * transaction has been committed successfully, so it is impossible
+ * to call the cleanup function.
+ */
+ BUG_ON(list_empty(&cur_trans->list));
+
+ list_del_init(&cur_trans->list);
+ if (cur_trans == root->fs_info->running_transaction) {
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&root->fs_info->trans_lock);
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+
+ spin_lock(&root->fs_info->trans_lock);
+ }
+ spin_unlock(&root->fs_info->trans_lock);
+
+ btrfs_cleanup_one_transaction(trans->transaction, root);
+
+ spin_lock(&root->fs_info->trans_lock);
+ if (cur_trans == root->fs_info->running_transaction)
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->trans_lock);
+
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(root->fs_info->sb);
+ btrfs_put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
+
+ trace_btrfs_transaction_commit(root);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+ btrfs_scrub_cancel(root->fs_info);
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+}
+
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ return btrfs_start_delalloc_roots(fs_info, 1, -1);
+ return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ btrfs_wait_ordered_roots(fs_info, -1);
+}
+
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&cur_trans->pending_ordered)) {
+ ordered = list_first_entry(&cur_trans->pending_ordered,
+ struct btrfs_ordered_extent,
+ trans_list);
+ list_del_init(&ordered->trans_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+ &ordered->flags));
+ btrfs_put_ordered_extent(ordered);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_transaction *prev_trans = NULL;
+ struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+ int ret;
+
+ /* Stop the commit early if ->aborted is set */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
+ /* make a pass through all the delayed refs we have so far
+ * any runnings procs may add more while we are here
+ */
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+ if (trans->qgroup_reserved) {
+ btrfs_qgroup_free(root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
+
+ cur_trans = trans->transaction;
+
+ /*
+ * set the flushing flag so procs in this transaction have to
+ * start sending their work down.
+ */
+ cur_trans->delayed_refs.flushing = 1;
+ smp_wmb();
+
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
+ if (!cur_trans->dirty_bg_run) {
+ int run_it = 0;
+
+ /* this mutex is also taken before trying to set
+ * block groups readonly. We need to make sure
+ * that nobody has set a block group readonly
+ * after a extents from that block group have been
+ * allocated for cache files. btrfs_set_block_group_ro
+ * will wait for the transaction to commit if it
+ * finds dirty_bg_run = 1
+ *
+ * The dirty_bg_run flag is also used to make sure only
+ * one process starts all the block group IO. It wouldn't
+ * hurt to have more than one go through, but there's no
+ * real advantage to it either.
+ */
+ mutex_lock(&root->fs_info->ro_block_group_mutex);
+ if (!cur_trans->dirty_bg_run) {
+ run_it = 1;
+ cur_trans->dirty_bg_run = 1;
+ }
+ mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+ if (run_it)
+ ret = btrfs_start_dirty_block_groups(trans, root);
+ }
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
+ spin_lock(&root->fs_info->trans_lock);
+ list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ spin_unlock(&root->fs_info->trans_lock);
+ atomic_inc(&cur_trans->use_count);
+ ret = btrfs_end_transaction(trans, root);
+
+ wait_for_commit(root, cur_trans);
+
+ if (unlikely(cur_trans->aborted))
+ ret = cur_trans->aborted;
+
+ btrfs_put_transaction(cur_trans);
+
+ return ret;
+ }
+
+ cur_trans->state = TRANS_STATE_COMMIT_START;
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
+ if (cur_trans->list.prev != &root->fs_info->trans_list) {
+ prev_trans = list_entry(cur_trans->list.prev,
+ struct btrfs_transaction, list);
+ if (prev_trans->state != TRANS_STATE_COMPLETED) {
+ atomic_inc(&prev_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ wait_for_commit(root, prev_trans);
+
+ btrfs_put_transaction(prev_trans);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
+ }
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
+ }
+
+ extwriter_counter_dec(cur_trans, trans->type);
+
+ ret = btrfs_start_delalloc_flush(root->fs_info);
+ if (ret)
+ goto cleanup_transaction;
+
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret)
+ goto cleanup_transaction;
+
+ wait_event(cur_trans->writer_wait,
+ extwriter_counter_read(cur_trans) == 0);
+
+ /* some pending stuffs might be added after the previous flush. */
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret)
+ goto cleanup_transaction;
+
+ btrfs_wait_delalloc_flush(root->fs_info);
+
+ btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
+ btrfs_scrub_pause(root);
+ /*
+ * Ok now we need to make sure to block out any other joins while we
+ * commit the transaction. We could have started a join before setting
+ * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
+ */
+ spin_lock(&root->fs_info->trans_lock);
+ cur_trans->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&root->fs_info->trans_lock);
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+
+ /* ->aborted might be set after the previous check, so check it */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ goto scrub_continue;
+ }
+ /*
+ * the reloc mutex makes sure that we stop
+ * the balancing code from coming in and moving
+ * extents around in the middle of the commit
+ */
+ mutex_lock(&root->fs_info->reloc_mutex);
+
+ /*
+ * We needn't worry about the delayed items because we will
+ * deal with them in create_pending_snapshot(), which is the
+ * core function of the snapshot creation.
+ */
+ ret = create_pending_snapshots(trans, root->fs_info);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * We insert the dir indexes of the snapshots and update the inode
+ * of the snapshots' parents after the snapshot creation, so there
+ * are some delayed items which are not dealt with. Now deal with
+ * them.
+ *
+ * We needn't worry that this operation will corrupt the snapshots,
+ * because all the tree which are snapshoted will be forced to COW
+ * the nodes and leaves.
+ */
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * make sure none of the code above managed to slip in a
+ * delayed item
+ */
+ btrfs_assert_delayed_root_empty(root);
+
+ WARN_ON(cur_trans != trans->transaction);
+
+ /* btrfs_commit_tree_roots is responsible for getting the
+ * various roots consistent with each other. Every pointer
+ * in the tree of tree roots has to point to the most up to date
+ * root for every subvolume and other tree. So, we have to keep
+ * the tree logging code from jumping in and changing any
+ * of the trees.
+ *
+ * At this point in the commit, there can't be any tree-log
+ * writers, but a little lower down we drop the trans mutex
+ * and let new people in. By holding the tree_log_mutex
+ * from now until after the super is written, we avoid races
+ * with the tree-log code.
+ */
+ mutex_lock(&root->fs_info->tree_log_mutex);
+
+ ret = commit_fs_roots(trans, root);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * Since the transaction is done, we can apply the pending changes
+ * before the next transaction.
+ */
+ btrfs_apply_pending_changes(root->fs_info);
+
+ /* commit_fs_roots gets rid of all the tree log roots, it is now
+ * safe to free the root of tree log roots
+ */
+ btrfs_free_log_root_tree(trans, root->fs_info);
+
+ ret = commit_cowonly_roots(trans, root);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * The tasks which save the space cache and inode cache may also
+ * update ->aborted, check it.
+ */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ btrfs_prepare_extent_commit(trans, root);
+
+ cur_trans = root->fs_info->running_transaction;
+
+ btrfs_set_root_node(&root->fs_info->tree_root->root_item,
+ root->fs_info->tree_root->node);
+ list_add_tail(&root->fs_info->tree_root->dirty_list,
+ &cur_trans->switch_commits);
+
+ btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
+ root->fs_info->chunk_root->node);
+ list_add_tail(&root->fs_info->chunk_root->dirty_list,
+ &cur_trans->switch_commits);
+
+ switch_commit_roots(cur_trans, root->fs_info);
+
+ assert_qgroups_uptodate(trans);
+ ASSERT(list_empty(&cur_trans->dirty_bgs));
+ ASSERT(list_empty(&cur_trans->io_bgs));
+ update_super_roots(root);
+
+ btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+ btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
+ memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
+ sizeof(*root->fs_info->super_copy));
+
+ btrfs_update_commit_device_size(root->fs_info);
+ btrfs_update_commit_device_bytes_used(root, cur_trans);
+
+ clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+ clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+
+ spin_lock(&root->fs_info->trans_lock);
+ cur_trans->state = TRANS_STATE_UNBLOCKED;
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->trans_lock);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+
+ wake_up(&root->fs_info->transaction_wait);
+
+ ret = btrfs_write_and_wait_transaction(trans, root);
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Error while writing out transaction");
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto scrub_continue;
+ }
+
+ ret = write_ctree_super(trans, root, 0);
+ if (ret) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ goto scrub_continue;
+ }
+
+ /*
+ * the super is written, we can safely allow the tree-loggers
+ * to go about their business
+ */
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+
+ btrfs_finish_extent_commit(trans, root);
+
+ if (cur_trans->have_free_bgs)
+ btrfs_clear_space_info_full(root->fs_info);
+
+ root->fs_info->last_trans_committed = cur_trans->transid;
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_COMPLETED;
+ wake_up(&cur_trans->commit_wait);
+
+ spin_lock(&root->fs_info->trans_lock);
+ list_del_init(&cur_trans->list);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ btrfs_put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
+
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(root->fs_info->sb);
+
+ trace_btrfs_transaction_commit(root);
+
+ btrfs_scrub_continue(root);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+ if (current != root->fs_info->transaction_kthread)
+ btrfs_run_delayed_iputs(root);
+
+ return ret;
+
+scrub_continue:
+ btrfs_scrub_continue(root);
+cleanup_transaction:
+ btrfs_trans_release_metadata(trans, root);
+ trans->block_rsv = NULL;
+ if (trans->qgroup_reserved) {
+ btrfs_qgroup_free(root, trans->qgroup_reserved);
+ trans->qgroup_reserved = 0;
+ }
+ btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+ cleanup_transaction(trans, root, ret);
+
+ return ret;
+}
+
+/*
+ * return < 0 if error
+ * 0 if there are no more dead_roots at the time of call
+ * 1 there are more to be processed, call me again
+ *
+ * The return value indicates there are certainly more snapshots to delete, but
+ * if there comes a new one during processing, it may return 0. We don't mind,
+ * because btrfs_commit_super will poke cleaner thread and it will process it a
+ * few seconds later.
+ */
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
+{
+ int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ spin_lock(&fs_info->trans_lock);
+ if (list_empty(&fs_info->dead_roots)) {
+ spin_unlock(&fs_info->trans_lock);
+ return 0;
+ }
+ root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ list_del_init(&root->root_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ pr_debug("BTRFS: cleaner removing %llu\n", root->objectid);
+
+ btrfs_kill_all_delayed_nodes(root);
+
+ if (btrfs_header_backref_rev(root->node) <
+ BTRFS_MIXED_BACKREF_REV)
+ ret = btrfs_drop_snapshot(root, NULL, 0, 0);
+ else
+ ret = btrfs_drop_snapshot(root, NULL, 1, 0);
+
+ return (ret < 0) ? 0 : 1;
+}
+
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+{
+ unsigned long prev;
+ unsigned long bit;
+
+ prev = xchg(&fs_info->pending_changes, 0);
+ if (!prev)
+ return;
+
+ bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
+ if (prev & bit)
+ btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+ prev &= ~bit;
+
+ bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
+ if (prev & bit)
+ btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+ prev &= ~bit;
+
+ bit = 1 << BTRFS_PENDING_COMMIT;
+ if (prev & bit)
+ btrfs_debug(fs_info, "pending commit done");
+ prev &= ~bit;
+
+ if (prev)
+ btrfs_warn(fs_info,
+ "unknown pending changes left 0x%lx, ignoring", prev);
+}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000..0b2475559
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_TRANSACTION__
+#define __BTRFS_TRANSACTION__
+#include "btrfs_inode.h"
+#include "delayed-ref.h"
+#include "ctree.h"
+
+enum btrfs_trans_state {
+ TRANS_STATE_RUNNING = 0,
+ TRANS_STATE_BLOCKED = 1,
+ TRANS_STATE_COMMIT_START = 2,
+ TRANS_STATE_COMMIT_DOING = 3,
+ TRANS_STATE_UNBLOCKED = 4,
+ TRANS_STATE_COMPLETED = 5,
+ TRANS_STATE_MAX = 6,
+};
+
+struct btrfs_transaction {
+ u64 transid;
+ /*
+ * total external writers(USERSPACE/START/ATTACH) in this
+ * transaction, it must be zero before the transaction is
+ * being committed
+ */
+ atomic_t num_extwriters;
+ /*
+ * total writers in this transaction, it must be zero before the
+ * transaction can end
+ */
+ atomic_t num_writers;
+ atomic_t use_count;
+
+ /*
+ * true if there is free bgs operations in this transaction
+ */
+ int have_free_bgs;
+
+ /* Be protected by fs_info->trans_lock when we want to change it. */
+ enum btrfs_trans_state state;
+ struct list_head list;
+ struct extent_io_tree dirty_pages;
+ unsigned long start_time;
+ wait_queue_head_t writer_wait;
+ wait_queue_head_t commit_wait;
+ struct list_head pending_snapshots;
+ struct list_head pending_chunks;
+ struct list_head pending_ordered;
+ struct list_head switch_commits;
+ struct list_head dirty_bgs;
+ struct list_head io_bgs;
+ u64 num_dirty_bgs;
+
+ /*
+ * we need to make sure block group deletion doesn't race with
+ * free space cache writeout. This mutex keeps them from stomping
+ * on each other
+ */
+ struct mutex cache_write_mutex;
+ spinlock_t dirty_bgs_lock;
+ struct btrfs_delayed_ref_root delayed_refs;
+ int aborted;
+ int dirty_bg_run;
+};
+
+#define __TRANS_FREEZABLE (1U << 0)
+
+#define __TRANS_USERSPACE (1U << 8)
+#define __TRANS_START (1U << 9)
+#define __TRANS_ATTACH (1U << 10)
+#define __TRANS_JOIN (1U << 11)
+#define __TRANS_JOIN_NOLOCK (1U << 12)
+#define __TRANS_DUMMY (1U << 13)
+
+#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
+#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH (__TRANS_ATTACH)
+#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
+
+#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
+ __TRANS_ATTACH)
+
+#define BTRFS_SEND_TRANS_STUB ((void *)1)
+
+struct btrfs_trans_handle {
+ u64 transid;
+ u64 bytes_reserved;
+ u64 qgroup_reserved;
+ unsigned long use_count;
+ unsigned long blocks_reserved;
+ unsigned long blocks_used;
+ unsigned long delayed_ref_updates;
+ struct btrfs_transaction *transaction;
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *orig_rsv;
+ short aborted;
+ short adding_csums;
+ bool allocating_chunk;
+ bool reloc_reserved;
+ bool sync;
+ unsigned int type;
+ /*
+ * this root is only needed to validate that the root passed to
+ * start_transaction is the same as the one passed to end_transaction.
+ * Subvolume quota depends on this
+ */
+ struct btrfs_root *root;
+ struct seq_list delayed_ref_elem;
+ struct list_head ordered;
+ struct list_head qgroup_ref_list;
+ struct list_head new_bgs;
+};
+
+struct btrfs_pending_snapshot {
+ struct dentry *dentry;
+ struct inode *dir;
+ struct btrfs_root *root;
+ struct btrfs_root *snap;
+ struct btrfs_qgroup_inherit *inherit;
+ /* block reservation for the operation */
+ struct btrfs_block_rsv block_rsv;
+ u64 qgroup_reserved;
+ /* extra metadata reseration for relocation */
+ int error;
+ bool readonly;
+ struct list_head list;
+};
+
+static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->last_trans = trans->transaction->transid;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+ int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
+ struct btrfs_root *root, int num_items);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
+ struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
+
+void btrfs_add_dead_root(struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int wait_for_unblock);
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+void btrfs_throttle(struct btrfs_root *root);
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_write_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark);
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages, int mark);
+int btrfs_transaction_blocked(struct btrfs_fs_info *info);
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void btrfs_put_transaction(struct btrfs_transaction *transaction);
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000..a63719cc9
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "locking.h"
+
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
+ * better reflect disk order
+ */
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ int ret = 0;
+ int wret;
+ int level;
+ int next_key_ret = 0;
+ u64 last_ret = 0;
+ u64 min_trans = 0;
+
+ if (root->fs_info->extent_root == root) {
+ /*
+ * there's recursion here right now in the tree locking,
+ * we can't defrag the extent root without deadlock
+ */
+ goto out;
+ }
+
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ goto out;
+
+ if (btrfs_test_opt(root, SSD))
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ level = btrfs_header_level(root->node);
+
+ if (level == 0)
+ goto out;
+
+ if (root->defrag_progress.objectid == 0) {
+ struct extent_buffer *root_node;
+ u32 nritems;
+
+ root_node = btrfs_lock_root_node(root);
+ btrfs_set_lock_blocking(root_node);
+ nritems = btrfs_header_nritems(root_node);
+ root->defrag_max.objectid = 0;
+ /* from above we know this is not a leaf */
+ btrfs_node_key_to_cpu(root_node, &root->defrag_max,
+ nritems - 1);
+ btrfs_tree_unlock(root_node);
+ free_extent_buffer(root_node);
+ memset(&key, 0, sizeof(key));
+ } else {
+ memcpy(&key, &root->defrag_progress, sizeof(key));
+ }
+
+ path->keep_locks = 1;
+
+ ret = btrfs_search_forward(root, &key, path, min_trans);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(path);
+ wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ if (!path->nodes[1]) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+ min_trans);
+ ret = btrfs_realloc_node(trans, root,
+ path->nodes[1], 0,
+ &last_ret,
+ &root->defrag_progress);
+ if (ret) {
+ WARN_ON(ret == -EAGAIN);
+ goto out;
+ }
+ if (next_key_ret == 0) {
+ memcpy(&root->defrag_progress, &key, sizeof(key));
+ ret = -EAGAIN;
+ }
+out:
+ if (path)
+ btrfs_free_path(path);
+ if (ret == -EAGAIN) {
+ if (root->defrag_max.objectid > root->defrag_progress.objectid)
+ goto done;
+ if (root->defrag_max.type > root->defrag_progress.type)
+ goto done;
+ if (root->defrag_max.offset > root->defrag_progress.offset)
+ goto done;
+ ret = 0;
+ }
+done:
+ if (ret != -EAGAIN) {
+ memset(&root->defrag_progress, 0,
+ sizeof(root->defrag_progress));
+ root->defrag_trans_start = trans->transid;
+ }
+ return ret;
+}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000..4920fceff
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,5093 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/list_sort.h>
+#include "tree-log.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "print-tree.h"
+#include "backref.h"
+#include "hash.h"
+
+/* magic values for the inode_only field in btrfs_log_inode:
+ *
+ * LOG_INODE_ALL means to log everything
+ * LOG_INODE_EXISTS means to log just enough to recreate the inode
+ * during log replay
+ */
+#define LOG_INODE_ALL 0
+#define LOG_INODE_EXISTS 1
+
+/*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2). After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ * 2a is actually the more important variant. With the extra logging
+ * a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir. After a crash the rm -rf must
+ * be replayed. This must be able to recurse down the entire
+ * directory tree. The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
+/*
+ * stages for the tree walking. The first
+ * stage (0) is to only pin down the blocks we find
+ * the second stage (1) is to make sure that all the inodes
+ * we find in the log are created in the subvolume.
+ *
+ * The last stage is to deal with directories and links and extents
+ * and all the other fun semantics
+ */
+#define LOG_WALK_PIN_ONLY 0
+#define LOG_WALK_REPLAY_INODES 1
+#define LOG_WALK_REPLAY_DIR_INDEX 2
+#define LOG_WALK_REPLAY_ALL 3
+
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx);
+static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid, int del_all);
+
+/*
+ * tree logging is a special write ahead log used to make sure that
+ * fsyncs and O_SYNCs can happen without doing full tree commits.
+ *
+ * Full tree commits are expensive because they require commonly
+ * modified blocks to be recowed, creating many dirty pages in the
+ * extent tree an 4x-6x higher write load than ext3.
+ *
+ * Instead of doing a tree commit on every fsync, we use the
+ * key ranges and transaction ids to find items for a given file or directory
+ * that have changed in this transaction. Those items are copied into
+ * a special tree (one per subvolume root), that tree is written to disk
+ * and then the fsync is considered complete.
+ *
+ * After a crash, items are copied out of the log-tree back into the
+ * subvolume tree. Any file data extents found are recorded in the extent
+ * allocation tree, and the log-tree freed.
+ *
+ * The log tree is read three times, once to pin down all the extents it is
+ * using in ram and once, once to create all the inodes logged in the tree
+ * and once to do all the other items.
+ */
+
+/*
+ * start a sub transaction and setup the log tree
+ * this increments the log tree writer count to make the people
+ * syncing the tree wait for us to finish
+ */
+static int start_log_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ int index;
+ int ret;
+
+ mutex_lock(&root->log_mutex);
+ if (root->log_root) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ if (!root->log_start_pid) {
+ root->log_start_pid = current->pid;
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ } else if (root->log_start_pid != current->pid) {
+ set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ }
+
+ atomic_inc(&root->log_batch);
+ atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
+ mutex_unlock(&root->log_mutex);
+ return 0;
+ }
+
+ ret = 0;
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ if (!root->fs_info->log_root_tree)
+ ret = btrfs_init_log_root_tree(trans, root->fs_info);
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ if (ret)
+ goto out;
+
+ if (!root->log_root) {
+ ret = btrfs_add_log_tree(trans, root);
+ if (ret)
+ goto out;
+ }
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
+ atomic_inc(&root->log_batch);
+ atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
+out:
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
+ * returns 0 if there was a log transaction running and we were able
+ * to join, or returns -ENOENT if there were not transactions
+ * in progress
+ */
+static int join_running_log_trans(struct btrfs_root *root)
+{
+ int ret = -ENOENT;
+
+ smp_mb();
+ if (!root->log_root)
+ return -ENOENT;
+
+ mutex_lock(&root->log_mutex);
+ if (root->log_root) {
+ ret = 0;
+ atomic_inc(&root->log_writers);
+ }
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+ int ret = -ENOENT;
+
+ mutex_lock(&root->log_mutex);
+ atomic_inc(&root->log_writers);
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
+ * indicate we're done making changes to the log tree
+ * and wake up anyone waiting to do a sync
+ */
+void btrfs_end_log_trans(struct btrfs_root *root)
+{
+ if (atomic_dec_and_test(&root->log_writers)) {
+ smp_mb();
+ if (waitqueue_active(&root->log_writer_wait))
+ wake_up(&root->log_writer_wait);
+ }
+}
+
+
+/*
+ * the walk control struct is used to pass state down the chain when
+ * processing the log tree. The stage field tells us which part
+ * of the log tree processing we are currently doing. The others
+ * are state fields used for that specific part
+ */
+struct walk_control {
+ /* should we free the extent on disk when done? This is used
+ * at transaction commit time while freeing a log tree
+ */
+ int free;
+
+ /* should we write out the extent buffer? This is used
+ * while flushing the log tree to disk during a sync
+ */
+ int write;
+
+ /* should we wait for the extent buffer io to finish? Also used
+ * while flushing the log tree to disk for a sync
+ */
+ int wait;
+
+ /* pin only walk, we record which extents on disk belong to the
+ * log trees
+ */
+ int pin;
+
+ /* what stage of the replay code we're currently in */
+ int stage;
+
+ /* the root we are currently replaying */
+ struct btrfs_root *replay_dest;
+
+ /* the trans handle for the current replay */
+ struct btrfs_trans_handle *trans;
+
+ /* the function that gets used to process blocks we find in the
+ * tree. Note the extent_buffer might not be up to date when it is
+ * passed in, and it must be checked or read if you need the data
+ * inside it
+ */
+ int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen);
+};
+
+/*
+ * process_func used to pin down extents, write them or wait on them
+ */
+static int process_one_buffer(struct btrfs_root *log,
+ struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen)
+{
+ int ret = 0;
+
+ /*
+ * If this fs is mixed then we need to be able to process the leaves to
+ * pin down any logged extents, so we have to read the block.
+ */
+ if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+ ret = btrfs_read_buffer(eb, gen);
+ if (ret)
+ return ret;
+ }
+
+ if (wc->pin)
+ ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
+ eb->start, eb->len);
+
+ if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
+ if (wc->pin && btrfs_header_level(eb) == 0)
+ ret = btrfs_exclude_logged_extents(log, eb);
+ if (wc->write)
+ btrfs_write_tree_block(eb);
+ if (wc->wait)
+ btrfs_wait_tree_block_writeback(eb);
+ }
+ return ret;
+}
+
+/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static noinline int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ u32 item_size;
+ u64 saved_i_size = 0;
+ int save_old_i_size = 0;
+ unsigned long src_ptr;
+ unsigned long dst_ptr;
+ int overwrite_root = 0;
+ bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ overwrite_root = 1;
+
+ item_size = btrfs_item_size_nr(eb, slot);
+ src_ptr = btrfs_item_ptr_offset(eb, slot);
+
+ /* look for the key in the destination tree */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0) {
+ char *src_copy;
+ char *dst_copy;
+ u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ if (dst_size != item_size)
+ goto insert;
+
+ if (item_size == 0) {
+ btrfs_release_path(path);
+ return 0;
+ }
+ dst_copy = kmalloc(item_size, GFP_NOFS);
+ src_copy = kmalloc(item_size, GFP_NOFS);
+ if (!dst_copy || !src_copy) {
+ btrfs_release_path(path);
+ kfree(dst_copy);
+ kfree(src_copy);
+ return -ENOMEM;
+ }
+
+ read_extent_buffer(eb, src_copy, src_ptr, item_size);
+
+ dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
+ item_size);
+ ret = memcmp(dst_copy, src_copy, item_size);
+
+ kfree(dst_copy);
+ kfree(src_copy);
+ /*
+ * they have the same contents, just return, this saves
+ * us from cowing blocks in the destination tree and doing
+ * extra writes that may not have been done by a previous
+ * sync
+ */
+ if (ret == 0) {
+ btrfs_release_path(path);
+ return 0;
+ }
+
+ /*
+ * We need to load the old nbytes into the inode so when we
+ * replay the extents we've logged we get the right nbytes.
+ */
+ if (inode_item) {
+ struct btrfs_inode_item *item;
+ u64 nbytes;
+ u32 mode;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ nbytes = btrfs_inode_nbytes(path->nodes[0], item);
+ item = btrfs_item_ptr(eb, slot,
+ struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(eb, item, nbytes);
+
+ /*
+ * If this is a directory we need to reset the i_size to
+ * 0 so that we can set it up properly when replaying
+ * the rest of the items in this log.
+ */
+ mode = btrfs_inode_mode(eb, item);
+ if (S_ISDIR(mode))
+ btrfs_set_inode_size(eb, item, 0);
+ }
+ } else if (inode_item) {
+ struct btrfs_inode_item *item;
+ u32 mode;
+
+ /*
+ * New inode, set nbytes to 0 so that the nbytes comes out
+ * properly when we replay the extents.
+ */
+ item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(eb, item, 0);
+
+ /*
+ * If this is a directory we need to reset the i_size to 0 so
+ * that we can set it up properly when replaying the rest of
+ * the items in this log.
+ */
+ mode = btrfs_inode_mode(eb, item);
+ if (S_ISDIR(mode))
+ btrfs_set_inode_size(eb, item, 0);
+ }
+insert:
+ btrfs_release_path(path);
+ /* try to insert the key into the destination tree */
+ path->skip_release_on_error = 1;
+ ret = btrfs_insert_empty_item(trans, root, path,
+ key, item_size);
+ path->skip_release_on_error = 0;
+
+ /* make sure any existing item is the correct size */
+ if (ret == -EEXIST || ret == -EOVERFLOW) {
+ u32 found_size;
+ found_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ if (found_size > item_size)
+ btrfs_truncate_item(root, path, item_size, 1);
+ else if (found_size < item_size)
+ btrfs_extend_item(root, path,
+ item_size - found_size);
+ } else if (ret) {
+ return ret;
+ }
+ dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+
+ /* don't overwrite an existing inode if the generation number
+ * was logged as zero. This is done when the tree logging code
+ * is just logging an inode to make sure it exists after recovery.
+ *
+ * Also, don't overwrite i_size on directories during replay.
+ * log replay inserts and removes directory items based on the
+ * state of the tree found in the subvolume, and i_size is modified
+ * as it goes
+ */
+ if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ struct btrfs_inode_item *src_item;
+ struct btrfs_inode_item *dst_item;
+
+ src_item = (struct btrfs_inode_item *)src_ptr;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+
+ if (btrfs_inode_generation(eb, src_item) == 0) {
+ struct extent_buffer *dst_eb = path->nodes[0];
+ const u64 ino_size = btrfs_inode_size(eb, src_item);
+
+ /*
+ * For regular files an ino_size == 0 is used only when
+ * logging that an inode exists, as part of a directory
+ * fsync, and the inode wasn't fsynced before. In this
+ * case don't set the size of the inode in the fs/subvol
+ * tree, otherwise we would be throwing valid data away.
+ */
+ if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+ ino_size != 0) {
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+ btrfs_set_token_inode_size(dst_eb, dst_item,
+ ino_size, &token);
+ }
+ goto no_copy;
+ }
+
+ if (overwrite_root &&
+ S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+ S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ save_old_i_size = 1;
+ saved_i_size = btrfs_inode_size(path->nodes[0],
+ dst_item);
+ }
+ }
+
+ copy_extent_buffer(path->nodes[0], eb, dst_ptr,
+ src_ptr, item_size);
+
+ if (save_old_i_size) {
+ struct btrfs_inode_item *dst_item;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+ btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ }
+
+ /* make sure the generation is filled in */
+ if (key->type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *dst_item;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+ if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
+ btrfs_set_inode_generation(path->nodes[0], dst_item,
+ trans->transid);
+ }
+ }
+no_copy:
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(path);
+ return 0;
+}
+
+/*
+ * simple helper to read an inode off the disk from a given root
+ * This can only be called for subvolume roots and not for the log
+ */
+static noinline struct inode *read_one_inode(struct btrfs_root *root,
+ u64 objectid)
+{
+ struct btrfs_key key;
+ struct inode *inode;
+
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ if (IS_ERR(inode)) {
+ inode = NULL;
+ } else if (is_bad_inode(inode)) {
+ iput(inode);
+ inode = NULL;
+ }
+ return inode;
+}
+
+/* replays a single extent in 'eb' at 'slot' with 'key' into the
+ * subvolume 'root'. path is released on entry and should be released
+ * on exit.
+ *
+ * extents in the log tree have not been allocated out of the extent
+ * tree yet. So, this completes the allocation, taking a reference
+ * as required if the extent already exists or creating a new extent
+ * if it isn't in the extent allocation tree yet.
+ *
+ * The extent is inserted into the file, dropping any existing extents
+ * from the file that overlap the new one.
+ */
+static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int found_type;
+ u64 extent_end;
+ u64 start = key->offset;
+ u64 nbytes = 0;
+ struct btrfs_file_extent_item *item;
+ struct inode *inode = NULL;
+ unsigned long size;
+ int ret = 0;
+
+ item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ nbytes = btrfs_file_extent_num_bytes(eb, item);
+ extent_end = start + nbytes;
+
+ /*
+ * We don't add to the inodes nbytes if we are prealloc or a
+ * hole.
+ */
+ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+ nbytes = 0;
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ size = btrfs_file_extent_inline_len(eb, slot, item);
+ nbytes = btrfs_file_extent_ram_bytes(eb, item);
+ extent_end = ALIGN(start + size, root->sectorsize);
+ } else {
+ ret = 0;
+ goto out;
+ }
+
+ inode = read_one_inode(root, key->objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * first check to see if we already have this extent in the
+ * file. This must be done before the btrfs_drop_extents run
+ * so we don't try to drop this extent.
+ */
+ ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
+ start, 0);
+
+ if (ret == 0 &&
+ (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
+ struct btrfs_file_extent_item cmp1;
+ struct btrfs_file_extent_item cmp2;
+ struct btrfs_file_extent_item *existing;
+ struct extent_buffer *leaf;
+
+ leaf = path->nodes[0];
+ existing = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ read_extent_buffer(eb, &cmp1, (unsigned long)item,
+ sizeof(cmp1));
+ read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
+ sizeof(cmp2));
+
+ /*
+ * we already have a pointer to this exact extent,
+ * we don't have to do anything
+ */
+ if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+ btrfs_release_path(path);
+ goto out;
+ }
+ }
+ btrfs_release_path(path);
+
+ /* drop any overlapping extents */
+ ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
+ if (ret)
+ goto out;
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 offset;
+ unsigned long dest_offset;
+ struct btrfs_key ins;
+
+ ret = btrfs_insert_empty_item(trans, root, path, key,
+ sizeof(*item));
+ if (ret)
+ goto out;
+ dest_offset = btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ copy_extent_buffer(path->nodes[0], eb, dest_offset,
+ (unsigned long)item, sizeof(*item));
+
+ ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ offset = key->offset - btrfs_file_extent_offset(eb, item);
+
+ if (ins.objectid > 0) {
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+ /*
+ * is this extent already allocated in the extent
+ * allocation tree? If so, just add a reference
+ */
+ ret = btrfs_lookup_data_extent(root, ins.objectid,
+ ins.offset);
+ if (ret == 0) {
+ ret = btrfs_inc_extent_ref(trans, root,
+ ins.objectid, ins.offset,
+ 0, root->root_key.objectid,
+ key->objectid, offset, 0);
+ if (ret)
+ goto out;
+ } else {
+ /*
+ * insert the extent pointer in the extent
+ * allocation tree
+ */
+ ret = btrfs_alloc_logged_file_extent(trans,
+ root, root->root_key.objectid,
+ key->objectid, offset, &ins);
+ if (ret)
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ if (btrfs_file_extent_compression(eb, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid +
+ btrfs_file_extent_offset(eb, item);
+ csum_end = csum_start +
+ btrfs_file_extent_num_bytes(eb, item);
+ }
+
+ ret = btrfs_lookup_csums_range(root->log_root,
+ csum_start, csum_end - 1,
+ &ordered_sums, 0);
+ if (ret)
+ goto out;
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans,
+ root->fs_info->csum_root,
+ sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ goto out;
+ } else {
+ btrfs_release_path(path);
+ }
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(trans, root, path, eb, slot, key);
+ if (ret)
+ goto out;
+ }
+
+ inode_add_bytes(inode, nbytes);
+ ret = btrfs_update_inode(trans, root, inode);
+out:
+ if (inode)
+ iput(inode);
+ return ret;
+}
+
+/*
+ * when cleaning up conflicts between the directory names in the
+ * subvolume, directory names in the log and directory names in the
+ * inode back references, we may have to unlink inodes from directories.
+ *
+ * This is a helper function to do the unlink of a specific directory
+ * item
+ */
+static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct inode *dir,
+ struct btrfs_dir_item *di)
+{
+ struct inode *inode;
+ char *name;
+ int name_len;
+ struct extent_buffer *leaf;
+ struct btrfs_key location;
+ int ret;
+
+ leaf = path->nodes[0];
+
+ btrfs_dir_item_key_to_cpu(leaf, di, &location);
+ name_len = btrfs_dir_name_len(leaf, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name)
+ return -ENOMEM;
+
+ read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
+ btrfs_release_path(path);
+
+ inode = read_one_inode(root, location.objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ if (ret)
+ goto out;
+
+ ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ if (ret)
+ goto out;
+ else
+ ret = btrfs_run_delayed_items(trans, root);
+out:
+ kfree(name);
+ iput(inode);
+ return ret;
+}
+
+/*
+ * helper function to see if a given name and sequence number found
+ * in an inode back reference are already in a directory and correctly
+ * point to this inode
+ */
+static noinline int inode_in_dir(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, u64 objectid, u64 index,
+ const char *name, int name_len)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_key location;
+ int match = 0;
+
+ di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
+ index, name, name_len, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid != objectid)
+ goto out;
+ } else
+ goto out;
+ btrfs_release_path(path);
+
+ di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid != objectid)
+ goto out;
+ } else
+ goto out;
+ match = 1;
+out:
+ btrfs_release_path(path);
+ return match;
+}
+
+/*
+ * helper function to check a log tree for a named back reference in
+ * an inode. This is used to decide if a back reference that is
+ * found in the subvolume conflicts with what we find in the log.
+ *
+ * inode backreferences may have multiple refs in a single item,
+ * during replay we process one reference at a time, and we don't
+ * want to delete valid links to a file from the subvolume if that
+ * link is also in the log.
+ */
+static noinline int backref_in_log(struct btrfs_root *log,
+ struct btrfs_key *key,
+ u64 ref_objectid,
+ const char *name, int namelen)
+{
+ struct btrfs_path *path;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ unsigned long name_ptr;
+ int found_name_len;
+ int item_size;
+ int ret;
+ int match = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
+ if (ret != 0)
+ goto out;
+
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+ name, namelen, NULL))
+ match = 1;
+
+ goto out;
+ }
+
+ item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ ptr_end = ptr + item_size;
+ while (ptr < ptr_end) {
+ ref = (struct btrfs_inode_ref *)ptr;
+ found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
+ if (found_name_len == namelen) {
+ name_ptr = (unsigned long)(ref + 1);
+ ret = memcmp_extent_buffer(path->nodes[0], name,
+ name_ptr, namelen);
+ if (ret == 0) {
+ match = 1;
+ goto out;
+ }
+ }
+ ptr = (unsigned long)(ref + 1) + found_name_len;
+ }
+out:
+ btrfs_free_path(path);
+ return match;
+}
+
+static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_root *log_root,
+ struct inode *dir, struct inode *inode,
+ struct extent_buffer *eb,
+ u64 inode_objectid, u64 parent_objectid,
+ u64 ref_index, char *name, int namelen,
+ int *search_done)
+{
+ int ret;
+ char *victim_name;
+ int victim_name_len;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key search_key;
+ struct btrfs_inode_extref *extref;
+
+again:
+ /* Search old style refs */
+ search_key.objectid = inode_objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = parent_objectid;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret == 0) {
+ struct btrfs_inode_ref *victim_ref;
+ unsigned long ptr;
+ unsigned long ptr_end;
+
+ leaf = path->nodes[0];
+
+ /* are we trying to overwrite a back ref for the root directory
+ * if so, just jump out, we're done
+ */
+ if (search_key.objectid == search_key.offset)
+ return 1;
+
+ /* check all the names in this back reference to see
+ * if they are in the log. if so, we allow them to stay
+ * otherwise they must be unlinked as a conflict
+ */
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+ while (ptr < ptr_end) {
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ victim_name_len = btrfs_inode_ref_name_len(leaf,
+ victim_ref);
+ victim_name = kmalloc(victim_name_len, GFP_NOFS);
+ if (!victim_name)
+ return -ENOMEM;
+
+ read_extent_buffer(leaf, victim_name,
+ (unsigned long)(victim_ref + 1),
+ victim_name_len);
+
+ if (!backref_in_log(log_root, &search_key,
+ parent_objectid,
+ victim_name,
+ victim_name_len)) {
+ inc_nlink(inode);
+ btrfs_release_path(path);
+
+ ret = btrfs_unlink_inode(trans, root, dir,
+ inode, victim_name,
+ victim_name_len);
+ kfree(victim_name);
+ if (ret)
+ return ret;
+ ret = btrfs_run_delayed_items(trans, root);
+ if (ret)
+ return ret;
+ *search_done = 1;
+ goto again;
+ }
+ kfree(victim_name);
+
+ ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+ }
+
+ /*
+ * NOTE: we have searched root tree and checked the
+ * coresponding ref, it does not need to check again.
+ */
+ *search_done = 1;
+ }
+ btrfs_release_path(path);
+
+ /* Same search but for extended refs */
+ extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+ inode_objectid, parent_objectid, 0,
+ 0);
+ if (!IS_ERR_OR_NULL(extref)) {
+ u32 item_size;
+ u32 cur_offset = 0;
+ unsigned long base;
+ struct inode *victim_parent;
+
+ leaf = path->nodes[0];
+
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
+
+ victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+ goto next;
+
+ victim_name = kmalloc(victim_name_len, GFP_NOFS);
+ if (!victim_name)
+ return -ENOMEM;
+ read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
+ victim_name_len);
+
+ search_key.objectid = inode_objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(parent_objectid,
+ victim_name,
+ victim_name_len);
+ ret = 0;
+ if (!backref_in_log(log_root, &search_key,
+ parent_objectid, victim_name,
+ victim_name_len)) {
+ ret = -ENOENT;
+ victim_parent = read_one_inode(root,
+ parent_objectid);
+ if (victim_parent) {
+ inc_nlink(inode);
+ btrfs_release_path(path);
+
+ ret = btrfs_unlink_inode(trans, root,
+ victim_parent,
+ inode,
+ victim_name,
+ victim_name_len);
+ if (!ret)
+ ret = btrfs_run_delayed_items(
+ trans, root);
+ }
+ iput(victim_parent);
+ kfree(victim_name);
+ if (ret)
+ return ret;
+ *search_done = 1;
+ goto again;
+ }
+ kfree(victim_name);
+ if (ret)
+ return ret;
+next:
+ cur_offset += victim_name_len + sizeof(*extref);
+ }
+ *search_done = 1;
+ }
+ btrfs_release_path(path);
+
+ /* look for a conflicting sequence number */
+ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+ ref_index, name, namelen, 0);
+ if (di && !IS_ERR(di)) {
+ ret = drop_one_dir_item(trans, root, path, dir, di);
+ if (ret)
+ return ret;
+ }
+ btrfs_release_path(path);
+
+ /* look for a conflicing name */
+ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
+ name, namelen, 0);
+ if (di && !IS_ERR(di)) {
+ ret = drop_one_dir_item(trans, root, path, dir, di);
+ if (ret)
+ return ret;
+ }
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index,
+ u64 *parent_objectid)
+{
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)ref_ptr;
+
+ *namelen = btrfs_inode_extref_name_len(eb, extref);
+ *name = kmalloc(*namelen, GFP_NOFS);
+ if (*name == NULL)
+ return -ENOMEM;
+
+ read_extent_buffer(eb, *name, (unsigned long)&extref->name,
+ *namelen);
+
+ *index = btrfs_inode_extref_index(eb, extref);
+ if (parent_objectid)
+ *parent_objectid = btrfs_inode_extref_parent(eb, extref);
+
+ return 0;
+}
+
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+ u32 *namelen, char **name, u64 *index)
+{
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ref_ptr;
+
+ *namelen = btrfs_inode_ref_name_len(eb, ref);
+ *name = kmalloc(*namelen, GFP_NOFS);
+ if (*name == NULL)
+ return -ENOMEM;
+
+ read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+
+ *index = btrfs_inode_ref_index(eb, ref);
+
+ return 0;
+}
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function. (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ struct inode *dir = NULL;
+ struct inode *inode = NULL;
+ unsigned long ref_ptr;
+ unsigned long ref_end;
+ char *name = NULL;
+ int namelen;
+ int ret;
+ int search_done = 0;
+ int log_ref_ver = 0;
+ u64 parent_objectid;
+ u64 inode_objectid;
+ u64 ref_index = 0;
+ int ref_struct_size;
+
+ ref_ptr = btrfs_item_ptr_offset(eb, slot);
+ ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+ if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *r;
+
+ ref_struct_size = sizeof(struct btrfs_inode_extref);
+ log_ref_ver = 1;
+ r = (struct btrfs_inode_extref *)ref_ptr;
+ parent_objectid = btrfs_inode_extref_parent(eb, r);
+ } else {
+ ref_struct_size = sizeof(struct btrfs_inode_ref);
+ parent_objectid = key->offset;
+ }
+ inode_objectid = key->objectid;
+
+ /*
+ * it is possible that we didn't log all the parent directories
+ * for a given inode. If we don't find the dir, just don't
+ * copy the back ref in. The link count fixup code will take
+ * care of the rest
+ */
+ dir = read_one_inode(root, parent_objectid);
+ if (!dir) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ inode = read_one_inode(root, inode_objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ while (ref_ptr < ref_end) {
+ if (log_ref_ver) {
+ ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index, &parent_objectid);
+ /*
+ * parent object can change from one array
+ * item to another.
+ */
+ if (!dir)
+ dir = read_one_inode(root, parent_objectid);
+ if (!dir) {
+ ret = -ENOENT;
+ goto out;
+ }
+ } else {
+ ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+ &ref_index);
+ }
+ if (ret)
+ goto out;
+
+ /* if we already have a perfect match, we're done */
+ if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+ ref_index, name, namelen)) {
+ /*
+ * look for a conflicting back reference in the
+ * metadata. if we find one we have to unlink that name
+ * of the file before we add our new link. Later on, we
+ * overwrite any existing back reference, and we don't
+ * want to create dangling pointers in the directory.
+ */
+
+ if (!search_done) {
+ ret = __add_inode_ref(trans, root, path, log,
+ dir, inode, eb,
+ inode_objectid,
+ parent_objectid,
+ ref_index, name, namelen,
+ &search_done);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto out;
+ }
+ }
+
+ /* insert our name */
+ ret = btrfs_add_link(trans, dir, inode, name, namelen,
+ 0, ref_index);
+ if (ret)
+ goto out;
+
+ btrfs_update_inode(trans, root, inode);
+ }
+
+ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
+ kfree(name);
+ name = NULL;
+ if (log_ref_ver) {
+ iput(dir);
+ dir = NULL;
+ }
+ }
+
+ /* finally write the back reference in the inode */
+ ret = overwrite_item(trans, root, path, eb, slot, key);
+out:
+ btrfs_release_path(path);
+ kfree(name);
+ iput(dir);
+ iput(inode);
+ return ret;
+}
+
+static int insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 ino)
+{
+ int ret;
+
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
+
+ return ret;
+}
+
+static int count_inode_extrefs(struct btrfs_root *root,
+ struct inode *inode, struct btrfs_path *path)
+{
+ int ret = 0;
+ int name_len;
+ unsigned int nlink = 0;
+ u32 item_size;
+ u32 cur_offset = 0;
+ u64 inode_objectid = btrfs_ino(inode);
+ u64 offset = 0;
+ unsigned long ptr;
+ struct btrfs_inode_extref *extref;
+ struct extent_buffer *leaf;
+
+ while (1) {
+ ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
+ &extref, &offset);
+ if (ret)
+ break;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+ name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+ nlink++;
+
+ cur_offset += name_len + sizeof(*extref);
+ }
+
+ offset++;
+ btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
+
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+ return nlink;
+}
+
+static int count_inode_refs(struct btrfs_root *root,
+ struct inode *inode, struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ unsigned int nlink = 0;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ int name_len;
+ u64 ino = btrfs_ino(inode);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+process_slot:
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid != ino ||
+ key.type != BTRFS_INODE_REF_KEY)
+ break;
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ while (ptr < ptr_end) {
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ptr;
+ name_len = btrfs_inode_ref_name_len(path->nodes[0],
+ ref);
+ ptr = (unsigned long)(ref + 1) + name_len;
+ nlink++;
+ }
+
+ if (key.offset == 0)
+ break;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ goto process_slot;
+ }
+ key.offset--;
+ btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
+
+ return nlink;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay. So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found. If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ struct btrfs_path *path;
+ int ret;
+ u64 nlink = 0;
+ u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = count_inode_refs(root, inode, path);
+ if (ret < 0)
+ goto out;
+
+ nlink = ret;
+
+ ret = count_inode_extrefs(root, inode, path);
+ if (ret < 0)
+ goto out;
+
+ nlink += ret;
+
+ ret = 0;
+
+ if (nlink != inode->i_nlink) {
+ set_nlink(inode, nlink);
+ btrfs_update_inode(trans, root, inode);
+ }
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
+ if (inode->i_nlink == 0) {
+ if (S_ISDIR(inode->i_mode)) {
+ ret = replay_dir_deletes(trans, root, NULL, path,
+ ino, 1);
+ if (ret)
+ goto out;
+ }
+ ret = insert_orphan_item(trans, root, ino);
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ struct inode *inode;
+
+ key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = (u64)-1;
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ break;
+
+ if (ret == 1) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ inode = read_one_inode(root, key.offset);
+ if (!inode)
+ return -EIO;
+
+ ret = fixup_inode_link_count(trans, root, inode);
+ iput(inode);
+ if (ret)
+ goto out;
+
+ /*
+ * fixup on a directory may create new entries,
+ * make sure we always look for the highset possible
+ * offset
+ */
+ key.offset = (u64)-1;
+ }
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+
+/*
+ * record a given inode in the fixup dir so we can check its link
+ * count when replay is done. The link count is incremented here
+ * so the inode won't go away until we check it
+ */
+static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid)
+{
+ struct btrfs_key key;
+ int ret = 0;
+ struct inode *inode;
+
+ inode = read_one_inode(root, objectid);
+ if (!inode)
+ return -EIO;
+
+ key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = objectid;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+ btrfs_release_path(path);
+ if (ret == 0) {
+ if (!inode->i_nlink)
+ set_nlink(inode, 1);
+ else
+ inc_nlink(inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ } else if (ret == -EEXIST) {
+ ret = 0;
+ } else {
+ BUG(); /* Logic Error */
+ }
+ iput(inode);
+
+ return ret;
+}
+
+/*
+ * when replaying the log for a directory, we only insert names
+ * for inodes that actually exist. This means an fsync on a directory
+ * does not implicitly fsync all the new files in it
+ */
+static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, u64 index,
+ char *name, int name_len, u8 type,
+ struct btrfs_key *location)
+{
+ struct inode *inode;
+ struct inode *dir;
+ int ret;
+
+ inode = read_one_inode(root, location->objectid);
+ if (!inode)
+ return -ENOENT;
+
+ dir = read_one_inode(root, dirid);
+ if (!dir) {
+ iput(inode);
+ return -EIO;
+ }
+
+ ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
+
+ /* FIXME, put inode into FIXUP list */
+
+ iput(inode);
+ iput(dir);
+ return ret;
+}
+
+/*
+ * Return true if an inode reference exists in the log for the given name,
+ * inode and parent inode.
+ */
+static bool name_in_log_ref(struct btrfs_root *log_root,
+ const char *name, const int name_len,
+ const u64 dirid, const u64 ino)
+{
+ struct btrfs_key search_key;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = dirid;
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(dirid, name, name_len);
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ return false;
+}
+
+/*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+ *
+ * if a conflicting item exists in the subdirectory already,
+ * the inode it points to is unlinked and put into the link count
+ * fix up tree.
+ *
+ * If a name from the log points to a file or directory that does
+ * not exist in the FS, it is skipped. fsyncs on directories
+ * do not force down inodes inside that directory, just changes to the
+ * names or unlinks in a directory.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ struct btrfs_dir_item *di,
+ struct btrfs_key *key)
+{
+ char *name;
+ int name_len;
+ struct btrfs_dir_item *dst_di;
+ struct btrfs_key found_key;
+ struct btrfs_key log_key;
+ struct inode *dir;
+ u8 log_type;
+ int exists;
+ int ret = 0;
+ bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+
+ dir = read_one_inode(root, key->objectid);
+ if (!dir)
+ return -EIO;
+
+ name_len = btrfs_dir_name_len(eb, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ log_type = btrfs_dir_type(eb, di);
+ read_extent_buffer(eb, name, (unsigned long)(di + 1),
+ name_len);
+
+ btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+ exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
+ if (exists == 0)
+ exists = 1;
+ else
+ exists = 0;
+ btrfs_release_path(path);
+
+ if (key->type == BTRFS_DIR_ITEM_KEY) {
+ dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+ name, name_len, 1);
+ } else if (key->type == BTRFS_DIR_INDEX_KEY) {
+ dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+ key->objectid,
+ key->offset, name,
+ name_len, 1);
+ } else {
+ /* Corruption */
+ ret = -EINVAL;
+ goto out;
+ }
+ if (IS_ERR_OR_NULL(dst_di)) {
+ /* we need a sequence number to insert, so we only
+ * do inserts for the BTRFS_DIR_INDEX_KEY types
+ */
+ if (key->type != BTRFS_DIR_INDEX_KEY)
+ goto out;
+ goto insert;
+ }
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ /* the existing item matches the logged item */
+ if (found_key.objectid == log_key.objectid &&
+ found_key.type == log_key.type &&
+ found_key.offset == log_key.offset &&
+ btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+ update_size = false;
+ goto out;
+ }
+
+ /*
+ * don't drop the conflicting directory entry if the inode
+ * for the new entry doesn't exist
+ */
+ if (!exists)
+ goto out;
+
+ ret = drop_one_dir_item(trans, root, path, dir, dst_di);
+ if (ret)
+ goto out;
+
+ if (key->type == BTRFS_DIR_INDEX_KEY)
+ goto insert;
+out:
+ btrfs_release_path(path);
+ if (!ret && update_size) {
+ btrfs_i_size_write(dir, dir->i_size + name_len * 2);
+ ret = btrfs_update_inode(trans, root, dir);
+ }
+ kfree(name);
+ iput(dir);
+ return ret;
+
+insert:
+ if (name_in_log_ref(root->log_root, name, name_len,
+ key->objectid, log_key.objectid)) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
+ btrfs_release_path(path);
+ ret = insert_one_name(trans, root, path, key->objectid, key->offset,
+ name, name_len, log_type, &log_key);
+ if (ret && ret != -ENOENT && ret != -EEXIST)
+ goto out;
+ update_size = false;
+ ret = 0;
+ goto out;
+}
+
+/*
+ * find all the names in a directory item and reconcile them into
+ * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
+ * one name in a directory item, but the same code gets used for
+ * both directory index types
+ */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ u32 item_size = btrfs_item_size_nr(eb, slot);
+ struct btrfs_dir_item *di;
+ int name_len;
+ unsigned long ptr;
+ unsigned long ptr_end;
+
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ ptr_end = ptr + item_size;
+ while (ptr < ptr_end) {
+ di = (struct btrfs_dir_item *)ptr;
+ if (verify_dir_item(root, eb, di))
+ return -EIO;
+ name_len = btrfs_dir_name_len(eb, di);
+ ret = replay_one_name(trans, root, path, eb, di, key);
+ if (ret)
+ return ret;
+ ptr = (unsigned long)(di + 1);
+ ptr += name_len;
+ }
+ return 0;
+}
+
+/*
+ * directory replay has two parts. There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for. During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, int key_type,
+ u64 *start_ret, u64 *end_ret)
+{
+ struct btrfs_key key;
+ u64 found_end;
+ struct btrfs_dir_log_item *item;
+ int ret;
+ int nritems;
+
+ if (*start_ret == (u64)-1)
+ return 1;
+
+ key.objectid = dirid;
+ key.type = key_type;
+ key.offset = *start_ret;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+ if (ret != 0)
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != key_type || key.objectid != dirid) {
+ ret = 1;
+ goto next;
+ }
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ found_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ if (*start_ret >= key.offset && *start_ret <= found_end) {
+ ret = 0;
+ *start_ret = key.offset;
+ *end_ret = found_end;
+ goto out;
+ }
+ ret = 1;
+next:
+ /* check the next slot in the tree to see if it is a valid item */
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+ } else {
+ path->slots[0]++;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != key_type || key.objectid != dirid) {
+ ret = 1;
+ goto out;
+ }
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ found_end = btrfs_dir_log_end(path->nodes[0], item);
+ *start_ret = key.offset;
+ *end_ret = found_end;
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
+ * this looks for a given directory item in the log. If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct btrfs_path *log_path,
+ struct inode *dir,
+ struct btrfs_key *dir_key)
+{
+ int ret;
+ struct extent_buffer *eb;
+ int slot;
+ u32 item_size;
+ struct btrfs_dir_item *di;
+ struct btrfs_dir_item *log_di;
+ int name_len;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ char *name;
+ struct inode *inode;
+ struct btrfs_key location;
+
+again:
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ ptr_end = ptr + item_size;
+ while (ptr < ptr_end) {
+ di = (struct btrfs_dir_item *)ptr;
+ if (verify_dir_item(root, eb, di)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ name_len = btrfs_dir_name_len(eb, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(eb, name, (unsigned long)(di + 1),
+ name_len);
+ log_di = NULL;
+ if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
+ log_di = btrfs_lookup_dir_item(trans, log, log_path,
+ dir_key->objectid,
+ name, name_len, 0);
+ } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
+ log_di = btrfs_lookup_dir_index_item(trans, log,
+ log_path,
+ dir_key->objectid,
+ dir_key->offset,
+ name, name_len, 0);
+ }
+ if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
+ inode = read_one_inode(root, location.objectid);
+ if (!inode) {
+ kfree(name);
+ return -EIO;
+ }
+
+ ret = link_to_fixup_dir(trans, root,
+ path, location.objectid);
+ if (ret) {
+ kfree(name);
+ iput(inode);
+ goto out;
+ }
+
+ inc_nlink(inode);
+ ret = btrfs_unlink_inode(trans, root, dir, inode,
+ name, name_len);
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans, root);
+ kfree(name);
+ iput(inode);
+ if (ret)
+ goto out;
+
+ /* there might still be more names under this key
+ * check and repeat if required
+ */
+ ret = btrfs_search_slot(NULL, root, dir_key, path,
+ 0, 0);
+ if (ret == 0)
+ goto again;
+ ret = 0;
+ goto out;
+ } else if (IS_ERR(log_di)) {
+ kfree(name);
+ return PTR_ERR(log_di);
+ }
+ btrfs_release_path(log_path);
+ kfree(name);
+
+ ptr = (unsigned long)(di + 1);
+ ptr += name_len;
+ }
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
+ return ret;
+}
+
+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const u64 ino)
+{
+ struct btrfs_key search_key;
+ struct btrfs_path *log_path;
+ int i;
+ int nritems;
+ int ret;
+
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_XATTR_ITEM_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+process_leaf:
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct btrfs_dir_item *log_di;
+ u32 total_size;
+ u32 cur;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size_nr(path->nodes[0], i);
+ cur = 0;
+ while (cur < total_size) {
+ u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u32 this_len = sizeof(*di) + name_len + data_len;
+ char *name;
+
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(path->nodes[0], name,
+ (unsigned long)(di + 1), name_len);
+
+ log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+ name, name_len, 0);
+ btrfs_release_path(log_path);
+ if (!log_di) {
+ /* Doesn't exist in log tree, so delete it. */
+ btrfs_release_path(path);
+ di = btrfs_lookup_xattr(trans, root, path, ino,
+ name, name_len, -1);
+ kfree(name);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ ASSERT(di);
+ ret = btrfs_delete_one_dir_name(trans, root,
+ path, di);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ search_key = key;
+ goto again;
+ }
+ kfree(name);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ goto out;
+ }
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *)di + this_len);
+ }
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ goto process_leaf;
+out:
+ btrfs_free_path(log_path);
+ btrfs_release_path(path);
+ return ret;
+}
+
+
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes. It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid, int del_all)
+{
+ u64 range_start;
+ u64 range_end;
+ int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+ int ret = 0;
+ struct btrfs_key dir_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *log_path;
+ struct inode *dir;
+
+ dir_key.objectid = dirid;
+ dir_key.type = BTRFS_DIR_ITEM_KEY;
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+
+ dir = read_one_inode(root, dirid);
+ /* it isn't an error if the inode isn't there, that can happen
+ * because we replay the deletes before we copy in the inode item
+ * from the log
+ */
+ if (!dir) {
+ btrfs_free_path(log_path);
+ return 0;
+ }
+again:
+ range_start = 0;
+ range_end = 0;
+ while (1) {
+ if (del_all)
+ range_end = (u64)-1;
+ else {
+ ret = find_dir_range(log, path, dirid, key_type,
+ &range_start, &range_end);
+ if (ret != 0)
+ break;
+ }
+
+ dir_key.offset = range_start;
+ while (1) {
+ int nritems;
+ ret = btrfs_search_slot(NULL, root, &dir_key, path,
+ 0, 0);
+ if (ret < 0)
+ goto out;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != dirid ||
+ found_key.type != dir_key.type)
+ goto next_type;
+
+ if (found_key.offset > range_end)
+ break;
+
+ ret = check_item_in_log(trans, root, log, path,
+ log_path, dir,
+ &found_key);
+ if (ret)
+ goto out;
+ if (found_key.offset == (u64)-1)
+ break;
+ dir_key.offset = found_key.offset + 1;
+ }
+ btrfs_release_path(path);
+ if (range_end == (u64)-1)
+ break;
+ range_start = range_end + 1;
+ }
+
+next_type:
+ ret = 0;
+ if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+ key_type = BTRFS_DIR_LOG_INDEX_KEY;
+ dir_key.type = BTRFS_DIR_INDEX_KEY;
+ btrfs_release_path(path);
+ goto again;
+ }
+out:
+ btrfs_release_path(path);
+ btrfs_free_path(log_path);
+ iput(dir);
+ return ret;
+}
+
+/*
+ * the process_func used to replay items from the log tree. This
+ * gets called in two different stages. The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume. The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen)
+{
+ int nritems;
+ struct btrfs_path *path;
+ struct btrfs_root *root = wc->replay_dest;
+ struct btrfs_key key;
+ int level;
+ int i;
+ int ret;
+
+ ret = btrfs_read_buffer(eb, gen);
+ if (ret)
+ return ret;
+
+ level = btrfs_header_level(eb);
+
+ if (level != 0)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ nritems = btrfs_header_nritems(eb);
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+
+ /* inode keys are done during the first stage */
+ if (key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ struct btrfs_inode_item *inode_item;
+ u32 mode;
+
+ inode_item = btrfs_item_ptr(eb, i,
+ struct btrfs_inode_item);
+ ret = replay_xattr_deletes(wc->trans, root, log,
+ path, key.objectid);
+ if (ret)
+ break;
+ mode = btrfs_inode_mode(eb, inode_item);
+ if (S_ISDIR(mode)) {
+ ret = replay_dir_deletes(wc->trans,
+ root, log, path, key.objectid, 0);
+ if (ret)
+ break;
+ }
+ ret = overwrite_item(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
+
+ /* for regular files, make sure corresponding
+ * orhpan item exist. extents past the new EOF
+ * will be truncated later by orphan cleanup.
+ */
+ if (S_ISREG(mode)) {
+ ret = insert_orphan_item(wc->trans, root,
+ key.objectid);
+ if (ret)
+ break;
+ }
+
+ ret = link_to_fixup_dir(wc->trans, root,
+ path, key.objectid);
+ if (ret)
+ break;
+ }
+
+ if (key.type == BTRFS_DIR_INDEX_KEY &&
+ wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
+ ret = replay_one_dir_item(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
+ }
+
+ if (wc->stage < LOG_WALK_REPLAY_ALL)
+ continue;
+
+ /* these keys are simply copied */
+ if (key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
+ } else if (key.type == BTRFS_INODE_REF_KEY ||
+ key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = add_inode_ref(wc->trans, root, log, path,
+ eb, i, &key);
+ if (ret && ret != -ENOENT)
+ break;
+ ret = 0;
+ } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
+ } else if (key.type == BTRFS_DIR_ITEM_KEY) {
+ ret = replay_one_dir_item(wc->trans, root, path,
+ eb, i, &key);
+ if (ret)
+ break;
+ }
+ }
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ struct walk_control *wc)
+{
+ u64 root_owner;
+ u64 bytenr;
+ u64 ptr_gen;
+ struct extent_buffer *next;
+ struct extent_buffer *cur;
+ struct extent_buffer *parent;
+ u32 blocksize;
+ int ret = 0;
+
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+ while (*level > 0) {
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+ cur = path->nodes[*level];
+
+ WARN_ON(btrfs_header_level(cur) != *level);
+
+ if (path->slots[*level] >=
+ btrfs_header_nritems(cur))
+ break;
+
+ bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+ ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+ blocksize = root->nodesize;
+
+ parent = path->nodes[*level];
+ root_owner = btrfs_header_owner(parent);
+
+ next = btrfs_find_create_tree_block(root, bytenr);
+ if (!next)
+ return -ENOMEM;
+
+ if (*level == 1) {
+ ret = wc->process_func(root, next, wc, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+
+ path->slots[*level]++;
+ if (wc->free) {
+ ret = btrfs_read_buffer(next, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, root->fs_info,
+ next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ }
+
+ WARN_ON(root_owner !=
+ BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_and_pin_reserved_extent(root,
+ bytenr, blocksize);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+ }
+ free_extent_buffer(next);
+ continue;
+ }
+ ret = btrfs_read_buffer(next, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+
+ WARN_ON(*level <= 0);
+ if (path->nodes[*level-1])
+ free_extent_buffer(path->nodes[*level-1]);
+ path->nodes[*level-1] = next;
+ *level = btrfs_header_level(next);
+ path->slots[*level] = 0;
+ cond_resched();
+ }
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+ path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
+
+ cond_resched();
+ return 0;
+}
+
+static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ struct walk_control *wc)
+{
+ u64 root_owner;
+ int i;
+ int slot;
+ int ret;
+
+ for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+ slot = path->slots[i];
+ if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
+ path->slots[i]++;
+ *level = i;
+ WARN_ON(*level == 0);
+ return 0;
+ } else {
+ struct extent_buffer *parent;
+ if (path->nodes[*level] == root->node)
+ parent = path->nodes[*level];
+ else
+ parent = path->nodes[*level + 1];
+
+ root_owner = btrfs_header_owner(parent);
+ ret = wc->process_func(root, path->nodes[*level], wc,
+ btrfs_header_generation(path->nodes[*level]));
+ if (ret)
+ return ret;
+
+ if (wc->free) {
+ struct extent_buffer *next;
+
+ next = path->nodes[*level];
+
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, root->fs_info,
+ next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ }
+
+ WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_and_pin_reserved_extent(root,
+ path->nodes[*level]->start,
+ path->nodes[*level]->len);
+ if (ret)
+ return ret;
+ }
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level = i + 1;
+ }
+ }
+ return 1;
+}
+
+/*
+ * drop the reference count on the tree rooted at 'snap'. This traverses
+ * the tree freeing any blocks that have a ref count of zero after being
+ * decremented.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, struct walk_control *wc)
+{
+ int ret = 0;
+ int wret;
+ int level;
+ struct btrfs_path *path;
+ int orig_level;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ level = btrfs_header_level(log->node);
+ orig_level = level;
+ path->nodes[level] = log->node;
+ extent_buffer_get(log->node);
+ path->slots[level] = 0;
+
+ while (1) {
+ wret = walk_down_log_tree(trans, log, path, &level, wc);
+ if (wret > 0)
+ break;
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+
+ wret = walk_up_log_tree(trans, log, path, &level, wc);
+ if (wret > 0)
+ break;
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ }
+
+ /* was the root node processed? if not, catch it here */
+ if (path->nodes[orig_level]) {
+ ret = wc->process_func(log, path->nodes[orig_level], wc,
+ btrfs_header_generation(path->nodes[orig_level]));
+ if (ret)
+ goto out;
+ if (wc->free) {
+ struct extent_buffer *next;
+
+ next = path->nodes[orig_level];
+
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, log->fs_info, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ }
+
+ WARN_ON(log->root_key.objectid !=
+ BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_and_pin_reserved_extent(log, next->start,
+ next->len);
+ if (ret)
+ goto out;
+ }
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function to update the item for a given subvolumes log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log)
+{
+ int ret;
+
+ if (log->log_transid == 1) {
+ /* insert root item on the first sync */
+ ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
+ &log->root_key, &log->root_item);
+ } else {
+ ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+ &log->root_key, &log->root_item);
+ }
+ return ret;
+}
+
+static void wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int transid)
+{
+ DEFINE_WAIT(wait);
+ int index = transid % 2;
+
+ /*
+ * we only allow two pending log transactions at a time,
+ * so we know that if ours is more than 2 older than the
+ * current transaction, we're done
+ */
+ do {
+ prepare_to_wait(&root->log_commit_wait[index],
+ &wait, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&root->log_mutex);
+
+ if (root->log_transid_committed < transid &&
+ atomic_read(&root->log_commit[index]))
+ schedule();
+
+ finish_wait(&root->log_commit_wait[index], &wait);
+ mutex_lock(&root->log_mutex);
+ } while (root->log_transid_committed < transid &&
+ atomic_read(&root->log_commit[index]));
+}
+
+static void wait_for_writer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ DEFINE_WAIT(wait);
+
+ while (atomic_read(&root->log_writers)) {
+ prepare_to_wait(&root->log_writer_wait,
+ &wait, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&root->log_mutex);
+ if (atomic_read(&root->log_writers))
+ schedule();
+ finish_wait(&root->log_writer_wait, &wait);
+ mutex_lock(&root->log_mutex);
+ }
+}
+
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ if (!ctx)
+ return;
+
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx->list);
+ mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Invoked in log mutex context, or be sure there is no other task which
+ * can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+ int index, int error)
+{
+ struct btrfs_log_ctx *ctx;
+
+ if (!error) {
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+ return;
+ }
+
+ list_for_each_entry(ctx, &root->log_ctxs[index], list)
+ ctx->log_ret = error;
+
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+}
+
+/*
+ * btrfs_sync_log does sends a given tree log down to the disk and
+ * updates the super blocks to record it. When this call is done,
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx)
+{
+ int index1;
+ int index2;
+ int mark;
+ int ret;
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+ int log_transid = 0;
+ struct btrfs_log_ctx root_log_ctx;
+ struct blk_plug plug;
+
+ mutex_lock(&root->log_mutex);
+ log_transid = ctx->log_transid;
+ if (root->log_transid_committed >= log_transid) {
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+
+ index1 = log_transid % 2;
+ if (atomic_read(&root->log_commit[index1])) {
+ wait_log_commit(trans, root, log_transid);
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+ ASSERT(log_transid == root->log_transid);
+ atomic_set(&root->log_commit[index1], 1);
+
+ /* wait for previous tree log sync to complete */
+ if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
+ wait_log_commit(trans, root, log_transid - 1);
+
+ while (1) {
+ int batch = atomic_read(&root->log_batch);
+ /* when we're on an ssd, just kick the log commit out */
+ if (!btrfs_test_opt(root, SSD) &&
+ test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
+ mutex_unlock(&root->log_mutex);
+ schedule_timeout_uninterruptible(1);
+ mutex_lock(&root->log_mutex);
+ }
+ wait_for_writer(trans, root);
+ if (batch == atomic_read(&root->log_batch))
+ break;
+ }
+
+ /* bail out if we need to do a full commit */
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ ret = -EAGAIN;
+ btrfs_free_logged_extents(log, log_transid);
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
+
+ if (log_transid % 2 == 0)
+ mark = EXTENT_DIRTY;
+ else
+ mark = EXTENT_NEW;
+
+ /* we start IO on all the marked extents here, but we don't actually
+ * wait for them until later.
+ */
+ blk_start_plug(&plug);
+ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
+ if (ret) {
+ blk_finish_plug(&plug);
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_free_logged_extents(log, log_transid);
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
+
+ btrfs_set_root_node(&log->root_item, log->node);
+
+ root->log_transid++;
+ log->log_transid = root->log_transid;
+ root->log_start_pid = 0;
+ /*
+ * IO has been started, blocks of the log tree have WRITTEN flag set
+ * in their headers. new modifications of the log will be written to
+ * new positions. so it's safe to allow log writers to go in.
+ */
+ mutex_unlock(&root->log_mutex);
+
+ btrfs_init_log_ctx(&root_log_ctx);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ atomic_inc(&log_root_tree->log_batch);
+ atomic_inc(&log_root_tree->log_writers);
+
+ index2 = log_root_tree->log_transid % 2;
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+ root_log_ctx.log_transid = log_root_tree->log_transid;
+
+ mutex_unlock(&log_root_tree->log_mutex);
+
+ ret = update_log_root(trans, log);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ if (atomic_dec_and_test(&log_root_tree->log_writers)) {
+ smp_mb();
+ if (waitqueue_active(&log_root_tree->log_writer_wait))
+ wake_up(&log_root_tree->log_writer_wait);
+ }
+
+ if (ret) {
+ if (!list_empty(&root_log_ctx.list))
+ list_del_init(&root_log_ctx.list);
+
+ blk_finish_plug(&plug);
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
+ if (ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, root, ret);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out;
+ }
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_free_logged_extents(log, log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+ blk_finish_plug(&plug);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+
+ index2 = root_log_ctx.log_transid % 2;
+ if (atomic_read(&log_root_tree->log_commit[index2])) {
+ blk_finish_plug(&plug);
+ ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
+ mark);
+ btrfs_wait_logged_extents(trans, log, log_transid);
+ wait_log_commit(trans, log_root_tree,
+ root_log_ctx.log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ if (!ret)
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
+ atomic_set(&log_root_tree->log_commit[index2], 1);
+
+ if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+ wait_log_commit(trans, log_root_tree,
+ root_log_ctx.log_transid - 1);
+ }
+
+ wait_for_writer(trans, log_root_tree);
+
+ /*
+ * now that we've moved on to the tree of log tree roots,
+ * check the full commit flag again
+ */
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ blk_finish_plug(&plug);
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_free_logged_extents(log, log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out_wake_log_root;
+ }
+
+ ret = btrfs_write_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ blk_finish_plug(&plug);
+ if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_free_logged_extents(log, log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
+ ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ if (!ret)
+ ret = btrfs_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_NEW | EXTENT_DIRTY);
+ if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_free_logged_extents(log, log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
+ btrfs_wait_logged_extents(trans, log, log_transid);
+
+ btrfs_set_super_log_root(root->fs_info->super_for_commit,
+ log_root_tree->node->start);
+ btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
+ btrfs_header_level(log_root_tree->node));
+
+ log_root_tree->log_transid++;
+ mutex_unlock(&log_root_tree->log_mutex);
+
+ /*
+ * nobody else is going to jump in and write the the ctree
+ * super here because the log_commit atomic below is protecting
+ * us. We must be called with a transaction handle pinning
+ * the running transaction open, so a full commit can't hop
+ * in and cause problems either.
+ */
+ ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
+ if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_wake_log_root;
+ }
+
+ mutex_lock(&root->log_mutex);
+ if (root->last_log_commit < log_transid)
+ root->last_log_commit = log_transid;
+ mutex_unlock(&root->log_mutex);
+
+out_wake_log_root:
+ /*
+ * We needn't get log_mutex here because we are sure all
+ * the other tasks are blocked.
+ */
+ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ log_root_tree->log_transid_committed++;
+ atomic_set(&log_root_tree->log_commit[index2], 0);
+ mutex_unlock(&log_root_tree->log_mutex);
+
+ if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
+ wake_up(&log_root_tree->log_commit_wait[index2]);
+out:
+ /* See above. */
+ btrfs_remove_all_log_ctxs(root, index1, ret);
+
+ mutex_lock(&root->log_mutex);
+ root->log_transid_committed++;
+ atomic_set(&root->log_commit[index1], 0);
+ mutex_unlock(&root->log_mutex);
+
+ if (waitqueue_active(&root->log_commit_wait[index1]))
+ wake_up(&root->log_commit_wait[index1]);
+ return ret;
+}
+
+static void free_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log)
+{
+ int ret;
+ u64 start;
+ u64 end;
+ struct walk_control wc = {
+ .free = 1,
+ .process_func = process_one_buffer
+ };
+
+ ret = walk_log_tree(trans, log, &wc);
+ /* I don't think this can happen but just in case */
+ if (ret)
+ btrfs_abort_transaction(trans, log, ret);
+
+ while (1) {
+ ret = find_first_extent_bit(&log->dirty_log_pages,
+ 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+ NULL);
+ if (ret)
+ break;
+
+ clear_extent_bits(&log->dirty_log_pages, start, end,
+ EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+ }
+
+ /*
+ * We may have short-circuited the log tree with the full commit logic
+ * and left ordered extents on our list, so clear these out to keep us
+ * from leaking inodes and memory.
+ */
+ btrfs_free_logged_extents(log, 0);
+ btrfs_free_logged_extents(log, 1);
+
+ free_extent_buffer(log->node);
+ kfree(log);
+}
+
+/*
+ * free all the extents used by the tree log. This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+ if (root->log_root) {
+ free_log_tree(trans, root->log_root);
+ root->log_root = NULL;
+ }
+ return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->log_root_tree) {
+ free_log_tree(trans, fs_info->log_root_tree);
+ fs_info->log_root_tree = NULL;
+ }
+ return 0;
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist. But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked. Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimizations allows us to avoid relogging the entire inode
+ * or the entire directory.
+ */
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *dir, u64 index)
+{
+ struct btrfs_root *log;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ int ret;
+ int err = 0;
+ int bytes_del = 0;
+ u64 dir_ino = btrfs_ino(dir);
+
+ if (BTRFS_I(dir)->logged_trans < trans->transid)
+ return 0;
+
+ ret = join_running_log_trans(root);
+ if (ret)
+ return 0;
+
+ mutex_lock(&BTRFS_I(dir)->log_mutex);
+
+ log = root->log_root;
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
+ name, name_len, -1);
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto fail;
+ }
+ if (di) {
+ ret = btrfs_delete_one_dir_name(trans, log, path, di);
+ bytes_del += name_len;
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+ }
+ btrfs_release_path(path);
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+ index, name, name_len, -1);
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto fail;
+ }
+ if (di) {
+ ret = btrfs_delete_one_dir_name(trans, log, path, di);
+ bytes_del += name_len;
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+ }
+
+ /* update the directory size in the log to reflect the names
+ * we have removed
+ */
+ if (bytes_del) {
+ struct btrfs_key key;
+
+ key.objectid = dir_ino;
+ key.offset = 0;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ btrfs_release_path(path);
+
+ ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
+ if (ret == 0) {
+ struct btrfs_inode_item *item;
+ u64 i_size;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ i_size = btrfs_inode_size(path->nodes[0], item);
+ if (i_size > bytes_del)
+ i_size -= bytes_del;
+ else
+ i_size = 0;
+ btrfs_set_inode_size(path->nodes[0], item, i_size);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ } else
+ ret = 0;
+ btrfs_release_path(path);
+ }
+fail:
+ btrfs_free_path(path);
+out_unlock:
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
+ if (ret == -ENOSPC) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ ret = 0;
+ } else if (ret < 0)
+ btrfs_abort_transaction(trans, root, ret);
+
+ btrfs_end_log_trans(root);
+
+ return err;
+}
+
+/* see comments for btrfs_del_dir_entries_in_log */
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *inode, u64 dirid)
+{
+ struct btrfs_root *log;
+ u64 index;
+ int ret;
+
+ if (BTRFS_I(inode)->logged_trans < trans->transid)
+ return 0;
+
+ ret = join_running_log_trans(root);
+ if (ret)
+ return 0;
+ log = root->log_root;
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+ ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
+ dirid, &index);
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ if (ret == -ENOSPC) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ ret = 0;
+ } else if (ret < 0 && ret != -ENOENT)
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_log_trans(root);
+
+ return ret;
+}
+
+/*
+ * creates a range item in the log for 'dirid'. first_offset and
+ * last_offset tell us which parts of the key space the log should
+ * be considered authoritative for.
+ */
+static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ int key_type, u64 dirid,
+ u64 first_offset, u64 last_offset)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_dir_log_item *item;
+
+ key.objectid = dirid;
+ key.offset = first_offset;
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ key.type = BTRFS_DIR_LOG_ITEM_KEY;
+ else
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
+ if (ret)
+ return ret;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(path);
+ return 0;
+}
+
+/*
+ * log all the items included in the current transaction for a given
+ * directory. This also creates the range items in the log tree required
+ * to replay anything deleted before the fsync
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path, int key_type,
+ struct btrfs_log_ctx *ctx,
+ u64 min_offset, u64 *last_offset_ret)
+{
+ struct btrfs_key min_key;
+ struct btrfs_root *log = root->log_root;
+ struct extent_buffer *src;
+ int err = 0;
+ int ret;
+ int i;
+ int nritems;
+ u64 first_offset = min_offset;
+ u64 last_offset = (u64)-1;
+ u64 ino = btrfs_ino(inode);
+
+ log = root->log_root;
+
+ min_key.objectid = ino;
+ min_key.type = key_type;
+ min_key.offset = min_offset;
+
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+
+ /*
+ * we didn't find anything from this transaction, see if there
+ * is anything at all
+ */
+ if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+ min_key.objectid = ino;
+ min_key.type = key_type;
+ min_key.offset = (u64)-1;
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (ret < 0) {
+ btrfs_release_path(path);
+ return ret;
+ }
+ ret = btrfs_previous_item(root, path, ino, key_type);
+
+ /* if ret == 0 there are items for this type,
+ * create a range to tell us the last key of this type.
+ * otherwise, there are no items in this directory after
+ * *min_offset, and we create a range to indicate that.
+ */
+ if (ret == 0) {
+ struct btrfs_key tmp;
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+ path->slots[0]);
+ if (key_type == tmp.type)
+ first_offset = max(min_offset, tmp.offset) + 1;
+ }
+ goto done;
+ }
+
+ /* go backward to find any previous key */
+ ret = btrfs_previous_item(root, path, ino, key_type);
+ if (ret == 0) {
+ struct btrfs_key tmp;
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+ if (key_type == tmp.type) {
+ first_offset = tmp.offset;
+ ret = overwrite_item(trans, log, dst_path,
+ path->nodes[0], path->slots[0],
+ &tmp);
+ if (ret) {
+ err = ret;
+ goto done;
+ }
+ }
+ }
+ btrfs_release_path(path);
+
+ /* find the first key from this transaction again */
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (WARN_ON(ret != 0))
+ goto done;
+
+ /*
+ * we have a block from this transaction, log every item in it
+ * from our directory
+ */
+ while (1) {
+ struct btrfs_key tmp;
+ src = path->nodes[0];
+ nritems = btrfs_header_nritems(src);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+
+ btrfs_item_key_to_cpu(src, &min_key, i);
+
+ if (min_key.objectid != ino || min_key.type != key_type)
+ goto done;
+ ret = overwrite_item(trans, log, dst_path, src, i,
+ &min_key);
+ if (ret) {
+ err = ret;
+ goto done;
+ }
+
+ /*
+ * We must make sure that when we log a directory entry,
+ * the corresponding inode, after log replay, has a
+ * matching link count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our
+ * file inode would have a link count of 1, but we get
+ * two directory entries pointing to the same inode.
+ * After removing one of the names, it would not be
+ * possible to remove the other name, which resulted
+ * always in stale file handle errors, and would not
+ * be possible to rmdir the parent directory, since
+ * its i_size could never decrement to the value
+ * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+ */
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(src, di, &tmp);
+ if (ctx &&
+ (btrfs_dir_transid(src, di) == trans->transid ||
+ btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+ tmp.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+ path->slots[0] = nritems;
+
+ /*
+ * look ahead to the next item and see if it is also
+ * from this directory and from this transaction
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 1) {
+ last_offset = (u64)-1;
+ goto done;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+ if (tmp.objectid != ino || tmp.type != key_type) {
+ last_offset = (u64)-1;
+ goto done;
+ }
+ if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+ ret = overwrite_item(trans, log, dst_path,
+ path->nodes[0], path->slots[0],
+ &tmp);
+ if (ret)
+ err = ret;
+ else
+ last_offset = tmp.offset;
+ goto done;
+ }
+ }
+done:
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+
+ if (err == 0) {
+ *last_offset_ret = last_offset;
+ /*
+ * insert the log range keys to indicate where the log
+ * is valid
+ */
+ ret = insert_dir_log_key(trans, log, path, key_type,
+ ino, first_offset, last_offset);
+ if (ret)
+ err = ret;
+ }
+ return err;
+}
+
+/*
+ * logging directories is very similar to logging inodes, We find all the items
+ * from the current transaction and write them to the log.
+ *
+ * The recovery code scans the directory in the subvolume, and if it finds a
+ * key in the range logged that is not present in the log tree, then it means
+ * that dir entry was unlinked during the transaction.
+ *
+ * In order for that scan to work, we must include one key smaller than
+ * the smallest logged by this transaction and one key larger than the largest
+ * key logged by this transaction.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
+{
+ u64 min_key;
+ u64 max_key;
+ int ret;
+ int key_type = BTRFS_DIR_ITEM_KEY;
+
+again:
+ min_key = 0;
+ max_key = 0;
+ while (1) {
+ ret = log_dir_items(trans, root, inode, path,
+ dst_path, key_type, ctx, min_key,
+ &max_key);
+ if (ret)
+ return ret;
+ if (max_key == (u64)-1)
+ break;
+ min_key = max_key + 1;
+ }
+
+ if (key_type == BTRFS_DIR_ITEM_KEY) {
+ key_type = BTRFS_DIR_INDEX_KEY;
+ goto again;
+ }
+ return 0;
+}
+
+/*
+ * a helper function to drop items from the log before we relog an
+ * inode. max_key_type indicates the highest item type to remove.
+ * This cannot be run for file data extents because it does not
+ * free the extents they point to.
+ */
+static int drop_objectid_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 objectid, int max_key_type)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int start_slot;
+
+ key.objectid = objectid;
+ key.type = max_key_type;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+ BUG_ON(ret == 0); /* Logic error */
+ if (ret < 0)
+ break;
+
+ if (path->slots[0] == 0)
+ break;
+
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+
+ if (found_key.objectid != objectid)
+ break;
+
+ found_key.offset = 0;
+ found_key.type = 0;
+ ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
+ &start_slot);
+
+ ret = btrfs_del_items(trans, log, path, start_slot,
+ path->slots[0] - start_slot + 1);
+ /*
+ * If start slot isn't 0 then we don't need to re-search, we've
+ * found the last guy with the objectid in this tree.
+ */
+ if (ret || start_slot != 0)
+ break;
+ btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+ struct extent_buffer *leaf,
+ struct btrfs_inode_item *item,
+ struct inode *inode, int log_inode_only,
+ u64 logged_isize)
+{
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+
+ if (log_inode_only) {
+ /* set the generation to zero so the recover code
+ * can tell the difference between an logging
+ * just to say 'this inode exists' and a logging
+ * to say 'update this inode with these values'
+ */
+ btrfs_set_token_inode_generation(leaf, item, 0, &token);
+ btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
+ } else {
+ btrfs_set_token_inode_generation(leaf, item,
+ BTRFS_I(inode)->generation,
+ &token);
+ btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
+ }
+
+ btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+ btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+ btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+ btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+
+ btrfs_set_token_timespec_sec(leaf, &item->atime,
+ inode->i_atime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->atime,
+ inode->i_atime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, &item->mtime,
+ inode->i_mtime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->mtime,
+ inode->i_mtime.tv_nsec, &token);
+
+ btrfs_set_token_timespec_sec(leaf, &item->ctime,
+ inode->i_ctime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->ctime,
+ inode->i_ctime.tv_nsec, &token);
+
+ btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+ &token);
+
+ btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+ btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+ btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+ btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+ btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+}
+
+static int log_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, struct btrfs_path *path,
+ struct inode *inode)
+{
+ struct btrfs_inode_item *inode_item;
+ int ret;
+
+ ret = btrfs_insert_empty_item(trans, log, path,
+ &BTRFS_I(inode)->location,
+ sizeof(*inode_item));
+ if (ret && ret != -EEXIST)
+ return ret;
+ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
+ btrfs_release_path(path);
+ return 0;
+}
+
+static noinline int copy_items(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_path *dst_path,
+ struct btrfs_path *src_path, u64 *last_extent,
+ int start_slot, int nr, int inode_only,
+ u64 logged_isize)
+{
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *src = src_path->nodes[0];
+ struct btrfs_key first_key, last_key, key;
+ int ret;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ char *ins_data;
+ int i;
+ struct list_head ordered_sums;
+ int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ bool has_extents = false;
+ bool need_find_last_extent = true;
+ bool done = false;
+
+ INIT_LIST_HEAD(&ordered_sums);
+
+ ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
+ nr * sizeof(u32), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ first_key.objectid = (u64)-1;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+
+ for (i = 0; i < nr; i++) {
+ ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+ btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+ }
+ ret = btrfs_insert_empty_items(trans, log, dst_path,
+ ins_keys, ins_sizes, nr);
+ if (ret) {
+ kfree(ins_data);
+ return ret;
+ }
+
+ for (i = 0; i < nr; i++, dst_path->slots[0]++) {
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+ dst_path->slots[0]);
+
+ src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+
+ if ((i == (nr - 1)))
+ last_key = ins_keys[i];
+
+ if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(dst_path->nodes[0],
+ dst_path->slots[0],
+ struct btrfs_inode_item);
+ fill_inode_item(trans, dst_path->nodes[0], inode_item,
+ inode, inode_only == LOG_INODE_EXISTS,
+ logged_isize);
+ } else {
+ copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+ src_offset, ins_sizes[i]);
+ }
+
+ /*
+ * We set need_find_last_extent here in case we know we were
+ * processing other items and then walk into the first extent in
+ * the inode. If we don't hit an extent then nothing changes,
+ * we'll do the last search the next time around.
+ */
+ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
+ has_extents = true;
+ if (first_key.objectid == (u64)-1)
+ first_key = ins_keys[i];
+ } else {
+ need_find_last_extent = false;
+ }
+
+ /* take a reference on file data extents so that truncates
+ * or deletes of this inode don't have to relog the inode
+ * again
+ */
+ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
+ !skip_csum) {
+ int found_type;
+ extent = btrfs_item_ptr(src, start_slot + i,
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_generation(src, extent) < trans->transid)
+ continue;
+
+ found_type = btrfs_file_extent_type(src, extent);
+ if (found_type == BTRFS_FILE_EXTENT_REG) {
+ u64 ds, dl, cs, cl;
+ ds = btrfs_file_extent_disk_bytenr(src,
+ extent);
+ /* ds == 0 is a hole */
+ if (ds == 0)
+ continue;
+
+ dl = btrfs_file_extent_disk_num_bytes(src,
+ extent);
+ cs = btrfs_file_extent_offset(src, extent);
+ cl = btrfs_file_extent_num_bytes(src,
+ extent);
+ if (btrfs_file_extent_compression(src,
+ extent)) {
+ cs = 0;
+ cl = dl;
+ }
+
+ ret = btrfs_lookup_csums_range(
+ log->fs_info->csum_root,
+ ds + cs, ds + cs + cl - 1,
+ &ordered_sums, 0);
+ if (ret) {
+ btrfs_release_path(dst_path);
+ kfree(ins_data);
+ return ret;
+ }
+ }
+ }
+ }
+
+ btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+ btrfs_release_path(dst_path);
+ kfree(ins_data);
+
+ /*
+ * we have to do this after the loop above to avoid changing the
+ * log tree while trying to change the log tree.
+ */
+ ret = 0;
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+
+ if (!has_extents)
+ return ret;
+
+ if (need_find_last_extent && *last_extent == first_key.offset) {
+ /*
+ * We don't have any leafs between our current one and the one
+ * we processed before that can have file extent items for our
+ * inode (and have a generation number smaller than our current
+ * transaction id).
+ */
+ need_find_last_extent = false;
+ }
+
+ /*
+ * Because we use btrfs_search_forward we could skip leaves that were
+ * not modified and then assume *last_extent is valid when it really
+ * isn't. So back up to the previous leaf and read the end of the last
+ * extent before we go and fill in holes.
+ */
+ if (need_find_last_extent) {
+ u64 len;
+
+ ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ goto fill_holes;
+ if (src_path->slots[0])
+ src_path->slots[0]--;
+ src = src_path->nodes[0];
+ btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ goto fill_holes;
+ extent = btrfs_item_ptr(src, src_path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(src, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(src,
+ src_path->slots[0],
+ extent);
+ *last_extent = ALIGN(key.offset + len,
+ log->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(src, extent);
+ *last_extent = key.offset + len;
+ }
+ }
+fill_holes:
+ /* So we did prev_leaf, now we need to move to the next leaf, but a few
+ * things could have happened
+ *
+ * 1) A merge could have happened, so we could currently be on a leaf
+ * that holds what we were copying in the first place.
+ * 2) A split could have happened, and now not all of the items we want
+ * are on the same leaf.
+ *
+ * So we need to adjust how we search for holes, we need to drop the
+ * path and re-search for the first extent key we found, and then walk
+ * forward until we hit the last one we copied.
+ */
+ if (need_find_last_extent) {
+ /* btrfs_prev_leaf could return 1 without releasing the path */
+ btrfs_release_path(src_path);
+ ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
+ src_path, 0, 0);
+ if (ret < 0)
+ return ret;
+ ASSERT(ret == 0);
+ src = src_path->nodes[0];
+ i = src_path->slots[0];
+ } else {
+ i = start_slot;
+ }
+
+ /*
+ * Ok so here we need to go through and fill in any holes we may have
+ * to make sure that holes are punched for those areas in case they had
+ * extents previously.
+ */
+ while (!done) {
+ u64 offset, len;
+ u64 extent_end;
+
+ if (i >= btrfs_header_nritems(src_path->nodes[0])) {
+ ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
+ if (ret < 0)
+ return ret;
+ ASSERT(ret == 0);
+ src = src_path->nodes[0];
+ i = 0;
+ }
+
+ btrfs_item_key_to_cpu(src, &key, i);
+ if (!btrfs_comp_cpu_keys(&key, &last_key))
+ done = true;
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ i++;
+ continue;
+ }
+ extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(src, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(src, i, extent);
+ extent_end = ALIGN(key.offset + len, log->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(src, extent);
+ extent_end = key.offset + len;
+ }
+ i++;
+
+ if (*last_extent == key.offset) {
+ *last_extent = extent_end;
+ continue;
+ }
+ offset = *last_extent;
+ len = key.offset - *last_extent;
+ ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
+ offset, 0, 0, len, 0, len, 0,
+ 0, 0);
+ if (ret)
+ break;
+ *last_extent = extent_end;
+ }
+ /*
+ * Need to let the callers know we dropped the path so they should
+ * re-search.
+ */
+ if (!ret && need_find_last_extent)
+ ret = 1;
+ return ret;
+}
+
+static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct extent_map *em1, *em2;
+
+ em1 = list_entry(a, struct extent_map, list);
+ em2 = list_entry(b, struct extent_map, list);
+
+ if (em1->start < em2->start)
+ return -1;
+ else if (em1->start > em2->start)
+ return 1;
+ return 0;
+}
+
+static int wait_ordered_extents(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_root *root,
+ const struct extent_map *em,
+ const struct list_head *logged_list,
+ bool *ordered_io_error)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *log = root->log_root;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
+ const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ u64 csum_offset;
+ u64 csum_len;
+ LIST_HEAD(ordered_sums);
+ int ret = 0;
+
+ *ordered_io_error = false;
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ em->block_start == EXTENT_MAP_HOLE)
+ return 0;
+
+ /*
+ * Wait far any ordered extent that covers our extent map. If it
+ * finishes without an error, first check and see if our csums are on
+ * our outstanding ordered extents.
+ */
+ list_for_each_entry(ordered, logged_list, log_list) {
+ struct btrfs_ordered_sum *sum;
+
+ if (!mod_len)
+ break;
+
+ if (ordered->file_offset + ordered->len <= mod_start ||
+ mod_start + mod_len <= ordered->file_offset)
+ continue;
+
+ if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ const u64 start = ordered->file_offset;
+ const u64 end = ordered->file_offset + ordered->len - 1;
+
+ WARN_ON(ordered->inode != inode);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ }
+
+ wait_event(ordered->wait,
+ (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
+ test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
+
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+ /*
+ * Clear the AS_EIO/AS_ENOSPC flags from the inode's
+ * i_mapping flags, so that the next fsync won't get
+ * an outdated io error too.
+ */
+ btrfs_inode_check_errors(inode);
+ *ordered_io_error = true;
+ break;
+ }
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this
+ * ordered extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered->file_offset + ordered->len >=
+ mod_start + mod_len)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered->file_offset + ordered->len <
+ mod_start + mod_len) {
+ mod_len = (mod_start + mod_len) -
+ (ordered->file_offset + ordered->len);
+ mod_start = ordered->file_offset +
+ ordered->len;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ if (skip_csum)
+ continue;
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
+ &ordered->flags))
+ continue;
+
+ if (ordered->csum_bytes_left) {
+ btrfs_start_ordered_extent(inode, ordered, 0);
+ wait_event(ordered->wait,
+ ordered->csum_bytes_left == 0);
+ }
+
+ list_for_each_entry(sum, &ordered->list, list) {
+ ret = btrfs_csum_file_blocks(trans, log, sum);
+ if (ret)
+ break;
+ }
+ }
+
+ if (*ordered_io_error || !mod_len || ret || skip_csum)
+ return ret;
+
+ if (em->compress_type) {
+ csum_offset = 0;
+ csum_len = max(em->block_len, em->orig_block_len);
+ } else {
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
+ }
+
+ /* block start is already adjusted for the file extent offset. */
+ ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
+ em->block_start + csum_offset,
+ em->block_start + csum_offset +
+ csum_len - 1, &ordered_sums, 0);
+ if (ret)
+ return ret;
+
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+
+ return ret;
+}
+
+static int log_one_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct btrfs_root *root,
+ const struct extent_map *em,
+ struct btrfs_path *path,
+ const struct list_head *logged_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct btrfs_map_token token;
+ struct btrfs_key key;
+ u64 extent_offset = em->start - em->orig_start;
+ u64 block_len;
+ int ret;
+ int extent_inserted = 0;
+ bool ordered_io_err = false;
+
+ ret = wait_ordered_extents(trans, inode, root, em, logged_list,
+ &ordered_io_err);
+ if (ret)
+ return ret;
+
+ if (ordered_io_err) {
+ ctx->io_err = -EIO;
+ return 0;
+ }
+
+ btrfs_init_map_token(&token);
+
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+ em->start + em->len, NULL, 0, 1,
+ sizeof(*fi), &extent_inserted);
+ if (ret)
+ return ret;
+
+ if (!extent_inserted) {
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = em->start;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(*fi));
+ if (ret)
+ return ret;
+ }
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
+ &token);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_PREALLOC,
+ &token);
+ else
+ btrfs_set_token_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG,
+ &token);
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start,
+ &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+ em->block_start -
+ extent_offset, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+ &token);
+ } else {
+ btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+ &token);
+ }
+
+ btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
+ btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+ btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
+ btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+ &token);
+ btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+ btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct list_head *logged_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct extent_map *em, *n;
+ struct list_head extents;
+ struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+ u64 test_gen;
+ int ret = 0;
+ int num = 0;
+
+ INIT_LIST_HEAD(&extents);
+
+ write_lock(&tree->lock);
+ test_gen = root->fs_info->last_trans_committed;
+
+ list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+ list_del_init(&em->list);
+
+ /*
+ * Just an arbitrary number, this can be really CPU intensive
+ * once we start getting a lot of extents, and really once we
+ * have a bunch of extents we just want to commit since it will
+ * be faster.
+ */
+ if (++num > 32768) {
+ list_del_init(&tree->modified_extents);
+ ret = -EFBIG;
+ goto process;
+ }
+
+ if (em->generation <= test_gen)
+ continue;
+ /* Need a ref to keep it from getting evicted from cache */
+ atomic_inc(&em->refs);
+ set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ list_add_tail(&em->list, &extents);
+ num++;
+ }
+
+ list_sort(NULL, &extents, extent_cmp);
+
+process:
+ while (!list_empty(&extents)) {
+ em = list_entry(extents.next, struct extent_map, list);
+
+ list_del_init(&em->list);
+
+ /*
+ * If we had an error we just need to delete everybody from our
+ * private list.
+ */
+ if (ret) {
+ clear_em_logging(tree, em);
+ free_extent_map(em);
+ continue;
+ }
+
+ write_unlock(&tree->lock);
+
+ ret = log_one_extent(trans, inode, root, em, path, logged_list,
+ ctx);
+ write_lock(&tree->lock);
+ clear_em_logging(tree, em);
+ free_extent_map(em);
+ }
+ WARN_ON(!list_empty(&extents));
+ write_unlock(&tree->lock);
+
+ btrfs_release_path(path);
+ return ret;
+}
+
+static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
+ struct btrfs_path *path, u64 *size_ret)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *size_ret = 0;
+ } else {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ *size_ret = btrfs_inode_size(path->nodes[0], item);
+ }
+
+ btrfs_release_path(path);
+ return 0;
+}
+
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree. An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.
+ *
+ * This handles both files and directories.
+ */
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_path *path;
+ struct btrfs_path *dst_path;
+ struct btrfs_key min_key;
+ struct btrfs_key max_key;
+ struct btrfs_root *log = root->log_root;
+ struct extent_buffer *src = NULL;
+ LIST_HEAD(logged_list);
+ u64 last_extent = 0;
+ int err = 0;
+ int ret;
+ int nritems;
+ int ins_start_slot = 0;
+ int ins_nr;
+ bool fast_search = false;
+ u64 ino = btrfs_ino(inode);
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 logged_isize = 0;
+ bool need_log_inode_item = true;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_INODE_ITEM_KEY;
+ min_key.offset = 0;
+
+ max_key.objectid = ino;
+
+
+ /* today the code can only do partial logging of directories */
+ if (S_ISDIR(inode->i_mode) ||
+ (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags) &&
+ inode_only == LOG_INODE_EXISTS))
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ else
+ max_key.type = (u8)-1;
+ max_key.offset = (u64)-1;
+
+ /*
+ * Only run delayed items if we are a dir or a new file.
+ * Otherwise commit the delayed inode only, which is needed in
+ * order for the log replay code to mark inodes for link count
+ * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ */
+ if (S_ISDIR(inode->i_mode) ||
+ BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
+ ret = btrfs_commit_inode_delayed_items(trans, inode);
+ else
+ ret = btrfs_commit_inode_delayed_inode(inode);
+
+ if (ret) {
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
+ }
+
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+ btrfs_get_logged_extents(inode, &logged_list, start, end);
+
+ /*
+ * a brute force approach to making sure we get the most uptodate
+ * copies of everything.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+
+ if (inode_only == LOG_INODE_EXISTS)
+ max_key_type = BTRFS_XATTR_ITEM_KEY;
+ ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+ } else {
+ if (inode_only == LOG_INODE_EXISTS) {
+ /*
+ * Make sure the new inode item we write to the log has
+ * the same isize as the current one (if it exists).
+ * This is necessary to prevent data loss after log
+ * replay, and also to prevent doing a wrong expanding
+ * truncate - for e.g. create file, write 4K into offset
+ * 0, fsync, write 4K into offset 4096, add hard link,
+ * fsync some other file (to sync log), power fail - if
+ * we use the inode's current i_size, after log replay
+ * we get a 8Kb file, with the last 4Kb extent as a hole
+ * (zeroes), as if an expanding truncate happened,
+ * instead of getting a file of 4Kb only.
+ */
+ err = logged_inode_size(log, inode, path,
+ &logged_isize);
+ if (err)
+ goto out_unlock;
+ }
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ ret = drop_objectid_items(trans, log, path, ino,
+ max_key.type);
+ } else {
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
+ while(1) {
+ ret = btrfs_truncate_inode_items(trans,
+ log, inode, 0, 0);
+ if (ret != -EAGAIN)
+ break;
+ }
+ }
+ } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags) ||
+ inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_ALL)
+ fast_search = true;
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ ret = drop_objectid_items(trans, log, path, ino,
+ max_key.type);
+ } else {
+ if (inode_only == LOG_INODE_ALL)
+ fast_search = true;
+ goto log_extents;
+ }
+
+ }
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+
+ while (1) {
+ ins_nr = 0;
+ ret = btrfs_search_forward(root, &min_key,
+ path, trans->transid);
+ if (ret != 0)
+ break;
+again:
+ /* note, ins_nr might be > 0 here, cleanup outside the loop */
+ if (min_key.objectid != ino)
+ break;
+ if (min_key.type > max_key.type)
+ break;
+
+ if (min_key.type == BTRFS_INODE_ITEM_KEY)
+ need_log_inode_item = false;
+
+ src = path->nodes[0];
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+ ins_nr++;
+ goto next_slot;
+ } else if (!ins_nr) {
+ ins_start_slot = path->slots[0];
+ ins_nr = 1;
+ goto next_slot;
+ }
+
+ ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ if (ret) {
+ ins_nr = 0;
+ btrfs_release_path(path);
+ continue;
+ }
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+next_slot:
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ path->slots[0]++;
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key,
+ path->slots[0]);
+ goto again;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ret = 0;
+ ins_nr = 0;
+ }
+ btrfs_release_path(path);
+
+ if (min_key.offset < (u64)-1) {
+ min_key.offset++;
+ } else if (min_key.type < max_key.type) {
+ min_key.type++;
+ min_key.offset = 0;
+ } else {
+ break;
+ }
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ret = 0;
+ ins_nr = 0;
+ }
+
+log_extents:
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ if (need_log_inode_item) {
+ err = log_inode_item(trans, log, dst_path, inode);
+ if (err)
+ goto out_unlock;
+ }
+ if (fast_search) {
+ /*
+ * Some ordered extents started by fsync might have completed
+ * before we collected the ordered extents in logged_list, which
+ * means they're gone, not in our logged_list nor in the inode's
+ * ordered tree. We want the application/user space to know an
+ * error happened while attempting to persist file data so that
+ * it can take proper action. If such error happened, we leave
+ * without writing to the log tree and the fsync must report the
+ * file data write error and not commit the current transaction.
+ */
+ err = btrfs_inode_check_errors(inode);
+ if (err) {
+ ctx->io_err = err;
+ goto out_unlock;
+ }
+ ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+ &logged_list, ctx);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+ } else if (inode_only == LOG_INODE_ALL) {
+ struct extent_map *em, *n;
+
+ write_lock(&em_tree->lock);
+ /*
+ * We can't just remove every em if we're called for a ranged
+ * fsync - that is, one that doesn't cover the whole possible
+ * file range (0 to LLONG_MAX). This is because we can have
+ * em's that fall outside the range we're logging and therefore
+ * their ordered operations haven't completed yet
+ * (btrfs_finish_ordered_io() not invoked yet). This means we
+ * didn't get their respective file extent item in the fs/subvol
+ * tree yet, and need to let the next fast fsync (one which
+ * consults the list of modified extent maps) find the em so
+ * that it logs a matching file extent item and waits for the
+ * respective ordered operation to complete (if it's still
+ * running).
+ *
+ * Removing every em outside the range we're logging would make
+ * the next fast fsync not log their matching file extent items,
+ * therefore making us lose data after a log replay.
+ */
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents,
+ list) {
+ const u64 mod_end = em->mod_start + em->mod_len - 1;
+
+ if (em->mod_start >= start && mod_end <= end)
+ list_del_init(&em->list);
+ }
+ write_unlock(&em_tree->lock);
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+ ret = log_directory_changes(trans, root, inode, path, dst_path,
+ ctx);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+ }
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->logged_trans = trans->transid;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+ spin_unlock(&BTRFS_I(inode)->lock);
+out_unlock:
+ if (unlikely(err))
+ btrfs_put_logged_extents(&logged_list);
+ else
+ btrfs_submit_logged_extents(&logged_list, log);
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return err;
+}
+
+/*
+ * follow the dentry parent pointers up the chain and see if any
+ * of the directories in it require a full commit before they can
+ * be logged. Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct dentry *parent,
+ struct super_block *sb,
+ u64 last_committed)
+{
+ int ret = 0;
+ struct btrfs_root *root;
+ struct dentry *old_parent = NULL;
+ struct inode *orig_inode = inode;
+
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed)
+ goto out;
+
+ if (!S_ISDIR(inode->i_mode)) {
+ if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ goto out;
+ inode = d_inode(parent);
+ }
+
+ while (1) {
+ /*
+ * If we are logging a directory then we start with our inode,
+ * not our parents inode, so we need to skipp setting the
+ * logged_trans so that further down in the log code we don't
+ * think this inode has already been logged.
+ */
+ if (inode != orig_inode)
+ BTRFS_I(inode)->logged_trans = trans->transid;
+ smp_mb();
+
+ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+ root = BTRFS_I(inode)->root;
+
+ /*
+ * make sure any commits to the log are forced
+ * to be full commits
+ */
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ ret = 1;
+ break;
+ }
+
+ if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ break;
+
+ if (IS_ROOT(parent))
+ break;
+
+ parent = dget_parent(parent);
+ dput(old_parent);
+ old_parent = parent;
+ inode = d_inode(parent);
+
+ }
+ dput(old_parent);
+out:
+ return ret;
+}
+
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about the why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names. This does not result in a problem because if a dir_item key is
+ * logged but its matching dir_index key is not logged, at log replay time we
+ * don't use it to replay the respective name (see replay_one_name()). On the
+ * other hand if only the dir_index key ends up being logged, the respective
+ * name is added to the fs/subvol tree with both the dir_item and dir_index
+ * keys created (see replay_one_name()).
+ * The directory's inode item with a wrong i_size is not a problem as well,
+ * since we don't use it at log replay time to set the i_size in the inode
+ * item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+ dir_elem->ino = btrfs_ino(start_inode);
+ list_add_tail(&dir_elem->list, &dir_list);
+
+ while (!list_empty(&dir_list)) {
+ struct extent_buffer *leaf;
+ struct btrfs_key min_key;
+ int nritems;
+ int i;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+ list);
+ if (ret)
+ goto next_dir_inode;
+
+ min_key.objectid = dir_elem->ino;
+ min_key.type = BTRFS_DIR_ITEM_KEY;
+ min_key.offset = 0;
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+ if (ret < 0) {
+ goto next_dir_inode;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next_dir_inode;
+ }
+
+process_leaf:
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct inode *di_inode;
+ struct btrfs_dir_list *new_dir_elem;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != dir_elem->ino ||
+ min_key.type != BTRFS_DIR_ITEM_KEY)
+ goto next_dir_inode;
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ type = btrfs_dir_type(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid &&
+ type != BTRFS_FT_DIR)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+ root, NULL);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto next_dir_inode;
+ }
+
+ if (btrfs_inode_in_log(di_inode, trans->transid)) {
+ iput(di_inode);
+ continue;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ btrfs_release_path(path);
+ ret = btrfs_log_inode(trans, root, di_inode,
+ log_mode, 0, LLONG_MAX, ctx);
+ iput(di_inode);
+ if (ret)
+ goto next_dir_inode;
+ if (ctx->log_new_dentries) {
+ new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+ GFP_NOFS);
+ if (!new_dir_elem) {
+ ret = -ENOMEM;
+ goto next_dir_inode;
+ }
+ new_dir_elem->ino = di_key.objectid;
+ list_add_tail(&new_dir_elem->list, &dir_list);
+ }
+ break;
+ }
+ if (i == nritems) {
+ ret = btrfs_next_leaf(log, path);
+ if (ret < 0) {
+ goto next_dir_inode;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next_dir_inode;
+ }
+ goto process_leaf;
+ }
+ if (min_key.offset < (u64)-1) {
+ min_key.offset++;
+ goto again;
+ }
+next_dir_inode:
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function around btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log. A minimal inode and backref
+ * only logging is done of any parent directories that are older than
+ * the last committed transaction
+ */
+static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent,
+ const loff_t start,
+ const loff_t end,
+ int exists_only,
+ struct btrfs_log_ctx *ctx)
+{
+ int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
+ struct super_block *sb;
+ struct dentry *old_parent = NULL;
+ int ret = 0;
+ u64 last_committed = root->fs_info->last_trans_committed;
+ const struct dentry * const first_parent = parent;
+ const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
+ last_committed);
+ bool log_dentries = false;
+ struct inode *orig_inode = inode;
+
+ sb = inode->i_sb;
+
+ if (btrfs_test_opt(root, NOTREELOG)) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ /*
+ * The prev transaction commit doesn't complete, we need do
+ * full commit by ourselves.
+ */
+ if (root->fs_info->last_trans_log_full_commit >
+ root->fs_info->last_trans_committed) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ if (root != BTRFS_I(inode)->root ||
+ btrfs_root_refs(&root->root_item) == 0) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ ret = check_parent_dirs_for_sync(trans, inode, parent,
+ sb, last_committed);
+ if (ret)
+ goto end_no_trans;
+
+ if (btrfs_inode_in_log(inode, trans->transid)) {
+ ret = BTRFS_NO_LOG_SYNC;
+ goto end_no_trans;
+ }
+
+ ret = start_log_trans(trans, root, ctx);
+ if (ret)
+ goto end_no_trans;
+
+ ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+ if (ret)
+ goto end_trans;
+
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed) {
+ ret = 0;
+ goto end_trans;
+ }
+
+ if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+ log_dentries = true;
+
+ while (1) {
+ if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+ break;
+
+ inode = d_inode(parent);
+ if (root != BTRFS_I(inode)->root)
+ break;
+
+ /*
+ * On unlink we must make sure our immediate parent directory
+ * inode is fully logged. This is to prevent leaving dangling
+ * directory index entries and a wrong directory inode's i_size.
+ * Not doing so can result in a directory being impossible to
+ * delete after log replay (rmdir will always fail with error
+ * -ENOTEMPTY).
+ */
+ if (did_unlink && parent == first_parent)
+ inode_only = LOG_INODE_ALL;
+ else
+ inode_only = LOG_INODE_EXISTS;
+
+ if (BTRFS_I(inode)->generation >
+ root->fs_info->last_trans_committed ||
+ inode_only == LOG_INODE_ALL) {
+ ret = btrfs_log_inode(trans, root, inode, inode_only,
+ 0, LLONG_MAX, ctx);
+ if (ret)
+ goto end_trans;
+ }
+ if (IS_ROOT(parent))
+ break;
+
+ parent = dget_parent(parent);
+ dput(old_parent);
+ old_parent = parent;
+ }
+ if (log_dentries)
+ ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+ else
+ ret = 0;
+end_trans:
+ dput(old_parent);
+ if (ret < 0) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ ret = 1;
+ }
+
+ if (ret)
+ btrfs_remove_log_ctx(root, ctx);
+ btrfs_end_log_trans(root);
+end_no_trans:
+ return ret;
+}
+
+/*
+ * it is not safe to log dentry if the chunk root has added new
+ * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
+ * If this returns 1, you must commit the transaction to safely get your
+ * data on disk.
+ */
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx)
+{
+ struct dentry *parent = dget_parent(dentry);
+ int ret;
+
+ ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
+ start, end, 0, ctx);
+ dput(parent);
+
+ return ret;
+}
+
+/*
+ * should be called during mount to recover any replay any log trees
+ * from the FS
+ */
+int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_key tmp_key;
+ struct btrfs_root *log;
+ struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+ struct walk_control wc = {
+ .process_func = process_one_buffer,
+ .stage = 0,
+ };
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ fs_info->log_root_recovering = 1;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto error;
+ }
+
+ wc.trans = trans;
+ wc.pin = 1;
+
+ ret = walk_log_tree(trans, log_root_tree, &wc);
+ if (ret) {
+ btrfs_error(fs_info, ret, "Failed to pin buffers while "
+ "recovering log root tree.");
+ goto error;
+ }
+
+again:
+ key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
+
+ if (ret < 0) {
+ btrfs_error(fs_info, ret,
+ "Couldn't find tree log root.");
+ goto error;
+ }
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ btrfs_release_path(path);
+ if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ break;
+
+ log = btrfs_read_fs_root(log_root_tree, &found_key);
+ if (IS_ERR(log)) {
+ ret = PTR_ERR(log);
+ btrfs_error(fs_info, ret,
+ "Couldn't read tree log root.");
+ goto error;
+ }
+
+ tmp_key.objectid = found_key.offset;
+ tmp_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_key.offset = (u64)-1;
+
+ wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+ if (IS_ERR(wc.replay_dest)) {
+ ret = PTR_ERR(wc.replay_dest);
+ free_extent_buffer(log->node);
+ free_extent_buffer(log->commit_root);
+ kfree(log);
+ btrfs_error(fs_info, ret, "Couldn't read target root "
+ "for tree log recovery.");
+ goto error;
+ }
+
+ wc.replay_dest->log_root = log;
+ btrfs_record_root_in_trans(trans, wc.replay_dest);
+ ret = walk_log_tree(trans, log, &wc);
+
+ if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
+ ret = fixup_inode_link_counts(trans, wc.replay_dest,
+ path);
+ }
+
+ key.offset = found_key.offset - 1;
+ wc.replay_dest->log_root = NULL;
+ free_extent_buffer(log->node);
+ free_extent_buffer(log->commit_root);
+ kfree(log);
+
+ if (ret)
+ goto error;
+
+ if (found_key.offset == 0)
+ break;
+ }
+ btrfs_release_path(path);
+
+ /* step one is to pin it all, step two is to replay just inodes */
+ if (wc.pin) {
+ wc.pin = 0;
+ wc.process_func = replay_one_buffer;
+ wc.stage = LOG_WALK_REPLAY_INODES;
+ goto again;
+ }
+ /* step three is to replay everything */
+ if (wc.stage < LOG_WALK_REPLAY_ALL) {
+ wc.stage++;
+ goto again;
+ }
+
+ btrfs_free_path(path);
+
+ /* step 4: commit the transaction, which also unpins the blocks */
+ ret = btrfs_commit_transaction(trans, fs_info->tree_root);
+ if (ret)
+ return ret;
+
+ free_extent_buffer(log_root_tree->node);
+ log_root_tree->log_root = NULL;
+ fs_info->log_root_recovering = 0;
+ kfree(log_root_tree);
+
+ return 0;
+error:
+ if (wc.trans)
+ btrfs_end_transaction(wc.trans, fs_info->tree_root);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename)
+{
+ /*
+ * when we're logging a file, if it hasn't been renamed
+ * or unlinked, and its inode is fully committed on disk,
+ * we don't have to worry about walking up the directory chain
+ * to log its parents.
+ *
+ * So, we use the last_unlink_trans field to put this transid
+ * into the file. When the file is logged we check it and
+ * don't log the parents if the file is fully on disk.
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this directory was already logged any new
+ * names for this file/dir will get recorded
+ */
+ smp_mb();
+ if (BTRFS_I(dir)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * if the inode we're about to unlink was logged,
+ * the log will be properly updated for any new names
+ */
+ if (BTRFS_I(inode)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * when renaming files across directories, if the directory
+ * there we're unlinking from gets fsync'd later on, there's
+ * no way to find the destination directory later and fsync it
+ * properly. So, we have to be conservative and force commits
+ * so the new name gets discovered.
+ */
+ if (for_rename)
+ goto record;
+
+ /* we can safely do the unlink without any special recording */
+ return;
+
+record:
+ BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent)
+{
+ struct btrfs_root * root = BTRFS_I(inode)->root;
+
+ /*
+ * this will force the logging code to walk the dentry chain
+ * up for the file
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this inode hasn't been logged and directory we're renaming it
+ * from hasn't been logged, we don't need to log it
+ */
+ if (BTRFS_I(inode)->logged_trans <=
+ root->fs_info->last_trans_committed &&
+ (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+ root->fs_info->last_trans_committed))
+ return 0;
+
+ return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+ LLONG_MAX, 1, NULL);
+}
+
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000..6916a781e
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __TREE_LOG_
+#define __TREE_LOG_
+
+#include "ctree.h"
+#include "transaction.h"
+
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
+struct btrfs_log_ctx {
+ int log_ret;
+ int log_transid;
+ int io_err;
+ bool log_new_dentries;
+ struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ ctx->io_err = 0;
+ ctx->log_new_dentries = false;
+ INIT_LIST_HEAD(&ctx->list);
+}
+
+static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
+{
+ ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid;
+}
+
+static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
+{
+ return ACCESS_ONCE(fs_info->last_trans_log_full_commit) ==
+ trans->transid;
+}
+
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx);
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_recover_log_trees(struct btrfs_root *tree_root);
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry,
+ const loff_t start,
+ const loff_t end,
+ struct btrfs_log_ctx *ctx);
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *dir, u64 index);
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *inode, u64 dirid);
+void btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent);
+#endif
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 000000000..840a38b27
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ */
+
+#include <linux/slab.h>
+#include "ulist.h"
+#include "ctree.h"
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * A sample usage for ulists is the enumeration of directed graphs without
+ * visiting a node twice. The pseudo-code could look like this:
+ *
+ * ulist = ulist_alloc();
+ * ulist_add(ulist, root);
+ * ULIST_ITER_INIT(&uiter);
+ *
+ * while ((elem = ulist_next(ulist, &uiter)) {
+ * for (all child nodes n in elem)
+ * ulist_add(ulist, n);
+ * do something useful with the node;
+ * }
+ * ulist_free(ulist);
+ *
+ * This assumes the graph nodes are adressable by u64. This stems from the
+ * usage for tree enumeration in btrfs, where the logical addresses are
+ * 64 bit.
+ *
+ * It is also useful for tree enumeration which could be done elegantly
+ * recursively, but is not possible due to kernel stack limitations. The
+ * loop would be similar to the above.
+ */
+
+/**
+ * ulist_init - freshly initialize a ulist
+ * @ulist: the ulist to initialize
+ *
+ * Note: don't use this function to init an already used ulist, use
+ * ulist_reinit instead.
+ */
+void ulist_init(struct ulist *ulist)
+{
+ INIT_LIST_HEAD(&ulist->nodes);
+ ulist->root = RB_ROOT;
+ ulist->nnodes = 0;
+}
+
+/**
+ * ulist_fini - free up additionally allocated memory for the ulist
+ * @ulist: the ulist from which to free the additional memory
+ *
+ * This is useful in cases where the base 'struct ulist' has been statically
+ * allocated.
+ */
+static void ulist_fini(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_node *next;
+
+ list_for_each_entry_safe(node, next, &ulist->nodes, list) {
+ kfree(node);
+ }
+ ulist->root = RB_ROOT;
+ INIT_LIST_HEAD(&ulist->nodes);
+}
+
+/**
+ * ulist_reinit - prepare a ulist for reuse
+ * @ulist: ulist to be reused
+ *
+ * Free up all additional memory allocated for the list elements and reinit
+ * the ulist.
+ */
+void ulist_reinit(struct ulist *ulist)
+{
+ ulist_fini(ulist);
+ ulist_init(ulist);
+}
+
+/**
+ * ulist_alloc - dynamically allocate a ulist
+ * @gfp_mask: allocation flags to for base allocation
+ *
+ * The allocated ulist will be returned in an initialized state.
+ */
+struct ulist *ulist_alloc(gfp_t gfp_mask)
+{
+ struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
+
+ if (!ulist)
+ return NULL;
+
+ ulist_init(ulist);
+
+ return ulist;
+}
+
+/**
+ * ulist_free - free dynamically allocated ulist
+ * @ulist: ulist to free
+ *
+ * It is not necessary to call ulist_fini before.
+ */
+void ulist_free(struct ulist *ulist)
+{
+ if (!ulist)
+ return;
+ ulist_fini(ulist);
+ kfree(ulist);
+}
+
+static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
+{
+ struct rb_node *n = ulist->root.rb_node;
+ struct ulist_node *u = NULL;
+
+ while (n) {
+ u = rb_entry(n, struct ulist_node, rb_node);
+ if (u->val < val)
+ n = n->rb_right;
+ else if (u->val > val)
+ n = n->rb_left;
+ else
+ return u;
+ }
+ return NULL;
+}
+
+static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
+{
+ struct rb_node **p = &ulist->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct ulist_node *cur = NULL;
+
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(parent, struct ulist_node, rb_node);
+
+ if (cur->val < ins->val)
+ p = &(*p)->rb_right;
+ else if (cur->val > ins->val)
+ p = &(*p)->rb_left;
+ else
+ return -EEXIST;
+ }
+ rb_link_node(&ins->rb_node, parent, p);
+ rb_insert_color(&ins->rb_node, &ulist->root);
+ return 0;
+}
+
+/**
+ * ulist_add - add an element to the ulist
+ * @ulist: ulist to add the element to
+ * @val: value to add to ulist
+ * @aux: auxiliary value to store along with val
+ * @gfp_mask: flags to use for allocation
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks write
+ * locking is needed
+ *
+ * Add an element to a ulist. The @val will only be added if it doesn't
+ * already exist. If it is added, the auxiliary value @aux is stored along with
+ * it. In case @val already exists in the ulist, @aux is ignored, even if
+ * it differs from the already stored value.
+ *
+ * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
+ * inserted.
+ * In case of allocation failure -ENOMEM is returned and the ulist stays
+ * unaltered.
+ */
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
+{
+ return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+ u64 *old_aux, gfp_t gfp_mask)
+{
+ int ret;
+ struct ulist_node *node;
+
+ node = ulist_rbtree_search(ulist, val);
+ if (node) {
+ if (old_aux)
+ *old_aux = node->aux;
+ return 0;
+ }
+ node = kmalloc(sizeof(*node), gfp_mask);
+ if (!node)
+ return -ENOMEM;
+
+ node->val = val;
+ node->aux = aux;
+#ifdef CONFIG_BTRFS_DEBUG
+ node->seqnum = ulist->nnodes;
+#endif
+
+ ret = ulist_rbtree_insert(ulist, node);
+ ASSERT(!ret);
+ list_add_tail(&node->list, &ulist->nodes);
+ ulist->nnodes++;
+
+ return 1;
+}
+
+/**
+ * ulist_next - iterate ulist
+ * @ulist: ulist to iterate
+ * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks only read
+ * locking is needed
+ *
+ * This function is used to iterate an ulist.
+ * It returns the next element from the ulist or %NULL when the
+ * end is reached. No guarantee is made with respect to the order in which
+ * the elements are returned. They might neither be returned in order of
+ * addition nor in ascending order.
+ * It is allowed to call ulist_add during an enumeration. Newly added items
+ * are guaranteed to show up in the running enumeration.
+ */
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
+{
+ struct ulist_node *node;
+
+ if (list_empty(&ulist->nodes))
+ return NULL;
+ if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes)
+ return NULL;
+ if (uiter->cur_list) {
+ uiter->cur_list = uiter->cur_list->next;
+ } else {
+ uiter->cur_list = ulist->nodes.next;
+#ifdef CONFIG_BTRFS_DEBUG
+ uiter->i = 0;
+#endif
+ }
+ node = list_entry(uiter->cur_list, struct ulist_node, list);
+#ifdef CONFIG_BTRFS_DEBUG
+ ASSERT(node->seqnum == uiter->i);
+ ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
+ uiter->i++;
+#endif
+ return node;
+}
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 000000000..4c29db604
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ *
+ */
+
+#ifndef __ULIST__
+#define __ULIST__
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ */
+struct ulist_iterator {
+#ifdef CONFIG_BTRFS_DEBUG
+ int i;
+#endif
+ struct list_head *cur_list; /* hint to start search */
+};
+
+/*
+ * element of the list
+ */
+struct ulist_node {
+ u64 val; /* value to store */
+ u64 aux; /* auxiliary value saved along with the val */
+
+#ifdef CONFIG_BTRFS_DEBUG
+ int seqnum; /* sequence number this node is added */
+#endif
+
+ struct list_head list; /* used to link node */
+ struct rb_node rb_node; /* used to speed up search */
+};
+
+struct ulist {
+ /*
+ * number of elements stored in list
+ */
+ unsigned long nnodes;
+
+ struct list_head nodes;
+ struct rb_root root;
+};
+
+void ulist_init(struct ulist *ulist);
+void ulist_reinit(struct ulist *ulist);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
+void ulist_free(struct ulist *ulist);
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+ u64 *old_aux, gfp_t gfp_mask);
+
+/* just like ulist_add_merge() but take a pointer for the aux data */
+static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
+ void **old_aux, gfp_t gfp_mask)
+{
+#if BITS_PER_LONG == 32
+ u64 old64 = (uintptr_t)*old_aux;
+ int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
+ *old_aux = (void *)((uintptr_t)old64);
+ return ret;
+#else
+ return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
+#endif
+}
+
+struct ulist_node *ulist_next(struct ulist *ulist,
+ struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL)
+
+#endif
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
new file mode 100644
index 000000000..778282944
--- /dev/null
+++ b/fs/btrfs/uuid-tree.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) STRATO AG 2013. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/uuid.h>
+#include <asm/unaligned.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+
+static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
+{
+ key->type = type;
+ key->objectid = get_unaligned_le64(uuid);
+ key->offset = get_unaligned_le64(uuid + sizeof(u64));
+}
+
+/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */
+static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
+ u8 type, u64 subid)
+{
+ int ret;
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *eb;
+ int slot;
+ u32 item_size;
+ unsigned long offset;
+ struct btrfs_key key;
+
+ if (WARN_ON_ONCE(!uuid_root)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ btrfs_uuid_to_key(uuid, type, &key);
+ ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ offset = btrfs_item_ptr_offset(eb, slot);
+ ret = -ENOENT;
+
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+ goto out;
+ }
+ while (item_size) {
+ __le64 data;
+
+ read_extent_buffer(eb, &data, offset, sizeof(data));
+ if (le64_to_cpu(data) == subid) {
+ ret = 0;
+ break;
+ }
+ offset += sizeof(data);
+ item_size -= sizeof(data);
+ }
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid_cpu)
+{
+ int ret;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ int slot;
+ unsigned long offset;
+ __le64 subid_le;
+
+ ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu);
+ if (ret != -ENOENT)
+ return ret;
+
+ if (WARN_ON_ONCE(!uuid_root)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_uuid_to_key(uuid, type, &key);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
+ sizeof(subid_le));
+ if (ret >= 0) {
+ /* Add an item for the type for the first time */
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+ } else if (ret == -EEXIST) {
+ /*
+ * An item with that type already exists.
+ * Extend the item and store the new subid at the end.
+ */
+ btrfs_extend_item(uuid_root, path, sizeof(subid_le));
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+ offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
+ } else if (ret < 0) {
+ btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d "
+ "(0x%016llx, 0x%016llx) type %u!",
+ ret, (unsigned long long)key.objectid,
+ (unsigned long long)key.offset, type);
+ goto out;
+ }
+
+ ret = 0;
+ subid_le = cpu_to_le64(subid_cpu);
+ write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
+ struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid)
+{
+ int ret;
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ int slot;
+ unsigned long offset;
+ u32 item_size;
+ unsigned long move_dst;
+ unsigned long move_src;
+ unsigned long move_len;
+
+ if (WARN_ON_ONCE(!uuid_root)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_uuid_to_key(uuid, type, &key);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
+ if (ret < 0) {
+ btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!",
+ ret);
+ goto out;
+ }
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ offset = btrfs_item_ptr_offset(eb, slot);
+ item_size = btrfs_item_size_nr(eb, slot);
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+ ret = -ENOENT;
+ goto out;
+ }
+ while (item_size) {
+ __le64 read_subid;
+
+ read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid));
+ if (le64_to_cpu(read_subid) == subid)
+ break;
+ offset += sizeof(read_subid);
+ item_size -= sizeof(read_subid);
+ }
+
+ if (!item_size) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ item_size = btrfs_item_size_nr(eb, slot);
+ if (item_size == sizeof(subid)) {
+ ret = btrfs_del_item(trans, uuid_root, path);
+ goto out;
+ }
+
+ move_dst = offset;
+ move_src = offset + sizeof(subid);
+ move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
+ memmove_extent_buffer(eb, move_dst, move_src, move_len);
+ btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ u64 subid)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /* 1 - for the uuid item */
+ trans = btrfs_start_transaction(uuid_root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid);
+ btrfs_end_transaction(trans, uuid_root);
+
+out:
+ return ret;
+}
+
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
+ int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
+ u64))
+{
+ struct btrfs_root *root = fs_info->uuid_root;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret = 0;
+ struct extent_buffer *leaf;
+ int slot;
+ u32 item_size;
+ unsigned long offset;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+
+again_search_slot:
+ ret = btrfs_search_forward(root, &key, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto out;
+ }
+
+ while (1) {
+ cond_resched();
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.type != BTRFS_UUID_KEY_SUBVOL &&
+ key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+ goto skip;
+
+ offset = btrfs_item_ptr_offset(leaf, slot);
+ item_size = btrfs_item_size_nr(leaf, slot);
+ if (!IS_ALIGNED(item_size, sizeof(u64))) {
+ btrfs_warn(fs_info, "uuid item with illegal size %lu!",
+ (unsigned long)item_size);
+ goto skip;
+ }
+ while (item_size) {
+ u8 uuid[BTRFS_UUID_SIZE];
+ __le64 subid_le;
+ u64 subid_cpu;
+
+ put_unaligned_le64(key.objectid, uuid);
+ put_unaligned_le64(key.offset, uuid + sizeof(u64));
+ read_extent_buffer(leaf, &subid_le, offset,
+ sizeof(subid_le));
+ subid_cpu = le64_to_cpu(subid_le);
+ ret = check_func(fs_info, uuid, key.type, subid_cpu);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ btrfs_release_path(path);
+ ret = btrfs_uuid_iter_rem(root, uuid, key.type,
+ subid_cpu);
+ if (ret == 0) {
+ /*
+ * this might look inefficient, but the
+ * justification is that it is an
+ * exception that check_func returns 1,
+ * and that in the regular case only one
+ * entry per UUID exists.
+ */
+ goto again_search_slot;
+ }
+ if (ret < 0 && ret != -ENOENT)
+ goto out;
+ }
+ item_size -= sizeof(subid_le);
+ offset += sizeof(subid_le);
+ }
+
+skip:
+ ret = btrfs_next_item(root, path);
+ if (ret == 0)
+ continue;
+ else if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
+ return 0;
+}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000..174f5e1e0
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,6730 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/semaphore.h>
+#include <asm/div64.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "math.h"
+#include "dev-replace.h"
+#include "sysfs.h"
+
+static int init_first_rw_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device);
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+
+DEFINE_MUTEX(uuid_mutex);
+static LIST_HEAD(fs_uuids);
+
+static struct btrfs_fs_devices *__alloc_fs_devices(void)
+{
+ struct btrfs_fs_devices *fs_devs;
+
+ fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
+ if (!fs_devs)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&fs_devs->device_list_mutex);
+
+ INIT_LIST_HEAD(&fs_devs->devices);
+ INIT_LIST_HEAD(&fs_devs->resized_devices);
+ INIT_LIST_HEAD(&fs_devs->alloc_list);
+ INIT_LIST_HEAD(&fs_devs->list);
+
+ return fs_devs;
+}
+
+/**
+ * alloc_fs_devices - allocate struct btrfs_fs_devices
+ * @fsid: a pointer to UUID for this FS. If NULL a new UUID is
+ * generated.
+ *
+ * Return: a pointer to a new &struct btrfs_fs_devices on success;
+ * ERR_PTR() on error. Returned struct is not linked onto any lists and
+ * can be destroyed with kfree() right away.
+ */
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+{
+ struct btrfs_fs_devices *fs_devs;
+
+ fs_devs = __alloc_fs_devices();
+ if (IS_ERR(fs_devs))
+ return fs_devs;
+
+ if (fsid)
+ memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+ else
+ generate_random_uuid(fs_devs->fsid);
+
+ return fs_devs;
+}
+
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ WARN_ON(fs_devices->opened);
+ while (!list_empty(&fs_devices->devices)) {
+ device = list_entry(fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ list_del(&device->dev_list);
+ rcu_string_free(device->name);
+ kfree(device);
+ }
+ kfree(fs_devices);
+}
+
+static void btrfs_kobject_uevent(struct block_device *bdev,
+ enum kobject_action action)
+{
+ int ret;
+
+ ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+ if (ret)
+ pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+ action,
+ kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+ &disk_to_dev(bdev->bd_disk)->kobj);
+}
+
+void btrfs_cleanup_fs_uuids(void)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ while (!list_empty(&fs_uuids)) {
+ fs_devices = list_entry(fs_uuids.next,
+ struct btrfs_fs_devices, list);
+ list_del(&fs_devices->list);
+ free_fs_devices(fs_devices);
+ }
+}
+
+static struct btrfs_device *__alloc_device(void)
+{
+ struct btrfs_device *dev;
+
+ dev = kzalloc(sizeof(*dev), GFP_NOFS);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dev->dev_list);
+ INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->resized_list);
+
+ spin_lock_init(&dev->io_lock);
+
+ spin_lock_init(&dev->reada_lock);
+ atomic_set(&dev->reada_in_flight, 0);
+ atomic_set(&dev->dev_stats_ccnt, 0);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
+ return dev;
+}
+
+static noinline struct btrfs_device *__find_device(struct list_head *head,
+ u64 devid, u8 *uuid)
+{
+ struct btrfs_device *dev;
+
+ list_for_each_entry(dev, head, dev_list) {
+ if (dev->devid == devid &&
+ (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
+ return dev;
+ }
+ }
+ return NULL;
+}
+
+static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ list_for_each_entry(fs_devices, &fs_uuids, list) {
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ }
+ return NULL;
+}
+
+static int
+btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
+ int flush, struct block_device **bdev,
+ struct buffer_head **bh)
+{
+ int ret;
+
+ *bdev = blkdev_get_by_path(device_path, flags, holder);
+
+ if (IS_ERR(*bdev)) {
+ ret = PTR_ERR(*bdev);
+ printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
+ goto error;
+ }
+
+ if (flush)
+ filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ ret = set_blocksize(*bdev, 4096);
+ if (ret) {
+ blkdev_put(*bdev, flags);
+ goto error;
+ }
+ invalidate_bdev(*bdev);
+ *bh = btrfs_read_dev_super(*bdev);
+ if (!*bh) {
+ ret = -EINVAL;
+ blkdev_put(*bdev, flags);
+ goto error;
+ }
+
+ return 0;
+
+error:
+ *bdev = NULL;
+ *bh = NULL;
+ return ret;
+}
+
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+ struct bio *head, struct bio *tail)
+{
+
+ struct bio *old_head;
+
+ old_head = pending_bios->head;
+ pending_bios->head = head;
+ if (pending_bios->tail)
+ tail->bi_next = old_head;
+ else
+ pending_bios->tail = tail;
+}
+
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device. This greatly
+ * improves the schedulers ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block. The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested. This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+static noinline void run_scheduled_bios(struct btrfs_device *device)
+{
+ struct bio *pending;
+ struct backing_dev_info *bdi;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_pending_bios *pending_bios;
+ struct bio *tail;
+ struct bio *cur;
+ int again = 0;
+ unsigned long num_run;
+ unsigned long batch_run = 0;
+ unsigned long limit;
+ unsigned long last_waited = 0;
+ int force_reg = 0;
+ int sync_pending = 0;
+ struct blk_plug plug;
+
+ /*
+ * this function runs all the bios we've collected for
+ * a particular device. We don't want to wander off to
+ * another device without first sending all of these down.
+ * So, setup a plug here and finish it off before we return
+ */
+ blk_start_plug(&plug);
+
+ bdi = blk_get_backing_dev_info(device->bdev);
+ fs_info = device->dev_root->fs_info;
+ limit = btrfs_async_submit_limit(fs_info);
+ limit = limit * 2 / 3;
+
+loop:
+ spin_lock(&device->io_lock);
+
+loop_lock:
+ num_run = 0;
+
+ /* take all the bios off the list at once and process them
+ * later on (without the lock held). But, remember the
+ * tail and other pointers so the bios can be properly reinserted
+ * into the list if we hit congestion
+ */
+ if (!force_reg && device->pending_sync_bios.head) {
+ pending_bios = &device->pending_sync_bios;
+ force_reg = 1;
+ } else {
+ pending_bios = &device->pending_bios;
+ force_reg = 0;
+ }
+
+ pending = pending_bios->head;
+ tail = pending_bios->tail;
+ WARN_ON(pending && !tail);
+
+ /*
+ * if pending was null this time around, no bios need processing
+ * at all and we can stop. Otherwise it'll loop back up again
+ * and do an additional check so no bios are missed.
+ *
+ * device->running_pending is used to synchronize with the
+ * schedule_bio code.
+ */
+ if (device->pending_sync_bios.head == NULL &&
+ device->pending_bios.head == NULL) {
+ again = 0;
+ device->running_pending = 0;
+ } else {
+ again = 1;
+ device->running_pending = 1;
+ }
+
+ pending_bios->head = NULL;
+ pending_bios->tail = NULL;
+
+ spin_unlock(&device->io_lock);
+
+ while (pending) {
+
+ rmb();
+ /* we want to work on both lists, but do more bios on the
+ * sync list than the regular list
+ */
+ if ((num_run > 32 &&
+ pending_bios != &device->pending_sync_bios &&
+ device->pending_sync_bios.head) ||
+ (num_run > 64 && pending_bios == &device->pending_sync_bios &&
+ device->pending_bios.head)) {
+ spin_lock(&device->io_lock);
+ requeue_list(pending_bios, pending, tail);
+ goto loop_lock;
+ }
+
+ cur = pending;
+ pending = pending->bi_next;
+ cur->bi_next = NULL;
+
+ if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
+ waitqueue_active(&fs_info->async_submit_wait))
+ wake_up(&fs_info->async_submit_wait);
+
+ BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+
+ /*
+ * if we're doing the sync list, record that our
+ * plug has some sync requests on it
+ *
+ * If we're doing the regular list and there are
+ * sync requests sitting around, unplug before
+ * we add more
+ */
+ if (pending_bios == &device->pending_sync_bios) {
+ sync_pending = 1;
+ } else if (sync_pending) {
+ blk_finish_plug(&plug);
+ blk_start_plug(&plug);
+ sync_pending = 0;
+ }
+
+ btrfsic_submit_bio(cur->bi_rw, cur);
+ num_run++;
+ batch_run++;
+
+ cond_resched();
+
+ /*
+ * we made progress, there is more work to do and the bdi
+ * is now congested. Back off and let other work structs
+ * run instead
+ */
+ if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
+ fs_info->fs_devices->open_devices > 1) {
+ struct io_context *ioc;
+
+ ioc = current->io_context;
+
+ /*
+ * the main goal here is that we don't want to
+ * block if we're going to be able to submit
+ * more requests without blocking.
+ *
+ * This code does two great things, it pokes into
+ * the elevator code from a filesystem _and_
+ * it makes assumptions about how batching works.
+ */
+ if (ioc && ioc->nr_batch_requests > 0 &&
+ time_before(jiffies, ioc->last_waited + HZ/50UL) &&
+ (last_waited == 0 ||
+ ioc->last_waited == last_waited)) {
+ /*
+ * we want to go through our batch of
+ * requests and stop. So, we copy out
+ * the ioc->last_waited time and test
+ * against it before looping
+ */
+ last_waited = ioc->last_waited;
+ cond_resched();
+ continue;
+ }
+ spin_lock(&device->io_lock);
+ requeue_list(pending_bios, pending, tail);
+ device->running_pending = 1;
+
+ spin_unlock(&device->io_lock);
+ btrfs_queue_work(fs_info->submit_workers,
+ &device->work);
+ goto done;
+ }
+ /* unplug every 64 requests just for good measure */
+ if (batch_run % 64 == 0) {
+ blk_finish_plug(&plug);
+ blk_start_plug(&plug);
+ sync_pending = 0;
+ }
+ }
+
+ cond_resched();
+ if (again)
+ goto loop;
+
+ spin_lock(&device->io_lock);
+ if (device->pending_bios.head || device->pending_sync_bios.head)
+ goto loop_lock;
+ spin_unlock(&device->io_lock);
+
+done:
+ blk_finish_plug(&plug);
+}
+
+static void pending_bios_fn(struct btrfs_work *work)
+{
+ struct btrfs_device *device;
+
+ device = container_of(work, struct btrfs_device, work);
+ run_scheduled_bios(device);
+}
+
+/*
+ * Add new device to list of registered devices
+ *
+ * Returns:
+ * 1 - first time device is seen
+ * 0 - device already known
+ * < 0 - error
+ */
+static noinline int device_list_add(const char *path,
+ struct btrfs_super_block *disk_super,
+ u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices;
+ struct rcu_string *name;
+ int ret = 0;
+ u64 found_transid = btrfs_super_generation(disk_super);
+
+ fs_devices = find_fsid(disk_super->fsid);
+ if (!fs_devices) {
+ fs_devices = alloc_fs_devices(disk_super->fsid);
+ if (IS_ERR(fs_devices))
+ return PTR_ERR(fs_devices);
+
+ list_add(&fs_devices->list, &fs_uuids);
+
+ device = NULL;
+ } else {
+ device = __find_device(&fs_devices->devices, devid,
+ disk_super->dev_item.uuid);
+ }
+
+ if (!device) {
+ if (fs_devices->opened)
+ return -EBUSY;
+
+ device = btrfs_alloc_device(NULL, &devid,
+ disk_super->dev_item.uuid);
+ if (IS_ERR(device)) {
+ /* we can safely leave the fs_devices entry around */
+ return PTR_ERR(device);
+ }
+
+ name = rcu_string_strdup(path, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ return -ENOMEM;
+ }
+ rcu_assign_pointer(device->name, name);
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_add_rcu(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ ret = 1;
+ device->fs_devices = fs_devices;
+ } else if (!device->name || strcmp(device->name->str, path)) {
+ /*
+ * When FS is already mounted.
+ * 1. If you are here and if the device->name is NULL that
+ * means this device was missing at time of FS mount.
+ * 2. If you are here and if the device->name is different
+ * from 'path' that means either
+ * a. The same device disappeared and reappeared with
+ * different name. or
+ * b. The missing-disk-which-was-replaced, has
+ * reappeared now.
+ *
+ * We must allow 1 and 2a above. But 2b would be a spurious
+ * and unintentional.
+ *
+ * Further in case of 1 and 2a above, the disk at 'path'
+ * would have missed some transaction when it was away and
+ * in case of 2a the stale bdev has to be updated as well.
+ * 2b must not be allowed at all time.
+ */
+
+ /*
+ * For now, we do allow update to btrfs_fs_device through the
+ * btrfs dev scan cli after FS has been mounted. We're still
+ * tracking a problem where systems fail mount by subvolume id
+ * when we reject replacement on a mounted FS.
+ */
+ if (!fs_devices->opened && found_transid < device->generation) {
+ /*
+ * That is if the FS is _not_ mounted and if you
+ * are here, that means there is more than one
+ * disk with same uuid and devid.We keep the one
+ * with larger generation number or the last-in if
+ * generation are equal.
+ */
+ return -EEXIST;
+ }
+
+ name = rcu_string_strdup(path, GFP_NOFS);
+ if (!name)
+ return -ENOMEM;
+ rcu_string_free(device->name);
+ rcu_assign_pointer(device->name, name);
+ if (device->missing) {
+ fs_devices->missing_devices--;
+ device->missing = 0;
+ }
+ }
+
+ /*
+ * Unmount does not free the btrfs_device struct but would zero
+ * generation along with most of the other members. So just update
+ * it back. We need it to pick the disk with largest generation
+ * (as above).
+ */
+ if (!fs_devices->opened)
+ device->generation = found_transid;
+
+ *fs_devices_ret = fs_devices;
+
+ return ret;
+}
+
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+{
+ struct btrfs_fs_devices *fs_devices;
+ struct btrfs_device *device;
+ struct btrfs_device *orig_dev;
+
+ fs_devices = alloc_fs_devices(orig->fsid);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
+
+ mutex_lock(&orig->device_list_mutex);
+ fs_devices->total_devices = orig->total_devices;
+
+ /* We have held the volume lock, it is safe to get the devices. */
+ list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+ struct rcu_string *name;
+
+ device = btrfs_alloc_device(NULL, &orig_dev->devid,
+ orig_dev->uuid);
+ if (IS_ERR(device))
+ goto error;
+
+ /*
+ * This is ok to do without rcu read locked because we hold the
+ * uuid mutex so nothing we touch in here is going to disappear.
+ */
+ if (orig_dev->name) {
+ name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+ }
+
+ list_add(&device->dev_list, &fs_devices->devices);
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+ }
+ mutex_unlock(&orig->device_list_mutex);
+ return fs_devices;
+error:
+ mutex_unlock(&orig->device_list_mutex);
+ free_fs_devices(fs_devices);
+ return ERR_PTR(-ENOMEM);
+}
+
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
+{
+ struct btrfs_device *device, *next;
+ struct btrfs_device *latest_dev = NULL;
+
+ mutex_lock(&uuid_mutex);
+again:
+ /* This is the initialized path, it is safe to release the devices. */
+ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+ if (device->in_fs_metadata) {
+ if (!device->is_tgtdev_for_dev_replace &&
+ (!latest_dev ||
+ device->generation > latest_dev->generation)) {
+ latest_dev = device;
+ }
+ continue;
+ }
+
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+ /*
+ * In the first step, keep the device which has
+ * the correct fsid and the devid that is used
+ * for the dev_replace procedure.
+ * In the second step, the dev_replace state is
+ * read from the device tree and it is known
+ * whether the procedure is really active or
+ * not, which means whether this device is
+ * used or whether it should be removed.
+ */
+ if (step == 0 || device->is_tgtdev_for_dev_replace) {
+ continue;
+ }
+ }
+ if (device->bdev) {
+ blkdev_put(device->bdev, device->mode);
+ device->bdev = NULL;
+ fs_devices->open_devices--;
+ }
+ if (device->writeable) {
+ list_del_init(&device->dev_alloc_list);
+ device->writeable = 0;
+ if (!device->is_tgtdev_for_dev_replace)
+ fs_devices->rw_devices--;
+ }
+ list_del_init(&device->dev_list);
+ fs_devices->num_devices--;
+ rcu_string_free(device->name);
+ kfree(device);
+ }
+
+ if (fs_devices->seed) {
+ fs_devices = fs_devices->seed;
+ goto again;
+ }
+
+ fs_devices->latest_bdev = latest_dev->bdev;
+
+ mutex_unlock(&uuid_mutex);
+}
+
+static void __free_device(struct work_struct *work)
+{
+ struct btrfs_device *device;
+
+ device = container_of(work, struct btrfs_device, rcu_work);
+
+ if (device->bdev)
+ blkdev_put(device->bdev, device->mode);
+
+ rcu_string_free(device->name);
+ kfree(device);
+}
+
+static void free_device(struct rcu_head *head)
+{
+ struct btrfs_device *device;
+
+ device = container_of(head, struct btrfs_device, rcu);
+
+ INIT_WORK(&device->rcu_work, __free_device);
+ schedule_work(&device->rcu_work);
+}
+
+static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+
+ if (--fs_devices->opened > 0)
+ return 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ struct btrfs_device *new_device;
+ struct rcu_string *name;
+
+ if (device->bdev)
+ fs_devices->open_devices--;
+
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ if (device->missing)
+ fs_devices->missing_devices--;
+
+ new_device = btrfs_alloc_device(NULL, &device->devid,
+ device->uuid);
+ BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ if (device->name) {
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(!name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
+ }
+
+ list_replace_rcu(&device->dev_list, &new_device->dev_list);
+ new_device->fs_devices = device->fs_devices;
+
+ call_rcu(&device->rcu, free_device);
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ WARN_ON(fs_devices->open_devices);
+ WARN_ON(fs_devices->rw_devices);
+ fs_devices->opened = 0;
+ fs_devices->seeding = 0;
+
+ return 0;
+}
+
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_fs_devices *seed_devices = NULL;
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ ret = __btrfs_close_devices(fs_devices);
+ if (!fs_devices->opened) {
+ seed_devices = fs_devices->seed;
+ fs_devices->seed = NULL;
+ }
+ mutex_unlock(&uuid_mutex);
+
+ while (seed_devices) {
+ fs_devices = seed_devices;
+ seed_devices = fs_devices->seed;
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ }
+ /*
+ * Wait for rcu kworkers under __btrfs_close_devices
+ * to finish all blkdev_puts so device is really
+ * free when umount is done.
+ */
+ rcu_barrier();
+ return ret;
+}
+
+static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder)
+{
+ struct request_queue *q;
+ struct block_device *bdev;
+ struct list_head *head = &fs_devices->devices;
+ struct btrfs_device *device;
+ struct btrfs_device *latest_dev = NULL;
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+ u64 devid;
+ int seeding = 1;
+ int ret = 0;
+
+ flags |= FMODE_EXCL;
+
+ list_for_each_entry(device, head, dev_list) {
+ if (device->bdev)
+ continue;
+ if (!device->name)
+ continue;
+
+ /* Just open everything we can; ignore failures here */
+ if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ &bdev, &bh))
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ if (devid != device->devid)
+ goto error_brelse;
+
+ if (memcmp(device->uuid, disk_super->dev_item.uuid,
+ BTRFS_UUID_SIZE))
+ goto error_brelse;
+
+ device->generation = btrfs_super_generation(disk_super);
+ if (!latest_dev ||
+ device->generation > latest_dev->generation)
+ latest_dev = device;
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+ device->writeable = 0;
+ } else {
+ device->writeable = !bdev_read_only(bdev);
+ seeding = 0;
+ }
+
+ q = bdev_get_queue(bdev);
+ if (blk_queue_discard(q))
+ device->can_discard = 1;
+
+ device->bdev = bdev;
+ device->in_fs_metadata = 0;
+ device->mode = flags;
+
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ fs_devices->rotating = 1;
+
+ fs_devices->open_devices++;
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ fs_devices->rw_devices++;
+ list_add(&device->dev_alloc_list,
+ &fs_devices->alloc_list);
+ }
+ brelse(bh);
+ continue;
+
+error_brelse:
+ brelse(bh);
+ blkdev_put(bdev, flags);
+ continue;
+ }
+ if (fs_devices->open_devices == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ fs_devices->seeding = seeding;
+ fs_devices->opened = 1;
+ fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->total_rw_bytes = 0;
+out:
+ return ret;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder)
+{
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ if (fs_devices->opened) {
+ fs_devices->opened++;
+ ret = 0;
+ } else {
+ ret = __btrfs_open_devices(fs_devices, flags, holder);
+ }
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+ struct btrfs_fs_devices **fs_devices_ret)
+{
+ struct btrfs_super_block *disk_super;
+ struct block_device *bdev;
+ struct page *page;
+ void *p;
+ int ret = -EINVAL;
+ u64 devid;
+ u64 transid;
+ u64 total_devices;
+ u64 bytenr;
+ pgoff_t index;
+
+ /*
+ * we would like to check all the supers, but that would make
+ * a btrfs mount succeed after a mkfs from a different FS.
+ * So, we need to add a special mount option to scan for
+ * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ */
+ bytenr = btrfs_sb_offset(0);
+ flags |= FMODE_EXCL;
+ mutex_lock(&uuid_mutex);
+
+ bdev = blkdev_get_by_path(path, flags, holder);
+
+ if (IS_ERR(bdev)) {
+ ret = PTR_ERR(bdev);
+ goto error;
+ }
+
+ /* make sure our super fits in the device */
+ if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+ goto error_bdev_put;
+
+ /* make sure our super fits in the page */
+ if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+ goto error_bdev_put;
+
+ /* make sure our super doesn't straddle pages on disk */
+ index = bytenr >> PAGE_CACHE_SHIFT;
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+ goto error_bdev_put;
+
+ /* pull in the page with our super */
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ index, GFP_NOFS);
+
+ if (IS_ERR_OR_NULL(page))
+ goto error_bdev_put;
+
+ p = kmap(page);
+
+ /* align our pointer to the offset of the super block */
+ disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC)
+ goto error_unmap;
+
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ transid = btrfs_super_generation(disk_super);
+ total_devices = btrfs_super_num_devices(disk_super);
+
+ ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+ if (ret > 0) {
+ if (disk_super->label[0]) {
+ if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
+ printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
+ } else {
+ printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
+ }
+
+ printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
+ ret = 0;
+ }
+ if (!ret && fs_devices_ret)
+ (*fs_devices_ret)->total_devices = total_devices;
+
+error_unmap:
+ kunmap(page);
+ page_cache_release(page);
+
+error_bdev_put:
+ blkdev_put(bdev, flags);
+error:
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+/* helper to account the used device space in the range */
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length)
+{
+ struct btrfs_key key;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_path *path;
+ u64 extent_end;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+
+ *length = 0;
+
+ if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid, key.type);
+ if (ret < 0)
+ goto out;
+ }
+
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid < device->devid)
+ goto next;
+
+ if (key.objectid > device->devid)
+ break;
+
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
+ goto next;
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (key.offset <= start && extent_end > end) {
+ *length = end - start + 1;
+ break;
+ } else if (key.offset <= start && extent_end > start)
+ *length += extent_end - start;
+ else if (key.offset > start && extent_end <= end)
+ *length += extent_end - key.offset;
+ else if (key.offset > start && key.offset <= end) {
+ *length += end - key.offset + 1;
+ break;
+ } else if (key.offset > end)
+ break;
+
+next:
+ path->slots[0]++;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int contains_pending_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 *start, u64 len)
+{
+ struct extent_map *em;
+ struct list_head *search_list = &trans->transaction->pending_chunks;
+ int ret = 0;
+ u64 physical_start = *start;
+
+again:
+ list_for_each_entry(em, search_list, list) {
+ struct map_lookup *map;
+ int i;
+
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev != device)
+ continue;
+ if (map->stripes[i].physical >= physical_start + len ||
+ map->stripes[i].physical + em->orig_block_len <=
+ physical_start)
+ continue;
+ *start = map->stripes[i].physical +
+ em->orig_block_len;
+ ret = 1;
+ }
+ }
+ if (search_list == &trans->transaction->pending_chunks) {
+ search_list = &trans->root->fs_info->pinned_chunks;
+ goto again;
+ }
+
+ return ret;
+}
+
+
+/*
+ * find_free_dev_extent - find free space in the specified device
+ * @device: the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @start: store the start of the free space.
+ * @len: the size of the free space. that we find, or the size of the max
+ * free space if we don't find suitable free space
+ *
+ * this uses a pretty simple search, the expectation is that it is
+ * called very infrequently and that a given device has a small number
+ * of extents
+ *
+ * @start is used to store the start of the free space if we find. But if we
+ * don't find suitable free space, it will be used to store the start position
+ * of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
+ */
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *len)
+{
+ struct btrfs_key key;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_path *path;
+ u64 hole_size;
+ u64 max_hole_start;
+ u64 max_hole_size;
+ u64 extent_end;
+ u64 search_start;
+ u64 search_end = device->total_bytes;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+
+ /* FIXME use last free of some kind */
+
+ /* we don't want to overwrite the superblock on the drive,
+ * so we make sure to start at an offset of at least 1MB
+ */
+ search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ max_hole_start = search_start;
+ max_hole_size = 0;
+
+again:
+ if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ path->reada = 2;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ key.objectid = device->devid;
+ key.offset = search_start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid, key.type);
+ if (ret < 0)
+ goto out;
+ }
+
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid < device->devid)
+ goto next;
+
+ if (key.objectid > device->devid)
+ break;
+
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
+ goto next;
+
+ if (key.offset > search_start) {
+ hole_size = key.offset - search_start;
+
+ /*
+ * Have to check before we set max_hole_start, otherwise
+ * we could end up sending back this offset anyway.
+ */
+ if (contains_pending_extent(trans, device,
+ &search_start,
+ hole_size)) {
+ if (key.offset >= search_start) {
+ hole_size = key.offset - search_start;
+ } else {
+ WARN_ON_ONCE(1);
+ hole_size = 0;
+ }
+ }
+
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
+
+ /*
+ * If this free space is greater than which we need,
+ * it must be the max free space that we have found
+ * until now, so max_hole_start must point to the start
+ * of this free space and the length of this free space
+ * is stored in max_hole_size. Thus, we return
+ * max_hole_start and max_hole_size and go back to the
+ * caller.
+ */
+ if (hole_size >= num_bytes) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (extent_end > search_start)
+ search_start = extent_end;
+next:
+ path->slots[0]++;
+ cond_resched();
+ }
+
+ /*
+ * At this point, search_start should be the end of
+ * allocated dev extents, and when shrinking the device,
+ * search_end may be smaller than search_start.
+ */
+ if (search_end > search_start) {
+ hole_size = search_end - search_start;
+
+ if (contains_pending_extent(trans, device, &search_start,
+ hole_size)) {
+ btrfs_release_path(path);
+ goto again;
+ }
+
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
+ }
+
+ /* See above. */
+ if (max_hole_size < num_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+
+out:
+ btrfs_free_path(path);
+ *start = max_hole_start;
+ if (len)
+ *len = max_hole_size;
+ return ret;
+}
+
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 start, u64 *dev_extent_len)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf = NULL;
+ struct btrfs_dev_extent *extent = NULL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+again:
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid,
+ BTRFS_DEV_EXTENT_KEY);
+ if (ret)
+ goto out;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ BUG_ON(found_key.offset > start || found_key.offset +
+ btrfs_dev_extent_length(leaf, extent) < start);
+ key = found_key;
+ btrfs_release_path(path);
+ goto again;
+ } else if (ret == 0) {
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ } else {
+ btrfs_error(root->fs_info, ret, "Slot search failed");
+ goto out;
+ }
+
+ *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret) {
+ btrfs_error(root->fs_info, ret,
+ "Failed to remove dev extent item");
+ } else {
+ trans->transaction->have_free_bgs = 1;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 start, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ WARN_ON(!device->in_fs_metadata);
+ WARN_ON(device->is_tgtdev_for_dev_replace);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*extent));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
+ btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
+ btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+ write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+ btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
+
+ btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct rb_node *n;
+ u64 ret = 0;
+
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ n = rb_last(&em_tree->map);
+ if (n) {
+ em = rb_entry(n, struct extent_map, rb_node);
+ ret = em->start + em->len;
+ }
+ read_unlock(&em_tree->lock);
+
+ return ret;
+}
+
+static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
+ u64 *devid_ret)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ BUG_ON(ret == 0); /* Corruption */
+
+ ret = btrfs_previous_item(fs_info->chunk_root, path,
+ BTRFS_DEV_ITEMS_OBJECTID,
+ BTRFS_DEV_ITEM_KEY);
+ if (ret) {
+ *devid_ret = 1;
+ } else {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ *devid_ret = found_key.offset + 1;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * the device information is stored in the chunk root
+ * the btrfs_device struct should be fully filled in
+ */
+static int btrfs_add_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_dev_item *dev_item;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*dev_item));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+ btrfs_set_device_id(leaf, dev_item, device->devid);
+ btrfs_set_device_generation(leaf, dev_item, 0);
+ btrfs_set_device_type(leaf, dev_item, device->type);
+ btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+ btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+ btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
+ btrfs_set_device_group(leaf, dev_item, 0);
+ btrfs_set_device_seek_speed(leaf, dev_item, 0);
+ btrfs_set_device_bandwidth(leaf, dev_item, 0);
+ btrfs_set_device_start_offset(leaf, dev_item, 0);
+
+ ptr = btrfs_device_uuid(dev_item);
+ write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+ ptr = btrfs_device_fsid(dev_item);
+ write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Function to update ctime/mtime for a given device path.
+ * Mainly used for ctime/mtime based probe like libblkid.
+ */
+static void update_dev_time(char *path_name)
+{
+ struct file *filp;
+
+ filp = filp_open(path_name, O_RDWR, 0);
+ if (IS_ERR(filp))
+ return;
+ file_update_time(filp);
+ filp_close(filp, NULL);
+ return;
+}
+
+static int btrfs_rm_dev_item(struct btrfs_root *root,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_trans_handle *trans;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+out:
+ btrfs_free_path(path);
+ btrfs_commit_transaction(trans, root);
+ return ret;
+}
+
+int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+{
+ struct btrfs_device *device;
+ struct btrfs_device *next_device;
+ struct block_device *bdev;
+ struct buffer_head *bh = NULL;
+ struct btrfs_super_block *disk_super;
+ struct btrfs_fs_devices *cur_devices;
+ u64 all_avail;
+ u64 devid;
+ u64 num_devices;
+ u8 *dev_uuid;
+ unsigned seq;
+ int ret = 0;
+ bool clear_super = false;
+
+ mutex_lock(&uuid_mutex);
+
+ do {
+ seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+ all_avail = root->fs_info->avail_data_alloc_bits |
+ root->fs_info->avail_system_alloc_bits |
+ root->fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&root->fs_info->profiles_lock, seq));
+
+ num_devices = root->fs_info->fs_devices->num_devices;
+ btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+ WARN_ON(num_devices < 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
+ ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
+ goto out;
+ }
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
+ ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+ goto out;
+ }
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+ root->fs_info->fs_devices->rw_devices <= 2) {
+ ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
+ goto out;
+ }
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+ root->fs_info->fs_devices->rw_devices <= 3) {
+ ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
+ goto out;
+ }
+
+ if (strcmp(device_path, "missing") == 0) {
+ struct list_head *devices;
+ struct btrfs_device *tmp;
+
+ device = NULL;
+ devices = &root->fs_info->fs_devices->devices;
+ /*
+ * It is safe to read the devices since the volume_mutex
+ * is held.
+ */
+ list_for_each_entry(tmp, devices, dev_list) {
+ if (tmp->in_fs_metadata &&
+ !tmp->is_tgtdev_for_dev_replace &&
+ !tmp->bdev) {
+ device = tmp;
+ break;
+ }
+ }
+ bdev = NULL;
+ bh = NULL;
+ disk_super = NULL;
+ if (!device) {
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+ goto out;
+ }
+ } else {
+ ret = btrfs_get_bdev_and_sb(device_path,
+ FMODE_WRITE | FMODE_EXCL,
+ root->fs_info->bdev_holder, 0,
+ &bdev, &bh);
+ if (ret)
+ goto out;
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_uuid = disk_super->dev_item.uuid;
+ device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+ disk_super->fsid);
+ if (!device) {
+ ret = -ENOENT;
+ goto error_brelse;
+ }
+ }
+
+ if (device->is_tgtdev_for_dev_replace) {
+ ret = BTRFS_ERROR_DEV_TGT_REPLACE;
+ goto error_brelse;
+ }
+
+ if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
+ ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
+ goto error_brelse;
+ }
+
+ if (device->writeable) {
+ lock_chunks(root);
+ list_del_init(&device->dev_alloc_list);
+ device->fs_devices->rw_devices--;
+ unlock_chunks(root);
+ clear_super = true;
+ }
+
+ mutex_unlock(&uuid_mutex);
+ ret = btrfs_shrink_device(device, 0);
+ mutex_lock(&uuid_mutex);
+ if (ret)
+ goto error_undo;
+
+ /*
+ * TODO: the superblock still includes this device in its num_devices
+ * counter although write_all_supers() is not locked out. This
+ * could give a filesystem state which requires a degraded mount.
+ */
+ ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+ if (ret)
+ goto error_undo;
+
+ device->in_fs_metadata = 0;
+ btrfs_scrub_cancel_dev(root->fs_info, device);
+
+ /*
+ * the device list mutex makes sure that we don't change
+ * the device list while someone else is writing out all
+ * the device supers. Whoever is writing all supers, should
+ * lock the device list mutex before getting the number of
+ * devices in the super block (super_copy). Conversely,
+ * whoever updates the number of devices in the super block
+ * (super_copy) should hold the device list mutex.
+ */
+
+ cur_devices = device->fs_devices;
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ list_del_rcu(&device->dev_list);
+
+ device->fs_devices->num_devices--;
+ device->fs_devices->total_devices--;
+
+ if (device->missing)
+ device->fs_devices->missing_devices--;
+
+ next_device = list_entry(root->fs_info->fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ if (device->bdev == root->fs_info->sb->s_bdev)
+ root->fs_info->sb->s_bdev = next_device->bdev;
+ if (device->bdev == root->fs_info->fs_devices->latest_bdev)
+ root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+
+ if (device->bdev) {
+ device->fs_devices->open_devices--;
+ /* remove sysfs entry */
+ btrfs_kobj_rm_device(root->fs_info, device);
+ }
+
+ call_rcu(&device->rcu, free_device);
+
+ num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+ btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ if (cur_devices->open_devices == 0) {
+ struct btrfs_fs_devices *fs_devices;
+ fs_devices = root->fs_info->fs_devices;
+ while (fs_devices) {
+ if (fs_devices->seed == cur_devices) {
+ fs_devices->seed = cur_devices->seed;
+ break;
+ }
+ fs_devices = fs_devices->seed;
+ }
+ cur_devices->seed = NULL;
+ __btrfs_close_devices(cur_devices);
+ free_fs_devices(cur_devices);
+ }
+
+ root->fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+
+ /*
+ * at this point, the device is zero sized. We want to
+ * remove it from the devices list and zero out the old super
+ */
+ if (clear_super && disk_super) {
+ u64 bytenr;
+ int i;
+
+ /* make sure this device isn't detected as part of
+ * the FS anymore
+ */
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+
+ /* clear the mirror copies of super block on the disk
+ * being removed, 0th copy is been taken care above and
+ * the below would take of the rest
+ */
+ for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ i_size_read(bdev->bd_inode))
+ break;
+
+ brelse(bh);
+ bh = __bread(bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!bh)
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ continue;
+ }
+ memset(&disk_super->magic, 0,
+ sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ }
+ }
+
+ ret = 0;
+
+ if (bdev) {
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
+ }
+
+error_brelse:
+ brelse(bh);
+ if (bdev)
+ blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
+out:
+ mutex_unlock(&uuid_mutex);
+ return ret;
+error_undo:
+ if (device->writeable) {
+ lock_chunks(root);
+ list_add(&device->dev_alloc_list,
+ &root->fs_info->fs_devices->alloc_list);
+ device->fs_devices->rw_devices++;
+ unlock_chunks(root);
+ }
+ goto error_brelse;
+}
+
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+
+ /*
+ * in case of fs with no seed, srcdev->fs_devices will point
+ * to fs_devices of fs_info. However when the dev being replaced is
+ * a seed dev it will point to the seed's local fs_devices. In short
+ * srcdev will have its correct fs_devices in both the cases.
+ */
+ fs_devices = srcdev->fs_devices;
+
+ list_del_rcu(&srcdev->dev_list);
+ list_del_rcu(&srcdev->dev_alloc_list);
+ fs_devices->num_devices--;
+ if (srcdev->missing)
+ fs_devices->missing_devices--;
+
+ if (srcdev->writeable) {
+ fs_devices->rw_devices--;
+ /* zero out the old super if it is writable */
+ btrfs_scratch_superblock(srcdev);
+ }
+
+ if (srcdev->bdev)
+ fs_devices->open_devices--;
+}
+
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev)
+{
+ struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
+
+ call_rcu(&srcdev->rcu, free_device);
+
+ /*
+ * unless fs_devices is seed fs, num_devices shouldn't go
+ * zero
+ */
+ BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
+
+ /* if this is no devs we rather delete the fs_devices */
+ if (!fs_devices->num_devices) {
+ struct btrfs_fs_devices *tmp_fs_devices;
+
+ tmp_fs_devices = fs_info->fs_devices;
+ while (tmp_fs_devices) {
+ if (tmp_fs_devices->seed == fs_devices) {
+ tmp_fs_devices->seed = fs_devices->seed;
+ break;
+ }
+ tmp_fs_devices = tmp_fs_devices->seed;
+ }
+ fs_devices->seed = NULL;
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ }
+}
+
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev)
+{
+ struct btrfs_device *next_device;
+
+ mutex_lock(&uuid_mutex);
+ WARN_ON(!tgtdev);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ if (tgtdev->bdev) {
+ btrfs_scratch_superblock(tgtdev);
+ fs_info->fs_devices->open_devices--;
+ }
+ fs_info->fs_devices->num_devices--;
+
+ next_device = list_entry(fs_info->fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ if (tgtdev->bdev == fs_info->sb->s_bdev)
+ fs_info->sb->s_bdev = next_device->bdev;
+ if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
+ fs_info->fs_devices->latest_bdev = next_device->bdev;
+ list_del_rcu(&tgtdev->dev_list);
+
+ call_rcu(&tgtdev->rcu, free_device);
+
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
+}
+
+static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+ struct btrfs_device **device)
+{
+ int ret = 0;
+ struct btrfs_super_block *disk_super;
+ u64 devid;
+ u8 *dev_uuid;
+ struct block_device *bdev;
+ struct buffer_head *bh;
+
+ *device = NULL;
+ ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
+ root->fs_info->bdev_holder, 0, &bdev, &bh);
+ if (ret)
+ return ret;
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_uuid = disk_super->dev_item.uuid;
+ *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+ disk_super->fsid);
+ brelse(bh);
+ if (!*device)
+ ret = -ENOENT;
+ blkdev_put(bdev, FMODE_READ);
+ return ret;
+}
+
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+ char *device_path,
+ struct btrfs_device **device)
+{
+ *device = NULL;
+ if (strcmp(device_path, "missing") == 0) {
+ struct list_head *devices;
+ struct btrfs_device *tmp;
+
+ devices = &root->fs_info->fs_devices->devices;
+ /*
+ * It is safe to read the devices since the volume_mutex
+ * is held by the caller.
+ */
+ list_for_each_entry(tmp, devices, dev_list) {
+ if (tmp->in_fs_metadata && !tmp->bdev) {
+ *device = tmp;
+ break;
+ }
+ }
+
+ if (!*device) {
+ btrfs_err(root->fs_info, "no missing device found");
+ return -ENOENT;
+ }
+
+ return 0;
+ } else {
+ return btrfs_find_device_by_path(root, device_path, device);
+ }
+}
+
+/*
+ * does all the dirty work required for changing file system's UUID.
+ */
+static int btrfs_prepare_sprout(struct btrfs_root *root)
+{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *old_devices;
+ struct btrfs_fs_devices *seed_devices;
+ struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+ struct btrfs_device *device;
+ u64 super_flags;
+
+ BUG_ON(!mutex_is_locked(&uuid_mutex));
+ if (!fs_devices->seeding)
+ return -EINVAL;
+
+ seed_devices = __alloc_fs_devices();
+ if (IS_ERR(seed_devices))
+ return PTR_ERR(seed_devices);
+
+ old_devices = clone_fs_devices(fs_devices);
+ if (IS_ERR(old_devices)) {
+ kfree(seed_devices);
+ return PTR_ERR(old_devices);
+ }
+
+ list_add(&old_devices->list, &fs_uuids);
+
+ memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+ seed_devices->opened = 1;
+ INIT_LIST_HEAD(&seed_devices->devices);
+ INIT_LIST_HEAD(&seed_devices->alloc_list);
+ mutex_init(&seed_devices->device_list_mutex);
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
+ synchronize_rcu);
+ list_for_each_entry(device, &seed_devices->devices, dev_list)
+ device->fs_devices = seed_devices;
+
+ lock_chunks(root);
+ list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
+ unlock_chunks(root);
+
+ fs_devices->seeding = 0;
+ fs_devices->num_devices = 0;
+ fs_devices->open_devices = 0;
+ fs_devices->missing_devices = 0;
+ fs_devices->rotating = 0;
+ fs_devices->seed = seed_devices;
+
+ generate_random_uuid(fs_devices->fsid);
+ memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ super_flags = btrfs_super_flags(disk_super) &
+ ~BTRFS_SUPER_FLAG_SEEDING;
+ btrfs_set_super_flags(disk_super, super_flags);
+
+ return 0;
+}
+
+/*
+ * strore the expected generation for seed devices in device items.
+ */
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dev_item *dev_item;
+ struct btrfs_device *device;
+ struct btrfs_key key;
+ u8 fs_uuid[BTRFS_UUID_SIZE];
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+ u64 devid;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ root = root->fs_info->chunk_root;
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.offset = 0;
+ key.type = BTRFS_DEV_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto error;
+
+ leaf = path->nodes[0];
+next_slot:
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret > 0)
+ break;
+ if (ret < 0)
+ goto error;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(path);
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
+ key.type != BTRFS_DEV_ITEM_KEY)
+ break;
+
+ dev_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_item);
+ devid = btrfs_device_id(leaf, dev_item);
+ read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
+ BTRFS_UUID_SIZE);
+ read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
+ BTRFS_UUID_SIZE);
+ device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+ fs_uuid);
+ BUG_ON(!device); /* Logic error */
+
+ if (device->fs_devices->seeding) {
+ btrfs_set_device_generation(leaf, dev_item,
+ device->generation);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+
+ path->slots[0]++;
+ goto next_slot;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
+{
+ struct request_queue *q;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct list_head *devices;
+ struct super_block *sb = root->fs_info->sb;
+ struct rcu_string *name;
+ u64 tmp;
+ int seeding_dev = 0;
+ int ret = 0;
+
+ if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
+ return -EROFS;
+
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+ root->fs_info->bdev_holder);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ if (root->fs_info->fs_devices->seeding) {
+ seeding_dev = 1;
+ down_write(&sb->s_umount);
+ mutex_lock(&uuid_mutex);
+ }
+
+ filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+ devices = &root->fs_info->fs_devices->devices;
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ list_for_each_entry(device, devices, dev_list) {
+ if (device->bdev == bdev) {
+ ret = -EEXIST;
+ mutex_unlock(
+ &root->fs_info->fs_devices->device_list_mutex);
+ goto error;
+ }
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ device = btrfs_alloc_device(root->fs_info, NULL, NULL);
+ if (IS_ERR(device)) {
+ /* we can safely leave the fs_devices entry around */
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ name = rcu_string_strdup(device_path, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ rcu_string_free(device->name);
+ kfree(device);
+ ret = PTR_ERR(trans);
+ goto error;
+ }
+
+ q = bdev_get_queue(bdev);
+ if (blk_queue_discard(q))
+ device->can_discard = 1;
+ device->writeable = 1;
+ device->generation = trans->transid;
+ device->io_width = root->sectorsize;
+ device->io_align = root->sectorsize;
+ device->sector_size = root->sectorsize;
+ device->total_bytes = i_size_read(bdev->bd_inode);
+ device->disk_total_bytes = device->total_bytes;
+ device->commit_total_bytes = device->total_bytes;
+ device->dev_root = root->fs_info->dev_root;
+ device->bdev = bdev;
+ device->in_fs_metadata = 1;
+ device->is_tgtdev_for_dev_replace = 0;
+ device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
+ set_blocksize(device->bdev, 4096);
+
+ if (seeding_dev) {
+ sb->s_flags &= ~MS_RDONLY;
+ ret = btrfs_prepare_sprout(root);
+ BUG_ON(ret); /* -ENOMEM */
+ }
+
+ device->fs_devices = root->fs_info->fs_devices;
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ lock_chunks(root);
+ list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
+ list_add(&device->dev_alloc_list,
+ &root->fs_info->fs_devices->alloc_list);
+ root->fs_info->fs_devices->num_devices++;
+ root->fs_info->fs_devices->open_devices++;
+ root->fs_info->fs_devices->rw_devices++;
+ root->fs_info->fs_devices->total_devices++;
+ root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ root->fs_info->fs_devices->rotating = 1;
+
+ tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
+ btrfs_set_super_total_bytes(root->fs_info->super_copy,
+ tmp + device->total_bytes);
+
+ tmp = btrfs_super_num_devices(root->fs_info->super_copy);
+ btrfs_set_super_num_devices(root->fs_info->super_copy,
+ tmp + 1);
+
+ /* add sysfs device entry */
+ btrfs_kobj_add_device(root->fs_info, device);
+
+ /*
+ * we've got more storage, clear any full flags on the space
+ * infos
+ */
+ btrfs_clear_space_info_full(root->fs_info);
+
+ unlock_chunks(root);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ if (seeding_dev) {
+ lock_chunks(root);
+ ret = init_first_rw_device(trans, root, device);
+ unlock_chunks(root);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
+ }
+
+ ret = btrfs_add_device(trans, root, device);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
+
+ if (seeding_dev) {
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
+
+ ret = btrfs_finish_sprout(trans, root);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto error_trans;
+ }
+
+ /* Sprouting would change fsid of the mounted root,
+ * so rename the fsid on the sysfs
+ */
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
+ root->fs_info->fsid);
+ if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
+ goto error_trans;
+ }
+
+ root->fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+ ret = btrfs_commit_transaction(trans, root);
+
+ if (seeding_dev) {
+ mutex_unlock(&uuid_mutex);
+ up_write(&sb->s_umount);
+
+ if (ret) /* transaction commit */
+ return ret;
+
+ ret = btrfs_relocate_sys_chunks(root);
+ if (ret < 0)
+ btrfs_error(root->fs_info, ret,
+ "Failed to relocate sys chunks after "
+ "device initialization. This can be fixed "
+ "using the \"btrfs balance\" command.");
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) == -ENOENT)
+ return 0;
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans, root);
+ }
+
+ /* Update ctime/mtime for libblkid */
+ update_dev_time(device_path);
+ return ret;
+
+error_trans:
+ btrfs_end_transaction(trans, root);
+ rcu_string_free(device->name);
+ btrfs_kobj_rm_device(root->fs_info, device);
+ kfree(device);
+error:
+ blkdev_put(bdev, FMODE_EXCL);
+ if (seeding_dev) {
+ mutex_unlock(&uuid_mutex);
+ up_write(&sb->s_umount);
+ }
+ return ret;
+}
+
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
+ struct btrfs_device **device_out)
+{
+ struct request_queue *q;
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head *devices;
+ struct rcu_string *name;
+ u64 devid = BTRFS_DEV_REPLACE_DEVID;
+ int ret = 0;
+
+ *device_out = NULL;
+ if (fs_info->fs_devices->seeding) {
+ btrfs_err(fs_info, "the filesystem is a seed filesystem!");
+ return -EINVAL;
+ }
+
+ bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+ fs_info->bdev_holder);
+ if (IS_ERR(bdev)) {
+ btrfs_err(fs_info, "target device %s is invalid!", device_path);
+ return PTR_ERR(bdev);
+ }
+
+ filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
+ if (device->bdev == bdev) {
+ btrfs_err(fs_info, "target device is in the filesystem!");
+ ret = -EEXIST;
+ goto error;
+ }
+ }
+
+
+ if (i_size_read(bdev->bd_inode) <
+ btrfs_device_get_total_bytes(srcdev)) {
+ btrfs_err(fs_info, "target device is smaller than source device!");
+ ret = -EINVAL;
+ goto error;
+ }
+
+
+ device = btrfs_alloc_device(NULL, &devid, NULL);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ name = rcu_string_strdup(device_path, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
+
+ q = bdev_get_queue(bdev);
+ if (blk_queue_discard(q))
+ device->can_discard = 1;
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ device->writeable = 1;
+ device->generation = 0;
+ device->io_width = root->sectorsize;
+ device->io_align = root->sectorsize;
+ device->sector_size = root->sectorsize;
+ device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+ device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+ device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+ ASSERT(list_empty(&srcdev->resized_list));
+ device->commit_total_bytes = srcdev->commit_total_bytes;
+ device->commit_bytes_used = device->bytes_used;
+ device->dev_root = fs_info->dev_root;
+ device->bdev = bdev;
+ device->in_fs_metadata = 1;
+ device->is_tgtdev_for_dev_replace = 1;
+ device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
+ set_blocksize(device->bdev, 4096);
+ device->fs_devices = fs_info->fs_devices;
+ list_add(&device->dev_list, &fs_info->fs_devices->devices);
+ fs_info->fs_devices->num_devices++;
+ fs_info->fs_devices->open_devices++;
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+ *device_out = device;
+ return ret;
+
+error:
+ blkdev_put(bdev, FMODE_EXCL);
+ return ret;
+}
+
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev)
+{
+ WARN_ON(fs_info->fs_devices->rw_devices == 0);
+ tgtdev->io_width = fs_info->dev_root->sectorsize;
+ tgtdev->io_align = fs_info->dev_root->sectorsize;
+ tgtdev->sector_size = fs_info->dev_root->sectorsize;
+ tgtdev->dev_root = fs_info->dev_root;
+ tgtdev->in_fs_metadata = 1;
+}
+
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct btrfs_dev_item *dev_item;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ root = device->dev_root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+ btrfs_set_device_id(leaf, dev_item, device->devid);
+ btrfs_set_device_type(leaf, dev_item, device->type);
+ btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+ btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+ btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+ btrfs_set_device_total_bytes(leaf, dev_item,
+ btrfs_device_get_disk_total_bytes(device));
+ btrfs_set_device_bytes_used(leaf, dev_item,
+ btrfs_device_get_bytes_used(device));
+ btrfs_mark_buffer_dirty(leaf);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size)
+{
+ struct btrfs_super_block *super_copy =
+ device->dev_root->fs_info->super_copy;
+ struct btrfs_fs_devices *fs_devices;
+ u64 old_total;
+ u64 diff;
+
+ if (!device->writeable)
+ return -EACCES;
+
+ lock_chunks(device->dev_root);
+ old_total = btrfs_super_total_bytes(super_copy);
+ diff = new_size - device->total_bytes;
+
+ if (new_size <= device->total_bytes ||
+ device->is_tgtdev_for_dev_replace) {
+ unlock_chunks(device->dev_root);
+ return -EINVAL;
+ }
+
+ fs_devices = device->dev_root->fs_info->fs_devices;
+
+ btrfs_set_super_total_bytes(super_copy, old_total + diff);
+ device->fs_devices->total_rw_bytes += diff;
+
+ btrfs_device_set_total_bytes(device, new_size);
+ btrfs_device_set_disk_total_bytes(device, new_size);
+ btrfs_clear_space_info_full(device->dev_root->fs_info);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &fs_devices->resized_devices);
+ unlock_chunks(device->dev_root);
+
+ return btrfs_update_device(trans, device);
+}
+
+static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ root = root->fs_info->chunk_root;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = chunk_objectid;
+ key.offset = chunk_offset;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) { /* Logic error or corruption */
+ btrfs_error(root->fs_info, -ENOENT,
+ "Failed lookup while freeing chunk.");
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret < 0)
+ btrfs_error(root->fs_info, ret,
+ "Failed to delete chunk item.");
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+ chunk_offset)
+{
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ u8 *ptr;
+ int ret = 0;
+ u32 num_stripes;
+ u32 array_size;
+ u32 len = 0;
+ u32 cur;
+ struct btrfs_key key;
+
+ lock_chunks(root);
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ ptr = super_copy->sys_chunk_array;
+ cur = 0;
+
+ while (cur < array_size) {
+ disk_key = (struct btrfs_disk_key *)ptr;
+ btrfs_disk_key_to_cpu(&key, disk_key);
+
+ len = sizeof(*disk_key);
+
+ if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+ chunk = (struct btrfs_chunk *)(ptr + len);
+ num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+ len += btrfs_chunk_item_size(num_stripes);
+ } else {
+ ret = -EIO;
+ break;
+ }
+ if (key.objectid == chunk_objectid &&
+ key.offset == chunk_offset) {
+ memmove(ptr, ptr + len, array_size - (cur + len));
+ array_size -= len;
+ btrfs_set_super_sys_array_size(super_copy, array_size);
+ } else {
+ ptr += len;
+ cur += len;
+ }
+ }
+ unlock_chunks(root);
+ return ret;
+}
+
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct map_lookup *map;
+ u64 dev_extent_len = 0;
+ u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ int i, ret = 0;
+
+ /* Just in case */
+ root = root->fs_info->chunk_root;
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+
+ if (!em || em->start > chunk_offset ||
+ em->start + em->len < chunk_offset) {
+ /*
+ * This is a logic error, but we don't want to just rely on the
+ * user having built with ASSERT enabled, so if ASSERT doens't
+ * do anything we still error out.
+ */
+ ASSERT(0);
+ if (em)
+ free_extent_map(em);
+ return -EINVAL;
+ }
+ map = (struct map_lookup *)em->bdev;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *device = map->stripes[i].dev;
+ ret = btrfs_free_dev_extent(trans, device,
+ map->stripes[i].physical,
+ &dev_extent_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ if (device->bytes_used > 0) {
+ lock_chunks(root);
+ btrfs_device_set_bytes_used(device,
+ device->bytes_used - dev_extent_len);
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += dev_extent_len;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ btrfs_clear_space_info_full(root->fs_info);
+ unlock_chunks(root);
+ }
+
+ if (map->stripes[i].dev) {
+ ret = btrfs_update_device(trans, map->stripes[i].dev);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ }
+ }
+ ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+
+ trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto out;
+ }
+ }
+
+ ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
+out:
+ /* once for us */
+ free_extent_map(em);
+ return ret;
+}
+
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+ u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ struct btrfs_root *extent_root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ root = root->fs_info->chunk_root;
+ extent_root = root->fs_info->extent_root;
+
+ ret = btrfs_can_relocate(extent_root, chunk_offset);
+ if (ret)
+ return -ENOSPC;
+
+ /* step one, relocate all the extents inside this chunk */
+ ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_std_error(root->fs_info, ret);
+ return ret;
+ }
+
+ /*
+ * step two, delete the device extents and the
+ * chunk tree entries
+ */
+ ret = btrfs_remove_chunk(trans, root, chunk_offset);
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+{
+ struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_chunk *chunk;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 chunk_type;
+ bool retried = false;
+ int failed = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+again:
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0); /* Corruption */
+
+ ret = btrfs_previous_item(chunk_root, path, key.objectid,
+ key.type);
+ if (ret < 0)
+ goto error;
+ if (ret > 0)
+ break;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ chunk = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_chunk);
+ chunk_type = btrfs_chunk_type(leaf, chunk);
+ btrfs_release_path(path);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_relocate_chunk(chunk_root,
+ found_key.objectid,
+ found_key.offset);
+ if (ret == -ENOSPC)
+ failed++;
+ else
+ BUG_ON(ret);
+ }
+
+ if (found_key.offset == 0)
+ break;
+ key.offset = found_key.offset - 1;
+ }
+ ret = 0;
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (WARN_ON(failed && retried)) {
+ ret = -ENOSPC;
+ }
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int insert_balance_item(struct btrfs_root *root,
+ struct btrfs_balance_control *bctl)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*item));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+ btrfs_set_balance_data(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+ btrfs_set_balance_meta(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+ btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+ btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+ /*
+ * Turn on soft mode for chunk types that were being converted.
+ */
+ if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+ /*
+ * Turn on usage filter if is not already used. The idea is
+ * that chunks that we have already balanced should be
+ * reasonably full. Don't do it for chunks that are being
+ * converted - that will keep us from relocating unconverted
+ * (albeit full) chunks.
+ */
+ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->data.usage = 90;
+ }
+ if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->sys.usage = 90;
+ }
+ if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->meta.usage = 90;
+ }
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper. Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+ BUG_ON(fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ BUG_ON(!fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = NULL;
+ spin_unlock(&fs_info->balance_lock);
+
+ kfree(bctl);
+}
+
+/*
+ * Balance filters. Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_type,
+ struct btrfs_balance_args *bargs)
+{
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (bargs->profiles & chunk_type)
+ return 0;
+
+ return 1;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 chunk_used, user_thresh;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = btrfs_block_group_used(&cache->item);
+
+ if (bargs->usage == 0)
+ user_thresh = 1;
+ else if (bargs->usage > 100)
+ user_thresh = cache->key.offset;
+ else
+ user_thresh = div_factor_fine(cache->key.offset,
+ bargs->usage);
+
+ if (chunk_used < user_thresh)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ int i;
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ u64 stripe_offset;
+ u64 stripe_length;
+ int factor;
+ int i;
+
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+ return 0;
+
+ if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+ factor = num_stripes / 2;
+ } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+ factor = num_stripes - 1;
+ } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+ factor = num_stripes - 2;
+ } else {
+ factor = num_stripes;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+ continue;
+
+ stripe_offset = btrfs_stripe_offset(leaf, stripe);
+ stripe_length = btrfs_chunk_length(leaf, chunk);
+ stripe_length = div_u64(stripe_length, factor);
+
+ if (stripe_offset < bargs->pend &&
+ stripe_offset + stripe_length > bargs->pstart)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ if (chunk_offset < bargs->vend &&
+ chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+ /* at least part of the chunk is inside this vrange */
+ return 0;
+
+ return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_type,
+ struct btrfs_balance_args *bargs)
+{
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return 0;
+
+ chunk_type = chunk_to_extended(chunk_type) &
+ BTRFS_EXTENDED_PROFILE_MASK;
+
+ if (bargs->target == chunk_type)
+ return 1;
+
+ return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ struct btrfs_balance_args *bargs = NULL;
+ u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+ /* type filter */
+ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+ (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+ return 0;
+ }
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ bargs = &bctl->data;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ bargs = &bctl->sys;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ bargs = &bctl->meta;
+
+ /* profiles filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+ chunk_profiles_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ /* usage filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+ chunk_devid_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
+ /* drange filter, makes sense only with devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+ chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* vrange filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+ chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* soft profile changing mode */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+ chunk_soft_convert_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ /*
+ * limited by count, must be the last filter
+ */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
+ if (bargs->limit == 0)
+ return 0;
+ else
+ bargs->limit--;
+ }
+
+ return 1;
+}
+
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct list_head *devices;
+ struct btrfs_device *device;
+ u64 old_size;
+ u64 size_to_free;
+ struct btrfs_chunk *chunk;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ int slot;
+ int ret;
+ int enospc_errors = 0;
+ bool counting = true;
+ u64 limit_data = bctl->data.limit;
+ u64 limit_meta = bctl->meta.limit;
+ u64 limit_sys = bctl->sys.limit;
+
+ /* step one make some room on all the devices */
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
+ old_size = btrfs_device_get_total_bytes(device);
+ size_to_free = div_factor(old_size, 1);
+ size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+ if (!device->writeable ||
+ btrfs_device_get_total_bytes(device) -
+ btrfs_device_get_bytes_used(device) > size_to_free ||
+ device->is_tgtdev_for_dev_replace)
+ continue;
+
+ ret = btrfs_shrink_device(device, old_size - size_to_free);
+ if (ret == -ENOSPC)
+ break;
+ BUG_ON(ret);
+
+ trans = btrfs_start_transaction(dev_root, 0);
+ BUG_ON(IS_ERR(trans));
+
+ ret = btrfs_grow_device(trans, device, old_size);
+ BUG_ON(ret);
+
+ btrfs_end_transaction(trans, dev_root);
+ }
+
+ /* step two, relocate all the chunks */
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* zero out stat counters */
+ spin_lock(&fs_info->balance_lock);
+ memset(&bctl->stat, 0, sizeof(bctl->stat));
+ spin_unlock(&fs_info->balance_lock);
+again:
+ if (!counting) {
+ bctl->data.limit = limit_data;
+ bctl->meta.limit = limit_meta;
+ bctl->sys.limit = limit_sys;
+ }
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ while (1) {
+ if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -ECANCELED;
+ goto error;
+ }
+
+ ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * this shouldn't happen, it means the last relocate
+ * failed
+ */
+ if (ret == 0)
+ BUG(); /* FIXME break ? */
+
+ ret = btrfs_previous_item(chunk_root, path, 0,
+ BTRFS_CHUNK_ITEM_KEY);
+ if (ret) {
+ ret = 0;
+ break;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid != key.objectid)
+ break;
+
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+ if (!counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.considered++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ ret = should_balance_chunk(chunk_root, leaf, chunk,
+ found_key.offset);
+ btrfs_release_path(path);
+ if (!ret)
+ goto loop;
+
+ if (counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.expected++;
+ spin_unlock(&fs_info->balance_lock);
+ goto loop;
+ }
+
+ ret = btrfs_relocate_chunk(chunk_root,
+ found_key.objectid,
+ found_key.offset);
+ if (ret && ret != -ENOSPC)
+ goto error;
+ if (ret == -ENOSPC) {
+ enospc_errors++;
+ } else {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.completed++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+loop:
+ if (found_key.offset == 0)
+ break;
+ key.offset = found_key.offset - 1;
+ }
+
+ if (counting) {
+ btrfs_release_path(path);
+ counting = false;
+ goto again;
+ }
+error:
+ btrfs_free_path(path);
+ if (enospc_errors) {
+ btrfs_info(fs_info, "%d enospc errors during balance",
+ enospc_errors);
+ if (!ret)
+ ret = -ENOSPC;
+ }
+
+ return ret;
+}
+
+/**
+ * alloc_profile_is_valid - see if a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static int alloc_profile_is_valid(u64 flags, int extended)
+{
+ u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
+ BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ /* 1) check that all other bits are zeroed */
+ if (flags & ~mask)
+ return 0;
+
+ /* 2) see if profile is reduced */
+ if (flags == 0)
+ return !extended; /* "0" is valid for usual profiles */
+
+ /* true if exactly one bit set */
+ return (flags & (flags - 1)) == 0;
+}
+
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+ /* cancel requested || normal exit path */
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ (atomic_read(&fs_info->balance_pause_req) == 0 &&
+ atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ unset_balance_control(fs_info);
+ ret = del_balance_item(fs_info->tree_root);
+ if (ret)
+ btrfs_std_error(fs_info, ret);
+
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+}
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+ u64 allowed;
+ int mixed = 0;
+ int ret;
+ u64 num_devices;
+ unsigned seq;
+
+ if (btrfs_fs_closing(fs_info) ||
+ atomic_read(&fs_info->balance_pause_req) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+ if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
+ /*
+ * In case of mixed groups both data and meta should be picked,
+ * and identical options should be given for both of them.
+ */
+ allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
+ if (mixed && (bctl->flags & allowed)) {
+ if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+ !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+ memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+ btrfs_err(fs_info, "with mixed groups data and "
+ "metadata balance options must be the same");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ num_devices = fs_info->fs_devices->num_devices;
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+ BUG_ON(num_devices < 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ if (num_devices == 1)
+ allowed |= BTRFS_BLOCK_GROUP_DUP;
+ else if (num_devices > 1)
+ allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+ if (num_devices > 2)
+ allowed |= BTRFS_BLOCK_GROUP_RAID5;
+ if (num_devices > 3)
+ allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID6);
+ if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->data.target, 1) ||
+ (bctl->data.target & ~allowed))) {
+ btrfs_err(fs_info, "unable to start balance with target "
+ "data profile %llu",
+ bctl->data.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->meta.target, 1) ||
+ (bctl->meta.target & ~allowed))) {
+ btrfs_err(fs_info,
+ "unable to start balance with target metadata profile %llu",
+ bctl->meta.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl->sys.target, 1) ||
+ (bctl->sys.target & ~allowed))) {
+ btrfs_err(fs_info,
+ "unable to start balance with target system profile %llu",
+ bctl->sys.target);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* allow dup'ed data chunks only in mixed mode */
+ if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
+ btrfs_err(fs_info, "dup for data is not allowed");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* allow to reduce meta or sys integrity only if force set */
+ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6;
+ do {
+ seq = read_seqbegin(&fs_info->profiles_lock);
+
+ if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_system_alloc_bits & allowed) &&
+ !(bctl->sys.target & allowed)) ||
+ ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_metadata_alloc_bits & allowed) &&
+ !(bctl->meta.target & allowed))) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ btrfs_info(fs_info, "force reducing metadata integrity");
+ } else {
+ btrfs_err(fs_info, "balance will reduce metadata "
+ "integrity, use force if you want this");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
+
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ int num_tolerated_disk_barrier_failures;
+ u64 target = bctl->sys.target;
+
+ num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ if (num_tolerated_disk_barrier_failures > 0 &&
+ (target &
+ (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
+ num_tolerated_disk_barrier_failures = 0;
+ else if (num_tolerated_disk_barrier_failures > 1 &&
+ (target &
+ (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
+ num_tolerated_disk_barrier_failures = 1;
+
+ fs_info->num_tolerated_disk_barrier_failures =
+ num_tolerated_disk_barrier_failures;
+ }
+
+ ret = insert_balance_item(fs_info->tree_root, bctl);
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+ BUG_ON(ret == -EEXIST);
+ set_balance_control(bctl);
+ } else {
+ BUG_ON(ret != -EEXIST);
+ spin_lock(&fs_info->balance_lock);
+ update_balance_args(bctl);
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ atomic_inc(&fs_info->balance_running);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ ret = __btrfs_balance(fs_info);
+
+ mutex_lock(&fs_info->balance_mutex);
+ atomic_dec(&fs_info->balance_running);
+
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ }
+
+ if (bargs) {
+ memset(bargs, 0, sizeof(*bargs));
+ update_ioctl_balance_args(fs_info, 0, bargs);
+ }
+
+ if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+ balance_need_close(fs_info)) {
+ __cancel_balance(fs_info);
+ }
+
+ wake_up(&fs_info->balance_wait_q);
+
+ return ret;
+out:
+ if (bctl->flags & BTRFS_BALANCE_RESUME)
+ __cancel_balance(fs_info);
+ else {
+ kfree(bctl);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ }
+ return ret;
+}
+
+static int balance_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ int ret = 0;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl) {
+ btrfs_info(fs_info, "continuing balance");
+ ret = btrfs_balance(fs_info->balance_ctl, NULL);
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+
+ return ret;
+}
+
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *tsk;
+
+ spin_lock(&fs_info->balance_lock);
+ if (!fs_info->balance_ctl) {
+ spin_unlock(&fs_info->balance_lock);
+ return 0;
+ }
+ spin_unlock(&fs_info->balance_lock);
+
+ if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ btrfs_info(fs_info, "force skipping balance");
+ return 0;
+ }
+
+ tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
+ return PTR_ERR_OR_ZERO(tsk);
+}
+
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) { /* ret = -ENOENT; */
+ ret = 0;
+ goto out;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ bctl->fs_info = fs_info;
+ bctl->flags = btrfs_balance_flags(leaf, item);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+
+ btrfs_balance_data(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+ btrfs_balance_meta(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+ btrfs_balance_sys(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+ WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ set_balance_control(bctl);
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ if (atomic_read(&fs_info->balance_running)) {
+ atomic_inc(&fs_info->balance_pause_req);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+
+ mutex_lock(&fs_info->balance_mutex);
+ /* we are good with balance_ctl ripped off from under us */
+ BUG_ON(atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_pause_req);
+ } else {
+ ret = -ENOTCONN;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->balance_cancel_req);
+ /*
+ * if we are running just wait and return, balance item is
+ * deleted in btrfs_balance in this case
+ */
+ if (atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+ mutex_lock(&fs_info->balance_mutex);
+ } else {
+ /* __cancel_balance needs volume_mutex */
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl)
+ __cancel_balance(fs_info);
+
+ mutex_unlock(&fs_info->volume_mutex);
+ }
+
+ BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_cancel_req);
+ mutex_unlock(&fs_info->balance_mutex);
+ return 0;
+}
+
+static int btrfs_uuid_scan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_key key;
+ struct btrfs_key max_key;
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+ struct extent_buffer *eb;
+ int slot;
+ struct btrfs_root_item root_item;
+ u32 item_size;
+ struct btrfs_trans_handle *trans = NULL;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+
+ max_key.objectid = (u64)-1;
+ max_key.type = BTRFS_ROOT_ITEM_KEY;
+ max_key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_forward(root, &key, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+ if (key.type != BTRFS_ROOT_ITEM_KEY ||
+ (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
+ key.objectid != BTRFS_FS_TREE_OBJECTID) ||
+ key.objectid > BTRFS_LAST_FREE_OBJECTID)
+ goto skip;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ if (item_size < sizeof(root_item))
+ goto skip;
+
+ read_extent_buffer(eb, &root_item,
+ btrfs_item_ptr_offset(eb, slot),
+ (int)sizeof(root_item));
+ if (btrfs_root_refs(&root_item) == 0)
+ goto skip;
+
+ if (!btrfs_is_empty_uuid(root_item.uuid) ||
+ !btrfs_is_empty_uuid(root_item.received_uuid)) {
+ if (trans)
+ goto update_tree;
+
+ btrfs_release_path(path);
+ /*
+ * 1 - subvol uuid item
+ * 1 - received_subvol uuid item
+ */
+ trans = btrfs_start_transaction(fs_info->uuid_root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ continue;
+ } else {
+ goto skip;
+ }
+update_tree:
+ if (!btrfs_is_empty_uuid(root_item.uuid)) {
+ ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+ root_item.uuid,
+ BTRFS_UUID_KEY_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+ if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+ root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+skip:
+ if (trans) {
+ ret = btrfs_end_transaction(trans, fs_info->uuid_root);
+ trans = NULL;
+ if (ret)
+ break;
+ }
+
+ btrfs_release_path(path);
+ if (key.offset < (u64)-1) {
+ key.offset++;
+ } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ } else if (key.objectid < (u64)-1) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.objectid++;
+ } else {
+ break;
+ }
+ cond_resched();
+ }
+
+out:
+ btrfs_free_path(path);
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans, fs_info->uuid_root);
+ if (ret)
+ btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
+ else
+ fs_info->update_uuid_tree_gen = 1;
+ up(&fs_info->uuid_tree_rescan_sem);
+ return 0;
+}
+
+/*
+ * Callback for btrfs_uuid_tree_iterate().
+ * returns:
+ * 0 check succeeded, the entry is not outdated.
+ * < 0 if an error occured.
+ * > 0 if the check failed, which means the caller shall remove the entry.
+ */
+static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
+ u8 *uuid, u8 type, u64 subid)
+{
+ struct btrfs_key key;
+ int ret = 0;
+ struct btrfs_root *subvol_root;
+
+ if (type != BTRFS_UUID_KEY_SUBVOL &&
+ type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+ goto out;
+
+ key.objectid = subid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(subvol_root)) {
+ ret = PTR_ERR(subvol_root);
+ if (ret == -ENOENT)
+ ret = 1;
+ goto out;
+ }
+
+ switch (type) {
+ case BTRFS_UUID_KEY_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.received_uuid,
+ BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ }
+
+out:
+ return ret;
+}
+
+static int btrfs_uuid_rescan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+ int ret;
+
+ /*
+ * 1st step is to iterate through the existing UUID tree and
+ * to delete all entries that contain outdated data.
+ * 2nd step is to add all missing entries to the UUID tree.
+ */
+ ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
+ up(&fs_info->uuid_tree_rescan_sem);
+ return ret;
+ }
+ return btrfs_uuid_scan_kthread(data);
+}
+
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *uuid_root;
+ struct task_struct *task;
+ int ret;
+
+ /*
+ * 1 - root node
+ * 1 - root item
+ */
+ trans = btrfs_start_transaction(tree_root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ uuid_root = btrfs_create_tree(trans, fs_info,
+ BTRFS_UUID_TREE_OBJECTID);
+ if (IS_ERR(uuid_root)) {
+ btrfs_abort_transaction(trans, tree_root,
+ PTR_ERR(uuid_root));
+ return PTR_ERR(uuid_root);
+ }
+
+ fs_info->uuid_root = uuid_root;
+
+ ret = btrfs_commit_transaction(trans, tree_root);
+ if (ret)
+ return ret;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+ /* fs_info->update_uuid_tree_gen remains 0 in all error case */
+ btrfs_warn(fs_info, "failed to start uuid_scan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
+ }
+
+ return 0;
+}
+
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+ /* fs_info->update_uuid_tree_gen remains 0 in all error case */
+ btrfs_warn(fs_info, "failed to start uuid_rescan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
+ }
+
+ return 0;
+}
+
+/*
+ * shrinking a device means finding all of the device extents past
+ * the new size, and then following the back refs to the chunks.
+ * The chunk relocation code actually frees the device extent
+ */
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_path *path;
+ u64 length;
+ u64 chunk_objectid;
+ u64 chunk_offset;
+ int ret;
+ int slot;
+ int failed = 0;
+ bool retried = false;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+ u64 old_total = btrfs_super_total_bytes(super_copy);
+ u64 old_size = btrfs_device_get_total_bytes(device);
+ u64 diff = old_size - new_size;
+
+ if (device->is_tgtdev_for_dev_replace)
+ return -EINVAL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 2;
+
+ lock_chunks(root);
+
+ btrfs_device_set_total_bytes(device, new_size);
+ if (device->writeable) {
+ device->fs_devices->total_rw_bytes -= diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space -= diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
+ unlock_chunks(root);
+
+again:
+ key.objectid = device->devid;
+ key.offset = (u64)-1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ do {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto done;
+
+ ret = btrfs_previous_item(root, path, 0, key.type);
+ if (ret < 0)
+ goto done;
+ if (ret) {
+ ret = 0;
+ btrfs_release_path(path);
+ break;
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
+ if (key.objectid != device->devid) {
+ btrfs_release_path(path);
+ break;
+ }
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ length = btrfs_dev_extent_length(l, dev_extent);
+
+ if (key.offset + length <= new_size) {
+ btrfs_release_path(path);
+ break;
+ }
+
+ chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+ chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+ btrfs_release_path(path);
+
+ ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+ if (ret && ret != -ENOSPC)
+ goto done;
+ if (ret == -ENOSPC)
+ failed++;
+ } while (key.offset-- > 0);
+
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ ret = -ENOSPC;
+ lock_chunks(root);
+
+ btrfs_device_set_total_bytes(device, old_size);
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ unlock_chunks(root);
+ goto done;
+ }
+
+ /* Shrinking succeeded, else we would be at "done". */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto done;
+ }
+
+ lock_chunks(root);
+ btrfs_device_set_disk_total_bytes(device, new_size);
+ if (list_empty(&device->resized_list))
+ list_add_tail(&device->resized_list,
+ &root->fs_info->fs_devices->resized_devices);
+
+ WARN_ON(diff > old_total);
+ btrfs_set_super_total_bytes(super_copy, old_total - diff);
+ unlock_chunks(root);
+
+ /* Now btrfs_update_device() will change the on-disk size. */
+ ret = btrfs_update_device(trans, device);
+ btrfs_end_transaction(trans, root);
+done:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_add_system_chunk(struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_chunk *chunk, int item_size)
+{
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+ struct btrfs_disk_key disk_key;
+ u32 array_size;
+ u8 *ptr;
+
+ lock_chunks(root);
+ array_size = btrfs_super_sys_array_size(super_copy);
+ if (array_size + item_size + sizeof(disk_key)
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ unlock_chunks(root);
+ return -EFBIG;
+ }
+
+ ptr = super_copy->sys_chunk_array + array_size;
+ btrfs_cpu_key_to_disk(&disk_key, key);
+ memcpy(ptr, &disk_key, sizeof(disk_key));
+ ptr += sizeof(disk_key);
+ memcpy(ptr, chunk, item_size);
+ item_size += sizeof(disk_key);
+ btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+ unlock_chunks(root);
+
+ return 0;
+}
+
+/*
+ * sort the devices in descending order by max_avail, total_avail
+ */
+static int btrfs_cmp_device_info(const void *a, const void *b)
+{
+ const struct btrfs_device_info *di_a = a;
+ const struct btrfs_device_info *di_b = b;
+
+ if (di_a->max_avail > di_b->max_avail)
+ return -1;
+ if (di_a->max_avail < di_b->max_avail)
+ return 1;
+ if (di_a->total_avail > di_b->total_avail)
+ return -1;
+ if (di_a->total_avail < di_b->total_avail)
+ return 1;
+ return 0;
+}
+
+static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+ .dev_stripes = 1,
+ .devs_max = 0, /* 0 == as many as possible */
+ .devs_min = 4,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID1] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 2,
+ .devs_min = 2,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_DUP] = {
+ .sub_stripes = 1,
+ .dev_stripes = 2,
+ .devs_max = 1,
+ .devs_min = 1,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID0] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_SINGLE] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 1,
+ .devs_min = 1,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_RAID5] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID6] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .devs_increment = 1,
+ .ncopies = 3,
+ },
+};
+
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+ /* TODO allow them to set a preferred stripe size */
+ return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ return;
+
+ btrfs_set_fs_incompat(info, RAID56);
+}
+
+#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \
+ - sizeof(struct btrfs_item) \
+ - sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
+#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
+ - 2 * sizeof(struct btrfs_disk_key) \
+ - 2 * sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 start,
+ u64 type)
+{
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct list_head *cur;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_device_info *devices_info = NULL;
+ u64 total_avail;
+ int num_stripes; /* total number of stripes to allocate */
+ int data_stripes; /* number of stripes that count for
+ block group size */
+ int sub_stripes; /* sub_stripes info for map */
+ int dev_stripes; /* stripes per dev */
+ int devs_max; /* max devs to use */
+ int devs_min; /* min devs needed */
+ int devs_increment; /* ndevs has to be a multiple of this */
+ int ncopies; /* how many copies to data has */
+ int ret;
+ u64 max_stripe_size;
+ u64 max_chunk_size;
+ u64 stripe_size;
+ u64 num_bytes;
+ u64 raid_stripe_len = BTRFS_STRIPE_LEN;
+ int ndevs;
+ int i;
+ int j;
+ int index;
+
+ BUG_ON(!alloc_profile_is_valid(type, 0));
+
+ if (list_empty(&fs_devices->alloc_list))
+ return -ENOSPC;
+
+ index = __get_raid_index(type);
+
+ sub_stripes = btrfs_raid_array[index].sub_stripes;
+ dev_stripes = btrfs_raid_array[index].dev_stripes;
+ devs_max = btrfs_raid_array[index].devs_max;
+ devs_min = btrfs_raid_array[index].devs_min;
+ devs_increment = btrfs_raid_array[index].devs_increment;
+ ncopies = btrfs_raid_array[index].ncopies;
+
+ if (type & BTRFS_BLOCK_GROUP_DATA) {
+ max_stripe_size = 1024 * 1024 * 1024;
+ max_chunk_size = 10 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+ } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+ /* for larger filesystems, use larger metadata chunks */
+ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+ max_stripe_size = 1024 * 1024 * 1024;
+ else
+ max_stripe_size = 256 * 1024 * 1024;
+ max_chunk_size = max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+ } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ max_stripe_size = 32 * 1024 * 1024;
+ max_chunk_size = 2 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
+ } else {
+ btrfs_err(info, "invalid chunk type 0x%llx requested",
+ type);
+ BUG_ON(1);
+ }
+
+ /* we don't want a chunk larger than 10% of writeable space */
+ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ max_chunk_size);
+
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+ cur = fs_devices->alloc_list.next;
+
+ /*
+ * in the first pass through the devices list, we gather information
+ * about the available holes on each device.
+ */
+ ndevs = 0;
+ while (cur != &fs_devices->alloc_list) {
+ struct btrfs_device *device;
+ u64 max_avail;
+ u64 dev_offset;
+
+ device = list_entry(cur, struct btrfs_device, dev_alloc_list);
+
+ cur = cur->next;
+
+ if (!device->writeable) {
+ WARN(1, KERN_ERR
+ "BTRFS: read-only device in alloc_list\n");
+ continue;
+ }
+
+ if (!device->in_fs_metadata ||
+ device->is_tgtdev_for_dev_replace)
+ continue;
+
+ if (device->total_bytes > device->bytes_used)
+ total_avail = device->total_bytes - device->bytes_used;
+ else
+ total_avail = 0;
+
+ /* If there is no space on this device, skip it. */
+ if (total_avail == 0)
+ continue;
+
+ ret = find_free_dev_extent(trans, device,
+ max_stripe_size * dev_stripes,
+ &dev_offset, &max_avail);
+ if (ret && ret != -ENOSPC)
+ goto error;
+
+ if (ret == 0)
+ max_avail = max_stripe_size * dev_stripes;
+
+ if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
+ continue;
+
+ if (ndevs == fs_devices->rw_devices) {
+ WARN(1, "%s: found more than %llu devices\n",
+ __func__, fs_devices->rw_devices);
+ break;
+ }
+ devices_info[ndevs].dev_offset = dev_offset;
+ devices_info[ndevs].max_avail = max_avail;
+ devices_info[ndevs].total_avail = total_avail;
+ devices_info[ndevs].dev = device;
+ ++ndevs;
+ }
+
+ /*
+ * now sort the devices by hole size / available space
+ */
+ sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_info, NULL);
+
+ /* round down to number of usable stripes */
+ ndevs -= ndevs % devs_increment;
+
+ if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
+ ret = -ENOSPC;
+ goto error;
+ }
+
+ if (devs_max && ndevs > devs_max)
+ ndevs = devs_max;
+ /*
+ * the primary goal is to maximize the number of stripes, so use as many
+ * devices as possible, even if the stripes are not maximum sized.
+ */
+ stripe_size = devices_info[ndevs-1].max_avail;
+ num_stripes = ndevs * dev_stripes;
+
+ /*
+ * this will have to be fixed for RAID1 and RAID10 over
+ * more drives
+ */
+ data_stripes = num_stripes / ncopies;
+
+ if (type & BTRFS_BLOCK_GROUP_RAID5) {
+ raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+ btrfs_super_stripesize(info->super_copy));
+ data_stripes = num_stripes - 1;
+ }
+ if (type & BTRFS_BLOCK_GROUP_RAID6) {
+ raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+ btrfs_super_stripesize(info->super_copy));
+ data_stripes = num_stripes - 2;
+ }
+
+ /*
+ * Use the number of data stripes to figure out how big this chunk
+ * is really going to be in terms of logical address space,
+ * and compare that answer with the max chunk size
+ */
+ if (stripe_size * data_stripes > max_chunk_size) {
+ u64 mask = (1ULL << 24) - 1;
+
+ stripe_size = div_u64(max_chunk_size, data_stripes);
+
+ /* bump the answer up to a 16MB boundary */
+ stripe_size = (stripe_size + mask) & ~mask;
+
+ /* but don't go higher than the limits we found
+ * while searching for free extents
+ */
+ if (stripe_size > devices_info[ndevs-1].max_avail)
+ stripe_size = devices_info[ndevs-1].max_avail;
+ }
+
+ stripe_size = div_u64(stripe_size, dev_stripes);
+
+ /* align to BTRFS_STRIPE_LEN */
+ stripe_size = div_u64(stripe_size, raid_stripe_len);
+ stripe_size *= raid_stripe_len;
+
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ map->num_stripes = num_stripes;
+
+ for (i = 0; i < ndevs; ++i) {
+ for (j = 0; j < dev_stripes; ++j) {
+ int s = i * dev_stripes + j;
+ map->stripes[s].dev = devices_info[i].dev;
+ map->stripes[s].physical = devices_info[i].dev_offset +
+ j * stripe_size;
+ }
+ }
+ map->sector_size = extent_root->sectorsize;
+ map->stripe_len = raid_stripe_len;
+ map->io_align = raid_stripe_len;
+ map->io_width = raid_stripe_len;
+ map->type = type;
+ map->sub_stripes = sub_stripes;
+
+ num_bytes = stripe_size * data_stripes;
+
+ trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
+
+ em = alloc_extent_map();
+ if (!em) {
+ kfree(map);
+ ret = -ENOMEM;
+ goto error;
+ }
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+ em->bdev = (struct block_device *)map;
+ em->start = start;
+ em->len = num_bytes;
+ em->block_start = 0;
+ em->block_len = em->len;
+ em->orig_block_len = stripe_size;
+
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ if (!ret) {
+ list_add_tail(&em->list, &trans->transaction->pending_chunks);
+ atomic_inc(&em->refs);
+ }
+ write_unlock(&em_tree->lock);
+ if (ret) {
+ free_extent_map(em);
+ goto error;
+ }
+
+ ret = btrfs_make_block_group(trans, extent_root, 0, type,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ start, num_bytes);
+ if (ret)
+ goto error_del_extent;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
+ btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
+ }
+
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+ extent_root->fs_info->free_chunk_space -= (stripe_size *
+ map->num_stripes);
+ spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
+ free_extent_map(em);
+ check_raid56_incompat_flag(extent_root->fs_info, type);
+
+ kfree(devices_info);
+ return 0;
+
+error_del_extent:
+ write_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ /* One for our allocation */
+ free_extent_map(em);
+ /* One for the tree reference */
+ free_extent_map(em);
+ /* One for the pending_chunks list reference */
+ free_extent_map(em);
+error:
+ kfree(devices_info);
+ return ret;
+}
+
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ u64 chunk_offset, u64 chunk_size)
+{
+ struct btrfs_key key;
+ struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+ struct btrfs_device *device;
+ struct btrfs_chunk *chunk;
+ struct btrfs_stripe *stripe;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ size_t item_size;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i = 0;
+ int ret;
+
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(extent_root->fs_info, "unable to find logical "
+ "%Lu len %Lu", chunk_offset, chunk_size);
+ return -EINVAL;
+ }
+
+ if (em->start != chunk_offset || em->len != chunk_size) {
+ btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+ " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
+ chunk_size, em->start, em->len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
+
+ map = (struct map_lookup *)em->bdev;
+ item_size = btrfs_chunk_item_size(map->num_stripes);
+ stripe_size = em->orig_block_len;
+
+ chunk = kzalloc(item_size, GFP_NOFS);
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+ ret = btrfs_update_device(trans, device);
+ if (ret)
+ goto out;
+ ret = btrfs_alloc_dev_extent(trans, device,
+ chunk_root->root_key.objectid,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ goto out;
+ }
+
+ stripe = &chunk->stripe;
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+ btrfs_set_stack_stripe_devid(stripe, device->devid);
+ btrfs_set_stack_stripe_offset(stripe, dev_offset);
+ memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
+ stripe++;
+ }
+
+ btrfs_set_stack_chunk_length(chunk, chunk_size);
+ btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+ btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_type(chunk, map->type);
+ btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+ btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+ btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_offset;
+
+ ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+ if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ /*
+ * TODO: Cleanup of inserted chunk root in case of
+ * failure.
+ */
+ ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
+ item_size);
+ }
+
+out:
+ kfree(chunk);
+ free_extent_map(em);
+ return ret;
+}
+
+/*
+ * Chunk allocation falls into two parts. The first part does works
+ * that make the new allocated chunk useable, but not do any operation
+ * that modifies the chunk tree. The second part does the works that
+ * require modifying the chunk tree. This division is important for the
+ * bootstrap process of adding storage to a seed btrfs.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 type)
+{
+ u64 chunk_offset;
+
+ ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex));
+ chunk_offset = find_next_chunk(extent_root->fs_info);
+ return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
+}
+
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device)
+{
+ u64 chunk_offset;
+ u64 sys_chunk_offset;
+ u64 alloc_profile;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ int ret;
+
+ chunk_offset = find_next_chunk(fs_info);
+ alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
+ ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+ alloc_profile);
+ if (ret)
+ return ret;
+
+ sys_chunk_offset = find_next_chunk(root->fs_info);
+ alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
+ ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+ alloc_profile);
+ return ret;
+}
+
+static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+{
+ int max_errors;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ max_errors = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+ max_errors = 2;
+ } else {
+ max_errors = 0;
+ }
+
+ return max_errors;
+}
+
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ int readonly = 0;
+ int miss_ndevs = 0;
+ int i;
+
+ read_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+ read_unlock(&map_tree->map_tree.lock);
+ if (!em)
+ return 1;
+
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev->missing) {
+ miss_ndevs++;
+ continue;
+ }
+
+ if (!map->stripes[i].dev->writeable) {
+ readonly = 1;
+ goto end;
+ }
+ }
+
+ /*
+ * If the number of missing devices is larger than max errors,
+ * we can not write the data into that chunk successfully, so
+ * set it readonly.
+ */
+ if (miss_ndevs > btrfs_chunk_max_errors(map))
+ readonly = 1;
+end:
+ free_extent_map(em);
+ return readonly;
+}
+
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
+{
+ extent_map_tree_init(&tree->map_tree);
+}
+
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+{
+ struct extent_map *em;
+
+ while (1) {
+ write_lock(&tree->map_tree.lock);
+ em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
+ if (em)
+ remove_extent_mapping(&tree->map_tree, em);
+ write_unlock(&tree->map_tree.lock);
+ if (!em)
+ break;
+ /* once for us */
+ free_extent_map(em);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+}
+
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
+{
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ int ret;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+
+ /*
+ * We could return errors for these cases, but that could get ugly and
+ * we'd probably do the same thing which is just not do anything else
+ * and exit, so return 1 so the callers don't try to use other copies.
+ */
+ if (!em) {
+ btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
+ logical+len);
+ return 1;
+ }
+
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
+ "%Lu-%Lu", logical, logical+len, em->start,
+ em->start + em->len);
+ free_extent_map(em);
+ return 1;
+ }
+
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+ ret = map->num_stripes;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ ret = map->sub_stripes;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ ret = 2;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ ret = 3;
+ else
+ ret = 1;
+ free_extent_map(em);
+
+ btrfs_dev_replace_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+ ret++;
+ btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+ return ret;
+}
+
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+ struct btrfs_mapping_tree *map_tree,
+ u64 logical)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ unsigned long len = root->sectorsize;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ len = map->stripe_len * nr_data_stripes(map);
+ free_extent_map(em);
+ return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+ u64 logical, u64 len, int mirror_num)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ int ret = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ read_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ ret = 1;
+ free_extent_map(em);
+ return ret;
+}
+
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+ struct map_lookup *map, int first, int num,
+ int optimal, int dev_replace_is_ongoing)
+{
+ int i;
+ int tolerance;
+ struct btrfs_device *srcdev;
+
+ if (dev_replace_is_ongoing &&
+ fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+ srcdev = fs_info->dev_replace.srcdev;
+ else
+ srcdev = NULL;
+
+ /*
+ * try to avoid the drive that is the source drive for a
+ * dev-replace procedure, only choose it if no other non-missing
+ * mirror is available
+ */
+ for (tolerance = 0; tolerance < 2; tolerance++) {
+ if (map->stripes[optimal].dev->bdev &&
+ (tolerance || map->stripes[optimal].dev != srcdev))
+ return optimal;
+ for (i = first; i < first + num; i++) {
+ if (map->stripes[i].dev->bdev &&
+ (tolerance || map->stripes[i].dev != srcdev))
+ return i;
+ }
+ }
+
+ /* we couldn't find one that doesn't fail. Just return something
+ * and the io error handling code will clean up eventually
+ */
+ return optimal;
+}
+
+static inline int parity_smaller(u64 a, u64 b)
+{
+ return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+{
+ struct btrfs_bio_stripe s;
+ int i;
+ u64 l;
+ int again = 1;
+
+ while (again) {
+ again = 0;
+ for (i = 0; i < num_stripes - 1; i++) {
+ if (parity_smaller(bbio->raid_map[i],
+ bbio->raid_map[i+1])) {
+ s = bbio->stripes[i];
+ l = bbio->raid_map[i];
+ bbio->stripes[i] = bbio->stripes[i+1];
+ bbio->raid_map[i] = bbio->raid_map[i+1];
+ bbio->stripes[i+1] = s;
+ bbio->raid_map[i+1] = l;
+
+ again = 1;
+ }
+ }
+ }
+}
+
+static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+{
+ struct btrfs_bio *bbio = kzalloc(
+ /* the size of the btrfs_bio */
+ sizeof(struct btrfs_bio) +
+ /* plus the variable array for the stripes */
+ sizeof(struct btrfs_bio_stripe) * (total_stripes) +
+ /* plus the variable array for the tgt dev */
+ sizeof(int) * (real_stripes) +
+ /*
+ * plus the raid_map, which includes both the tgt dev
+ * and the stripes
+ */
+ sizeof(u64) * (total_stripes),
+ GFP_NOFS);
+ if (!bbio)
+ return NULL;
+
+ atomic_set(&bbio->error, 0);
+ atomic_set(&bbio->refs, 1);
+
+ return bbio;
+}
+
+void btrfs_get_bbio(struct btrfs_bio *bbio)
+{
+ WARN_ON(!atomic_read(&bbio->refs));
+ atomic_inc(&bbio->refs);
+}
+
+void btrfs_put_bbio(struct btrfs_bio *bbio)
+{
+ if (!bbio)
+ return;
+ if (atomic_dec_and_test(&bbio->refs))
+ kfree(bbio);
+}
+
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret,
+ int mirror_num, int need_raid_map)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ u64 offset;
+ u64 stripe_offset;
+ u64 stripe_end_offset;
+ u64 stripe_nr;
+ u64 stripe_nr_orig;
+ u64 stripe_nr_end;
+ u64 stripe_len;
+ u32 stripe_index;
+ int i;
+ int ret = 0;
+ int num_stripes;
+ int max_errors = 0;
+ int tgtdev_indexes = 0;
+ struct btrfs_bio *bbio = NULL;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int dev_replace_is_ongoing = 0;
+ int num_alloc_stripes;
+ int patch_the_first_stripe_for_dev_replace = 0;
+ u64 physical_to_patch_in_first_stripe = 0;
+ u64 raid56_full_stripe_start = (u64)-1;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, *length);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(fs_info, "unable to find logical %llu len %llu",
+ logical, *length);
+ return -EINVAL;
+ }
+
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
+ "found %Lu-%Lu", logical, em->start,
+ em->start + em->len);
+ free_extent_map(em);
+ return -EINVAL;
+ }
+
+ map = (struct map_lookup *)em->bdev;
+ offset = logical - em->start;
+
+ stripe_len = map->stripe_len;
+ stripe_nr = offset;
+ /*
+ * stripe_nr counts the total number of stripes we have to stride
+ * to get to this block
+ */
+ stripe_nr = div64_u64(stripe_nr, stripe_len);
+
+ stripe_offset = stripe_nr * stripe_len;
+ BUG_ON(offset < stripe_offset);
+
+ /* stripe_offset is the offset of this block in its stripe*/
+ stripe_offset = offset - stripe_offset;
+
+ /* if we're here for raid56, we need to know the stripe aligned start */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+ raid56_full_stripe_start = offset;
+
+ /* allow a write of a full stripe, but make sure we don't
+ * allow straddling of stripes
+ */
+ raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+ full_stripe_len);
+ raid56_full_stripe_start *= full_stripe_len;
+ }
+
+ if (rw & REQ_DISCARD) {
+ /* we don't discard raid56 yet */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ *length = min_t(u64, em->len - offset, *length);
+ } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ u64 max_len;
+ /* For writes to RAID[56], allow a full stripeset across all disks.
+ For other RAID types and for RAID[56] reads, just allow a single
+ stripe (on a single disk). */
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+ (rw & REQ_WRITE)) {
+ max_len = stripe_len * nr_data_stripes(map) -
+ (offset - raid56_full_stripe_start);
+ } else {
+ /* we limit the length of each bio to what fits in a stripe */
+ max_len = stripe_len - stripe_offset;
+ }
+ *length = min_t(u64, em->len - offset, max_len);
+ } else {
+ *length = em->len - offset;
+ }
+
+ /* This is for when we're called from btrfs_merge_bio_hook() and all
+ it cares about is the length */
+ if (!bbio_ret)
+ goto out;
+
+ btrfs_dev_replace_lock(dev_replace);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (!dev_replace_is_ongoing)
+ btrfs_dev_replace_unlock(dev_replace);
+
+ if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+ !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
+ dev_replace->tgtdev != NULL) {
+ /*
+ * in dev-replace case, for repair case (that's the only
+ * case where the mirror is selected explicitly when
+ * calling btrfs_map_block), blocks left of the left cursor
+ * can also be read from the target drive.
+ * For REQ_GET_READ_MIRRORS, the target drive is added as
+ * the last one to the array of stripes. For READ, it also
+ * needs to be supported using the same mirror number.
+ * If the requested block is not left of the left cursor,
+ * EIO is returned. This can happen because btrfs_num_copies()
+ * returns one more in the dev-replace case.
+ */
+ u64 tmp_length = *length;
+ struct btrfs_bio *tmp_bbio = NULL;
+ int tmp_num_stripes;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+ logical, &tmp_length, &tmp_bbio, 0, 0);
+ if (ret) {
+ WARN_ON(tmp_bbio != NULL);
+ goto out;
+ }
+
+ tmp_num_stripes = tmp_bbio->num_stripes;
+ if (mirror_num > tmp_num_stripes) {
+ /*
+ * REQ_GET_READ_MIRRORS does not contain this
+ * mirror, that means that the requested area
+ * is not left of the left cursor
+ */
+ ret = -EIO;
+ btrfs_put_bbio(tmp_bbio);
+ goto out;
+ }
+
+ /*
+ * process the rest of the function using the mirror_num
+ * of the source drive. Therefore look it up first.
+ * At the end, patch the device pointer to the one of the
+ * target drive.
+ */
+ for (i = 0; i < tmp_num_stripes; i++) {
+ if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it
+ * simple, only add the mirror with the
+ * lowest physical address
+ */
+ if (found &&
+ physical_of_found <=
+ tmp_bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found =
+ tmp_bbio->stripes[i].physical;
+ }
+ }
+
+ if (found) {
+ mirror_num = index_srcdev + 1;
+ patch_the_first_stripe_for_dev_replace = 1;
+ physical_to_patch_in_first_stripe = physical_of_found;
+ } else {
+ WARN_ON(1);
+ ret = -EIO;
+ btrfs_put_bbio(tmp_bbio);
+ goto out;
+ }
+
+ btrfs_put_bbio(tmp_bbio);
+ } else if (mirror_num > map->num_stripes) {
+ mirror_num = 0;
+ }
+
+ num_stripes = 1;
+ stripe_index = 0;
+ stripe_nr_orig = stripe_nr;
+ stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
+ stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
+ stripe_end_offset = stripe_nr_end * map->stripe_len -
+ (offset + *length);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ if (rw & REQ_DISCARD)
+ num_stripes = min_t(u64, map->num_stripes,
+ stripe_nr_end - stripe_nr_orig);
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
+ if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
+ mirror_num = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
+ num_stripes = map->num_stripes;
+ else if (mirror_num)
+ stripe_index = mirror_num - 1;
+ else {
+ stripe_index = find_live_mirror(fs_info, map, 0,
+ map->num_stripes,
+ current->pid % map->num_stripes,
+ dev_replace_is_ongoing);
+ mirror_num = stripe_index + 1;
+ }
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+ if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
+ num_stripes = map->num_stripes;
+ } else if (mirror_num) {
+ stripe_index = mirror_num - 1;
+ } else {
+ mirror_num = 1;
+ }
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ u32 factor = map->num_stripes / map->sub_stripes;
+
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ stripe_index *= map->sub_stripes;
+
+ if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
+ num_stripes = map->sub_stripes;
+ else if (rw & REQ_DISCARD)
+ num_stripes = min_t(u64, map->sub_stripes *
+ (stripe_nr_end - stripe_nr_orig),
+ map->num_stripes);
+ else if (mirror_num)
+ stripe_index += mirror_num - 1;
+ else {
+ int old_stripe_index = stripe_index;
+ stripe_index = find_live_mirror(fs_info, map,
+ stripe_index,
+ map->sub_stripes, stripe_index +
+ current->pid % map->sub_stripes,
+ dev_replace_is_ongoing);
+ mirror_num = stripe_index - old_stripe_index + 1;
+ }
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ if (need_raid_map &&
+ ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+ mirror_num > 1)) {
+ /* push stripe_nr back to the start of the full stripe */
+ stripe_nr = div_u64(raid56_full_stripe_start,
+ stripe_len * nr_data_stripes(map));
+
+ /* RAID[56] write or recovery. Return all stripes */
+ num_stripes = map->num_stripes;
+ max_errors = nr_parity_stripes(map);
+
+ *length = map->stripe_len;
+ stripe_index = 0;
+ stripe_offset = 0;
+ } else {
+ /*
+ * Mirror #0 or #1 means the original data block.
+ * Mirror #2 is RAID5 parity block.
+ * Mirror #3 is RAID6 Q block.
+ */
+ stripe_nr = div_u64_rem(stripe_nr,
+ nr_data_stripes(map), &stripe_index);
+ if (mirror_num > 1)
+ stripe_index = nr_data_stripes(map) +
+ mirror_num - 2;
+
+ /* We distribute the parity blocks across stripes */
+ div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
+ &stripe_index);
+ if (!(rw & (REQ_WRITE | REQ_DISCARD |
+ REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
+ mirror_num = 1;
+ }
+ } else {
+ /*
+ * after this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
+ */
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
+ mirror_num = stripe_index + 1;
+ }
+ BUG_ON(stripe_index >= map->num_stripes);
+
+ num_alloc_stripes = num_stripes;
+ if (dev_replace_is_ongoing) {
+ if (rw & (REQ_WRITE | REQ_DISCARD))
+ num_alloc_stripes <<= 1;
+ if (rw & REQ_GET_READ_MIRRORS)
+ num_alloc_stripes++;
+ tgtdev_indexes = num_stripes;
+ }
+
+ bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
+ if (!bbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (dev_replace_is_ongoing)
+ bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
+
+ /* build raid_map */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
+ need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+ mirror_num > 1)) {
+ u64 tmp;
+ unsigned rot;
+
+ bbio->raid_map = (u64 *)((void *)bbio->stripes +
+ sizeof(struct btrfs_bio_stripe) *
+ num_alloc_stripes +
+ sizeof(int) * tgtdev_indexes);
+
+ /* Work out the disk rotation on this stripe-set */
+ div_u64_rem(stripe_nr, num_stripes, &rot);
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * nr_data_stripes(map);
+ for (i = 0; i < nr_data_stripes(map); i++)
+ bbio->raid_map[(i+rot) % num_stripes] =
+ em->start + (tmp + i) * map->stripe_len;
+
+ bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ bbio->raid_map[(i+rot+1) % num_stripes] =
+ RAID6_Q_STRIPE;
+ }
+
+ if (rw & REQ_DISCARD) {
+ u32 factor = 0;
+ u32 sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+ u32 last_stripe = 0;
+
+ if (map->type &
+ (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_nr_end -
+ stripe_nr_orig,
+ factor,
+ &remaining_stripes);
+ div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+ last_stripe *= sub_stripes;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ bbio->stripes[i].length = stripes_per_dev *
+ map->stripe_len;
+
+ if (i / sub_stripes < remaining_stripes)
+ bbio->stripes[i].length +=
+ map->stripe_len;
+
+ /*
+ * Special for the first stripe and
+ * the last stripe:
+ *
+ * |-------|...|-------|
+ * |----------|
+ * off end_off
+ */
+ if (i < sub_stripes)
+ bbio->stripes[i].length -=
+ stripe_offset;
+
+ if (stripe_index >= last_stripe &&
+ stripe_index <= (last_stripe +
+ sub_stripes - 1))
+ bbio->stripes[i].length -=
+ stripe_end_offset;
+
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
+ } else
+ bbio->stripes[i].length = *length;
+
+ stripe_index++;
+ if (stripe_index == map->num_stripes) {
+ /* This could only happen for RAID0/10 */
+ stripe_index = 0;
+ stripe_nr++;
+ }
+ }
+ } else {
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset +
+ stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev =
+ map->stripes[stripe_index].dev;
+ stripe_index++;
+ }
+ }
+
+ if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
+ max_errors = btrfs_chunk_max_errors(map);
+
+ if (bbio->raid_map)
+ sort_parity_stripes(bbio, num_stripes);
+
+ tgtdev_indexes = 0;
+ if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+ dev_replace->tgtdev != NULL) {
+ int index_where_to_add;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+
+ /*
+ * duplicate the write operations while the dev replace
+ * procedure is running. Since the copying of the old disk
+ * to the new disk takes place at run time while the
+ * filesystem is mounted writable, the regular write
+ * operations to the old disk have to be duplicated to go
+ * to the new disk as well.
+ * Note that device->missing is handled by the caller, and
+ * that the write to the old disk is already set up in the
+ * stripes array.
+ */
+ index_where_to_add = num_stripes;
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /* write to new disk, too */
+ struct btrfs_bio_stripe *new =
+ bbio->stripes + index_where_to_add;
+ struct btrfs_bio_stripe *old =
+ bbio->stripes + i;
+
+ new->physical = old->physical;
+ new->length = old->length;
+ new->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[i] = index_where_to_add;
+ index_where_to_add++;
+ max_errors++;
+ tgtdev_indexes++;
+ }
+ }
+ num_stripes = index_where_to_add;
+ } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+ dev_replace->tgtdev != NULL) {
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ /*
+ * During the dev-replace procedure, the target drive can
+ * also be used to read data in case it is needed to repair
+ * a corrupt block elsewhere. This is possible if the
+ * requested area is left of the left cursor. In this area,
+ * the target drive is a full copy of the source drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it
+ * simple, only add the mirror with the
+ * lowest physical address
+ */
+ if (found &&
+ physical_of_found <=
+ bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+ }
+ if (found) {
+ if (physical_of_found + map->stripe_len <=
+ dev_replace->cursor_left) {
+ struct btrfs_bio_stripe *tgtdev_stripe =
+ bbio->stripes + num_stripes;
+
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->length =
+ bbio->stripes[index_srcdev].length;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[index_srcdev] = num_stripes;
+
+ tgtdev_indexes++;
+ num_stripes++;
+ }
+ }
+ }
+
+ *bbio_ret = bbio;
+ bbio->map_type = map->type;
+ bbio->num_stripes = num_stripes;
+ bbio->max_errors = max_errors;
+ bbio->mirror_num = mirror_num;
+ bbio->num_tgtdevs = tgtdev_indexes;
+
+ /*
+ * this is the case that REQ_READ && dev_replace_is_ongoing &&
+ * mirror_num == num_stripes + 1 && dev_replace target drive is
+ * available as a mirror
+ */
+ if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+ WARN_ON(num_stripes > 1);
+ bbio->stripes[0].dev = dev_replace->tgtdev;
+ bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bbio->mirror_num = map->num_stripes + 1;
+ }
+out:
+ if (dev_replace_is_ongoing)
+ btrfs_dev_replace_unlock(dev_replace);
+ free_extent_map(em);
+ return ret;
+}
+
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret, int mirror_num)
+{
+ return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+ mirror_num, 0);
+}
+
+/* For Scrub/replace */
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret, int mirror_num,
+ int need_raid_map)
+{
+ return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+ mirror_num, need_raid_map);
+}
+
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+ u64 chunk_start, u64 physical, u64 devid,
+ u64 **logical, int *naddrs, int *stripe_len)
+{
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 *buf;
+ u64 bytenr;
+ u64 length;
+ u64 stripe_nr;
+ u64 rmap_len;
+ int i, j, nr = 0;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_start, 1);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
+ chunk_start);
+ return -EIO;
+ }
+
+ if (em->start != chunk_start) {
+ printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
+ em->start, chunk_start);
+ free_extent_map(em);
+ return -EIO;
+ }
+ map = (struct map_lookup *)em->bdev;
+
+ length = em->len;
+ rmap_len = map->stripe_len;
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ length = div_u64(length, map->num_stripes / map->sub_stripes);
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ length = div_u64(length, map->num_stripes);
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ length = div_u64(length, nr_data_stripes(map));
+ rmap_len = map->stripe_len * nr_data_stripes(map);
+ }
+
+ buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
+ BUG_ON(!buf); /* -ENOMEM */
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (devid && map->stripes[i].dev->devid != devid)
+ continue;
+ if (map->stripes[i].physical > physical ||
+ map->stripes[i].physical + length <= physical)
+ continue;
+
+ stripe_nr = physical - map->stripes[i].physical;
+ stripe_nr = div_u64(stripe_nr, map->stripe_len);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ stripe_nr = div_u64(stripe_nr, map->sub_stripes);
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ } /* else if RAID[56], multiply by nr_data_stripes().
+ * Alternatively, just use rmap_len below instead of
+ * map->stripe_len */
+
+ bytenr = chunk_start + stripe_nr * rmap_len;
+ WARN_ON(nr >= map->num_stripes);
+ for (j = 0; j < nr; j++) {
+ if (buf[j] == bytenr)
+ break;
+ }
+ if (j == nr) {
+ WARN_ON(nr >= map->num_stripes);
+ buf[nr++] = bytenr;
+ }
+ }
+
+ *logical = buf;
+ *naddrs = nr;
+ *stripe_len = rmap_len;
+
+ free_extent_map(em);
+ return 0;
+}
+
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+{
+ if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
+ bio_endio_nodec(bio, err);
+ else
+ bio_endio(bio, err);
+ btrfs_put_bbio(bbio);
+}
+
+static void btrfs_end_bio(struct bio *bio, int err)
+{
+ struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_device *dev = bbio->stripes[0].dev;
+ int is_orig_bio = 0;
+
+ if (err) {
+ atomic_inc(&bbio->error);
+ if (err == -EIO || err == -EREMOTEIO) {
+ unsigned int stripe_index =
+ btrfs_io_bio(bio)->stripe_index;
+
+ BUG_ON(stripe_index >= bbio->num_stripes);
+ dev = bbio->stripes[stripe_index].dev;
+ if (dev->bdev) {
+ if (bio->bi_rw & WRITE)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ else
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_READ_ERRS);
+ if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
+ btrfs_dev_stat_print_on_error(dev);
+ }
+ }
+ }
+
+ if (bio == bbio->orig_bio)
+ is_orig_bio = 1;
+
+ btrfs_bio_counter_dec(bbio->fs_info);
+
+ if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ if (!is_orig_bio) {
+ bio_put(bio);
+ bio = bbio->orig_bio;
+ }
+
+ bio->bi_private = bbio->private;
+ bio->bi_end_io = bbio->end_io;
+ btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ /* only send an error to the higher layers if it is
+ * beyond the tolerance of the btrfs bio
+ */
+ if (atomic_read(&bbio->error) > bbio->max_errors) {
+ err = -EIO;
+ } else {
+ /*
+ * this bio is actually up to date, we didn't
+ * go over the max number of errors
+ */
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ err = 0;
+ }
+
+ btrfs_end_bbio(bbio, bio, err);
+ } else if (!is_orig_bio) {
+ bio_put(bio);
+ }
+}
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+static noinline void btrfs_schedule_bio(struct btrfs_root *root,
+ struct btrfs_device *device,
+ int rw, struct bio *bio)
+{
+ int should_queue = 1;
+ struct btrfs_pending_bios *pending_bios;
+
+ if (device->missing || !device->bdev) {
+ bio_endio(bio, -EIO);
+ return;
+ }
+
+ /* don't bother with additional async steps for reads, right now */
+ if (!(rw & REQ_WRITE)) {
+ bio_get(bio);
+ btrfsic_submit_bio(rw, bio);
+ bio_put(bio);
+ return;
+ }
+
+ /*
+ * nr_async_bios allows us to reliably return congestion to the
+ * higher layers. Otherwise, the async bio makes it appear we have
+ * made progress against dirty pages when we've really just put it
+ * on a queue for later
+ */
+ atomic_inc(&root->fs_info->nr_async_bios);
+ WARN_ON(bio->bi_next);
+ bio->bi_next = NULL;
+ bio->bi_rw |= rw;
+
+ spin_lock(&device->io_lock);
+ if (bio->bi_rw & REQ_SYNC)
+ pending_bios = &device->pending_sync_bios;
+ else
+ pending_bios = &device->pending_bios;
+
+ if (pending_bios->tail)
+ pending_bios->tail->bi_next = bio;
+
+ pending_bios->tail = bio;
+ if (!pending_bios->head)
+ pending_bios->head = bio;
+ if (device->running_pending)
+ should_queue = 0;
+
+ spin_unlock(&device->io_lock);
+
+ if (should_queue)
+ btrfs_queue_work(root->fs_info->submit_workers,
+ &device->work);
+}
+
+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
+ sector_t sector)
+{
+ struct bio_vec *prev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ unsigned int max_sectors = queue_max_sectors(q);
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bdev,
+ .bi_sector = sector,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (WARN_ON(bio->bi_vcnt == 0))
+ return 1;
+
+ prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ if (bio_sectors(bio) > max_sectors)
+ return 0;
+
+ if (!q->merge_bvec_fn)
+ return 1;
+
+ bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
+ if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
+ return 0;
+ return 1;
+}
+
+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+ struct bio *bio, u64 physical, int dev_nr,
+ int rw, int async)
+{
+ struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+
+ bio->bi_private = bbio;
+ btrfs_io_bio(bio)->stripe_index = dev_nr;
+ bio->bi_end_io = btrfs_end_bio;
+ bio->bi_iter.bi_sector = physical >> 9;
+#ifdef DEBUG
+ {
+ struct rcu_string *name;
+
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+ "(%s id %llu), size=%u\n", rw,
+ (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
+ name->str, dev->devid, bio->bi_iter.bi_size);
+ rcu_read_unlock();
+ }
+#endif
+ bio->bi_bdev = dev->bdev;
+
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+
+ if (async)
+ btrfs_schedule_bio(root, dev, rw, bio);
+ else
+ btrfsic_submit_bio(rw, bio);
+}
+
+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+ struct bio *first_bio, struct btrfs_device *dev,
+ int dev_nr, int rw, int async)
+{
+ struct bio_vec *bvec = first_bio->bi_io_vec;
+ struct bio *bio;
+ int nr_vecs = bio_get_nr_vecs(dev->bdev);
+ u64 physical = bbio->stripes[dev_nr].physical;
+
+again:
+ bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
+ if (!bio)
+ return -ENOMEM;
+
+ while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
+ if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+ bvec->bv_offset) < bvec->bv_len) {
+ u64 len = bio->bi_iter.bi_size;
+
+ atomic_inc(&bbio->stripes_pending);
+ submit_stripe_bio(root, bbio, bio, physical, dev_nr,
+ rw, async);
+ physical += len;
+ goto again;
+ }
+ bvec++;
+ }
+
+ submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
+ return 0;
+}
+
+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+{
+ atomic_inc(&bbio->error);
+ if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ /* Shoud be the original bio. */
+ WARN_ON(bio != bbio->orig_bio);
+
+ bio->bi_private = bbio->private;
+ bio->bi_end_io = bbio->end_io;
+ btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ bio->bi_iter.bi_sector = logical >> 9;
+
+ btrfs_end_bbio(bbio, bio, -EIO);
+ }
+}
+
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+ int mirror_num, int async_submit)
+{
+ struct btrfs_device *dev;
+ struct bio *first_bio = bio;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 length = 0;
+ u64 map_length;
+ int ret;
+ int dev_nr;
+ int total_devs;
+ struct btrfs_bio *bbio = NULL;
+
+ length = bio->bi_iter.bi_size;
+ map_length = length;
+
+ btrfs_bio_counter_inc_blocked(root->fs_info);
+ ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+ mirror_num, 1);
+ if (ret) {
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
+ }
+
+ total_devs = bbio->num_stripes;
+ bbio->orig_bio = first_bio;
+ bbio->private = first_bio->bi_private;
+ bbio->end_io = first_bio->bi_end_io;
+ bbio->fs_info = root->fs_info;
+ atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+ if (bbio->raid_map) {
+ /* In this case, map_length has been set to the length of
+ a single stripe; not the whole write */
+ if (rw & WRITE) {
+ ret = raid56_parity_write(root, bio, bbio, map_length);
+ } else {
+ ret = raid56_parity_recover(root, bio, bbio, map_length,
+ mirror_num, 1);
+ }
+
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
+ }
+
+ if (map_length < length) {
+ btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
+ logical, length, map_length);
+ BUG();
+ }
+
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
+ dev = bbio->stripes[dev_nr].dev;
+ if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+ bbio_error(bbio, first_bio, logical);
+ continue;
+ }
+
+ /*
+ * Check and see if we're ok with this bio based on it's size
+ * and offset with the given device.
+ */
+ if (!bio_size_ok(dev->bdev, first_bio,
+ bbio->stripes[dev_nr].physical >> 9)) {
+ ret = breakup_stripe_bio(root, bbio, first_bio, dev,
+ dev_nr, rw, async_submit);
+ BUG_ON(ret);
+ continue;
+ }
+
+ if (dev_nr < total_devs - 1) {
+ bio = btrfs_bio_clone(first_bio, GFP_NOFS);
+ BUG_ON(!bio); /* -ENOMEM */
+ } else {
+ bio = first_bio;
+ bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
+ }
+
+ submit_stripe_bio(root, bbio, bio,
+ bbio->stripes[dev_nr].physical, dev_nr, rw,
+ async_submit);
+ }
+ btrfs_bio_counter_dec(root->fs_info);
+ return 0;
+}
+
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
+ u8 *uuid, u8 *fsid)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *cur_devices;
+
+ cur_devices = fs_info->fs_devices;
+ while (cur_devices) {
+ if (!fsid ||
+ !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+ device = __find_device(&cur_devices->devices,
+ devid, uuid);
+ if (device)
+ return device;
+ }
+ cur_devices = cur_devices->seed;
+ }
+ return NULL;
+}
+
+static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *dev_uuid)
+{
+ struct btrfs_device *device;
+
+ device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ if (IS_ERR(device))
+ return NULL;
+
+ list_add(&device->dev_list, &fs_devices->devices);
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+
+ device->missing = 1;
+ fs_devices->missing_devices++;
+
+ return device;
+}
+
+/**
+ * btrfs_alloc_device - allocate struct btrfs_device
+ * @fs_info: used only for generating a new devid, can be NULL if
+ * devid is provided (i.e. @devid != NULL).
+ * @devid: a pointer to devid for this device. If NULL a new devid
+ * is generated.
+ * @uuid: a pointer to UUID for this device. If NULL a new UUID
+ * is generated.
+ *
+ * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
+ * on error. Returned struct is not linked onto any lists and can be
+ * destroyed with kfree() right away.
+ */
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+ const u64 *devid,
+ const u8 *uuid)
+{
+ struct btrfs_device *dev;
+ u64 tmp;
+
+ if (WARN_ON(!devid && !fs_info))
+ return ERR_PTR(-EINVAL);
+
+ dev = __alloc_device();
+ if (IS_ERR(dev))
+ return dev;
+
+ if (devid)
+ tmp = *devid;
+ else {
+ int ret;
+
+ ret = find_next_devid(fs_info, &tmp);
+ if (ret) {
+ kfree(dev);
+ return ERR_PTR(ret);
+ }
+ }
+ dev->devid = tmp;
+
+ if (uuid)
+ memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
+ else
+ generate_random_uuid(dev->uuid);
+
+ btrfs_init_work(&dev->work, btrfs_submit_helper,
+ pending_bios_fn, NULL, NULL);
+
+ return dev;
+}
+
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ struct map_lookup *map;
+ struct extent_map *em;
+ u64 logical;
+ u64 length;
+ u64 devid;
+ u8 uuid[BTRFS_UUID_SIZE];
+ int num_stripes;
+ int ret;
+ int i;
+
+ logical = key->offset;
+ length = btrfs_chunk_length(leaf, chunk);
+
+ read_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+ read_unlock(&map_tree->map_tree.lock);
+
+ /* already mapped? */
+ if (em && em->start <= logical && em->start + em->len > logical) {
+ free_extent_map(em);
+ return 0;
+ } else if (em) {
+ free_extent_map(em);
+ }
+
+ em = alloc_extent_map();
+ if (!em)
+ return -ENOMEM;
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map) {
+ free_extent_map(em);
+ return -ENOMEM;
+ }
+
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+ em->bdev = (struct block_device *)map;
+ em->start = logical;
+ em->len = length;
+ em->orig_start = 0;
+ em->block_start = 0;
+ em->block_len = em->len;
+
+ map->num_stripes = num_stripes;
+ map->io_width = btrfs_chunk_io_width(leaf, chunk);
+ map->io_align = btrfs_chunk_io_align(leaf, chunk);
+ map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
+ map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ map->type = btrfs_chunk_type(leaf, chunk);
+ map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ for (i = 0; i < num_stripes; i++) {
+ map->stripes[i].physical =
+ btrfs_stripe_offset_nr(leaf, chunk, i);
+ devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ read_extent_buffer(leaf, uuid, (unsigned long)
+ btrfs_stripe_dev_uuid_nr(chunk, i),
+ BTRFS_UUID_SIZE);
+ map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+ uuid, NULL);
+ if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
+ free_extent_map(em);
+ return -EIO;
+ }
+ if (!map->stripes[i].dev) {
+ map->stripes[i].dev =
+ add_missing_dev(root, root->fs_info->fs_devices,
+ devid, uuid);
+ if (!map->stripes[i].dev) {
+ free_extent_map(em);
+ return -EIO;
+ }
+ }
+ map->stripes[i].dev->in_fs_metadata = 1;
+ }
+
+ write_lock(&map_tree->map_tree.lock);
+ ret = add_extent_mapping(&map_tree->map_tree, em, 0);
+ write_unlock(&map_tree->map_tree.lock);
+ BUG_ON(ret); /* Tree corruption */
+ free_extent_map(em);
+
+ return 0;
+}
+
+static void fill_device_from_item(struct extent_buffer *leaf,
+ struct btrfs_dev_item *dev_item,
+ struct btrfs_device *device)
+{
+ unsigned long ptr;
+
+ device->devid = btrfs_device_id(leaf, dev_item);
+ device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+ device->total_bytes = device->disk_total_bytes;
+ device->commit_total_bytes = device->disk_total_bytes;
+ device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+ device->commit_bytes_used = device->bytes_used;
+ device->type = btrfs_device_type(leaf, dev_item);
+ device->io_align = btrfs_device_io_align(leaf, dev_item);
+ device->io_width = btrfs_device_io_width(leaf, dev_item);
+ device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+ WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
+ device->is_tgtdev_for_dev_replace = 0;
+
+ ptr = btrfs_device_uuid(dev_item);
+ read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+}
+
+static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
+ u8 *fsid)
+{
+ struct btrfs_fs_devices *fs_devices;
+ int ret;
+
+ BUG_ON(!mutex_is_locked(&uuid_mutex));
+
+ fs_devices = root->fs_info->fs_devices->seed;
+ while (fs_devices) {
+ if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
+ return fs_devices;
+
+ fs_devices = fs_devices->seed;
+ }
+
+ fs_devices = find_fsid(fsid);
+ if (!fs_devices) {
+ if (!btrfs_test_opt(root, DEGRADED))
+ return ERR_PTR(-ENOENT);
+
+ fs_devices = alloc_fs_devices(fsid);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
+
+ fs_devices->seeding = 1;
+ fs_devices->opened = 1;
+ return fs_devices;
+ }
+
+ fs_devices = clone_fs_devices(fs_devices);
+ if (IS_ERR(fs_devices))
+ return fs_devices;
+
+ ret = __btrfs_open_devices(fs_devices, FMODE_READ,
+ root->fs_info->bdev_holder);
+ if (ret) {
+ free_fs_devices(fs_devices);
+ fs_devices = ERR_PTR(ret);
+ goto out;
+ }
+
+ if (!fs_devices->seeding) {
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ fs_devices = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ fs_devices->seed = root->fs_info->fs_devices->seed;
+ root->fs_info->fs_devices->seed = fs_devices;
+out:
+ return fs_devices;
+}
+
+static int read_one_dev(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_dev_item *dev_item)
+{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 devid;
+ int ret;
+ u8 fs_uuid[BTRFS_UUID_SIZE];
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+
+ devid = btrfs_device_id(leaf, dev_item);
+ read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
+ BTRFS_UUID_SIZE);
+ read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
+ BTRFS_UUID_SIZE);
+
+ if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
+ fs_devices = open_seed_devices(root, fs_uuid);
+ if (IS_ERR(fs_devices))
+ return PTR_ERR(fs_devices);
+ }
+
+ device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
+ if (!device) {
+ if (!btrfs_test_opt(root, DEGRADED))
+ return -EIO;
+
+ btrfs_warn(root->fs_info, "devid %llu missing", devid);
+ device = add_missing_dev(root, fs_devices, devid, dev_uuid);
+ if (!device)
+ return -ENOMEM;
+ } else {
+ if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
+ return -EIO;
+
+ if(!device->bdev && !device->missing) {
+ /*
+ * this happens when a device that was properly setup
+ * in the device info lists suddenly goes bad.
+ * device->bdev is NULL, and so we have to set
+ * device->missing to one here
+ */
+ device->fs_devices->missing_devices++;
+ device->missing = 1;
+ }
+
+ /* Move the device to its own fs_devices */
+ if (device->fs_devices != fs_devices) {
+ ASSERT(device->missing);
+
+ list_move(&device->dev_list, &fs_devices->devices);
+ device->fs_devices->num_devices--;
+ fs_devices->num_devices++;
+
+ device->fs_devices->missing_devices--;
+ fs_devices->missing_devices++;
+
+ device->fs_devices = fs_devices;
+ }
+ }
+
+ if (device->fs_devices != root->fs_info->fs_devices) {
+ BUG_ON(device->writeable);
+ if (device->generation !=
+ btrfs_device_generation(leaf, dev_item))
+ return -EINVAL;
+ }
+
+ fill_device_from_item(leaf, dev_item, device);
+ device->in_fs_metadata = 1;
+ if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+ device->fs_devices->total_rw_bytes += device->total_bytes;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += device->total_bytes -
+ device->bytes_used;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ }
+ ret = 0;
+ return ret;
+}
+
+int btrfs_read_sys_array(struct btrfs_root *root)
+{
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+ struct extent_buffer *sb;
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ u8 *array_ptr;
+ unsigned long sb_array_offset;
+ int ret = 0;
+ u32 num_stripes;
+ u32 array_size;
+ u32 len = 0;
+ u32 cur_offset;
+ struct btrfs_key key;
+
+ ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
+ /*
+ * This will create extent buffer of nodesize, superblock size is
+ * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
+ * overallocate but we can keep it as-is, only the first page is used.
+ */
+ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
+ if (!sb)
+ return -ENOMEM;
+ btrfs_set_buffer_uptodate(sb);
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
+ /*
+ * The sb extent buffer is artifical and just used to read the system array.
+ * btrfs_set_buffer_uptodate() call does not properly mark all it's
+ * pages up-to-date when the page is larger: extent does not cover the
+ * whole page and consequently check_page_uptodate does not find all
+ * the page's extents up-to-date (the hole beyond sb),
+ * write_extent_buffer then triggers a WARN_ON.
+ *
+ * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
+ * but sb spans only this function. Add an explicit SetPageUptodate call
+ * to silence the warning eg. on PowerPC 64.
+ */
+ if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
+ SetPageUptodate(sb->pages[0]);
+
+ write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ array_ptr = super_copy->sys_chunk_array;
+ sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
+ cur_offset = 0;
+
+ while (cur_offset < array_size) {
+ disk_key = (struct btrfs_disk_key *)array_ptr;
+ len = sizeof(*disk_key);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ btrfs_disk_key_to_cpu(&key, disk_key);
+
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
+
+ if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+ chunk = (struct btrfs_chunk *)sb_array_offset;
+ /*
+ * At least one btrfs_chunk with one stripe must be
+ * present, exact stripe count check comes afterwards
+ */
+ len = btrfs_chunk_item_size(1);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ len = btrfs_chunk_item_size(num_stripes);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ ret = read_one_chunk(root, &key, sb, chunk);
+ if (ret)
+ break;
+ } else {
+ ret = -EIO;
+ break;
+ }
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
+ }
+ free_extent_buffer(sb);
+ return ret;
+
+out_short_read:
+ printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
+ len, cur_offset);
+ free_extent_buffer(sb);
+ return -EIO;
+}
+
+int btrfs_read_chunk_tree(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+ int slot;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ mutex_lock(&uuid_mutex);
+ lock_chunks(root);
+
+ /*
+ * Read all device items, and then all the chunk items. All
+ * device items are found before any chunk item (their object id
+ * is smaller than the lowest possible object id for a chunk
+ * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
+ */
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.offset = 0;
+ key.type = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto error;
+ break;
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+ struct btrfs_dev_item *dev_item;
+ dev_item = btrfs_item_ptr(leaf, slot,
+ struct btrfs_dev_item);
+ ret = read_one_dev(root, leaf, dev_item);
+ if (ret)
+ goto error;
+ } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+ struct btrfs_chunk *chunk;
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ ret = read_one_chunk(root, &found_key, leaf, chunk);
+ if (ret)
+ goto error;
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+error:
+ unlock_chunks(root);
+ mutex_unlock(&uuid_mutex);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+
+ while (fs_devices) {
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->dev_root = fs_info->dev_root;
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ fs_devices = fs_devices->seed;
+ }
+}
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct extent_buffer *eb;
+ int slot;
+ int ret = 0;
+ struct btrfs_device *device;
+ struct btrfs_path *path = NULL;
+ int i;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ int item_size;
+ struct btrfs_dev_stats_item *ptr;
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_STATS_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+ if (ret) {
+ __btrfs_reset_dev_stats(device);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ continue;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot,
+ struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_reset(device, i);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+ btrfs_free_path(path);
+ return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *dev_root,
+ struct btrfs_device *device)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_stats_item *ptr;
+ int ret;
+ int i;
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_STATS_KEY;
+ key.offset = device->devid;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ printk_in_rcu(KERN_WARNING "BTRFS: "
+ "error %d while searching for dev_stats item for device %s!\n",
+ ret, rcu_str_deref(device->name));
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /* need to delete old one and insert a new one */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ printk_in_rcu(KERN_WARNING "BTRFS: "
+ "delete too small dev_stats item for device %s failed %d!\n",
+ rcu_str_deref(device->name), ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ printk_in_rcu(KERN_WARNING "BTRFS: "
+ "insert dev_stats item for device %s failed %d!\n",
+ rcu_str_deref(device->name), ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_set_dev_stats_value(eb, ptr, i,
+ btrfs_dev_stat_read(device, i));
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ int stats_cnt;
+ int ret = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
+ continue;
+
+ stats_cnt = atomic_read(&device->dev_stats_ccnt);
+ ret = update_dev_stat_item(trans, dev_root, device);
+ if (!ret)
+ atomic_sub(stats_cnt, &device->dev_stats_ccnt);
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+ btrfs_dev_stat_inc(dev, index);
+ btrfs_dev_stat_print_on_error(dev);
+}
+
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+ if (!dev->dev_stats_valid)
+ return;
+ printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ rcu_str_deref(dev->name),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ if (btrfs_dev_stat_read(dev, i) != 0)
+ break;
+ if (i == BTRFS_DEV_STAT_VALUES_MAX)
+ return; /* all values == 0, suppress message */
+
+ printk_in_rcu(KERN_INFO "BTRFS: "
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ rcu_str_deref(dev->name),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+ struct btrfs_ioctl_get_dev_stats *stats)
+{
+ struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ int i;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (!dev) {
+ btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
+ return -ENODEV;
+ } else if (!dev->dev_stats_valid) {
+ btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
+ return -ENODEV;
+ } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (stats->nr_items > i)
+ stats->values[i] =
+ btrfs_dev_stat_read_and_reset(dev, i);
+ else
+ btrfs_dev_stat_reset(dev, i);
+ }
+ } else {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ if (stats->nr_items > i)
+ stats->values[i] = btrfs_dev_stat_read(dev, i);
+ }
+ if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+ stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+ return 0;
+}
+
+int btrfs_scratch_superblock(struct btrfs_device *device)
+{
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+
+ bh = btrfs_read_dev_super(device->bdev);
+ if (!bh)
+ return -EINVAL;
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ brelse(bh);
+
+ return 0;
+}
+
+/*
+ * Update the size of all devices, which is used for writing out the
+ * super blocks.
+ */
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *curr, *next;
+
+ if (list_empty(&fs_devices->resized_devices))
+ return;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ lock_chunks(fs_info->dev_root);
+ list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
+ resized_list) {
+ list_del_init(&curr->resized_list);
+ curr->commit_total_bytes = curr->disk_total_bytes;
+ }
+ unlock_chunks(fs_info->dev_root);
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
+/* Must be invoked during the transaction commit */
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_device *dev;
+ int i;
+
+ if (list_empty(&transaction->pending_chunks))
+ return;
+
+ /* In order to kick the device replace finish process */
+ lock_chunks(root);
+ list_for_each_entry(em, &transaction->pending_chunks, list) {
+ map = (struct map_lookup *)em->bdev;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ dev = map->stripes[i].dev;
+ dev->commit_bytes_used = dev->bytes_used;
+ }
+ }
+ unlock_chunks(root);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000..ebc31331a
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,541 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_VOLUMES_
+#define __BTRFS_VOLUMES_
+
+#include <linux/bio.h>
+#include <linux/sort.h>
+#include <linux/btrfs.h>
+#include "async-thread.h"
+
+extern struct mutex uuid_mutex;
+
+#define BTRFS_STRIPE_LEN (64 * 1024)
+
+struct buffer_head;
+struct btrfs_pending_bios {
+ struct bio *head;
+ struct bio *tail;
+};
+
+/*
+ * Use sequence counter to get consistent device stat data on
+ * 32-bit processors.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#include <linux/seqlock.h>
+#define __BTRFS_NEED_DEVICE_DATA_ORDERED
+#define btrfs_device_data_ordered_init(device) \
+ seqcount_init(&device->data_seqcount)
+#else
+#define btrfs_device_data_ordered_init(device) do { } while (0)
+#endif
+
+struct btrfs_device {
+ struct list_head dev_list;
+ struct list_head dev_alloc_list;
+ struct btrfs_fs_devices *fs_devices;
+
+ struct btrfs_root *dev_root;
+
+ struct rcu_string *name;
+
+ u64 generation;
+
+ spinlock_t io_lock ____cacheline_aligned;
+ int running_pending;
+ /* regular prio bios */
+ struct btrfs_pending_bios pending_bios;
+ /* WRITE_SYNC bios */
+ struct btrfs_pending_bios pending_sync_bios;
+
+ struct block_device *bdev;
+
+ /* the mode sent to blkdev_get */
+ fmode_t mode;
+
+ int writeable;
+ int in_fs_metadata;
+ int missing;
+ int can_discard;
+ int is_tgtdev_for_dev_replace;
+
+#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
+ seqcount_t data_seqcount;
+#endif
+
+ /* the internal btrfs device id */
+ u64 devid;
+
+ /* size of the device in memory */
+ u64 total_bytes;
+
+ /* size of the device on disk */
+ u64 disk_total_bytes;
+
+ /* bytes used */
+ u64 bytes_used;
+
+ /* optimal io alignment for this device */
+ u32 io_align;
+
+ /* optimal io width for this device */
+ u32 io_width;
+ /* type and info about this device */
+ u64 type;
+
+ /* minimal io size for this device */
+ u32 sector_size;
+
+ /* physical drive uuid (or lvm uuid) */
+ u8 uuid[BTRFS_UUID_SIZE];
+
+ /*
+ * size of the device on the current transaction
+ *
+ * This variant is update when committing the transaction,
+ * and protected by device_list_mutex
+ */
+ u64 commit_total_bytes;
+
+ /* bytes used on the current transaction */
+ u64 commit_bytes_used;
+ /*
+ * used to manage the device which is resized
+ *
+ * It is protected by chunk_lock.
+ */
+ struct list_head resized_list;
+
+ /* for sending down flush barriers */
+ int nobarriers;
+ struct bio *flush_bio;
+ struct completion flush_wait;
+
+ /* per-device scrub information */
+ struct scrub_ctx *scrub_device;
+
+ struct btrfs_work work;
+ struct rcu_head rcu;
+ struct work_struct rcu_work;
+
+ /* readahead state */
+ spinlock_t reada_lock;
+ atomic_t reada_in_flight;
+ u64 reada_next;
+ struct reada_zone *reada_curr_zone;
+ struct radix_tree_root reada_zones;
+ struct radix_tree_root reada_extents;
+
+ /* disk I/O failure stats. For detailed description refer to
+ * enum btrfs_dev_stat_values in ioctl.h */
+ int dev_stats_valid;
+
+ /* Counter to record the change of device stats */
+ atomic_t dev_stats_ccnt;
+ atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
+};
+
+/*
+ * If we read those variants at the context of their own lock, we needn't
+ * use the following helpers, reading them directly is safe.
+ */
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ unsigned int seq; \
+ \
+ do { \
+ seq = read_seqcount_begin(&dev->data_seqcount); \
+ size = dev->name; \
+ } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ write_seqcount_begin(&dev->data_seqcount); \
+ dev->name = size; \
+ write_seqcount_end(&dev->data_seqcount); \
+ preempt_enable(); \
+}
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ u64 size; \
+ \
+ preempt_disable(); \
+ size = dev->name; \
+ preempt_enable(); \
+ return size; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ preempt_disable(); \
+ dev->name = size; \
+ preempt_enable(); \
+}
+#else
+#define BTRFS_DEVICE_GETSET_FUNCS(name) \
+static inline u64 \
+btrfs_device_get_##name(const struct btrfs_device *dev) \
+{ \
+ return dev->name; \
+} \
+ \
+static inline void \
+btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
+{ \
+ dev->name = size; \
+}
+#endif
+
+BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
+BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+
+struct btrfs_fs_devices {
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+
+ u64 num_devices;
+ u64 open_devices;
+ u64 rw_devices;
+ u64 missing_devices;
+ u64 total_rw_bytes;
+ u64 total_devices;
+ struct block_device *latest_bdev;
+
+ /* all of the devices in the FS, protected by a mutex
+ * so we can safely walk it to write out the supers without
+ * worrying about add/remove by the multi-device code.
+ * Scrubbing super can kick off supers writing by holding
+ * this mutex lock.
+ */
+ struct mutex device_list_mutex;
+ struct list_head devices;
+
+ struct list_head resized_devices;
+ /* devices not currently being allocated */
+ struct list_head alloc_list;
+ struct list_head list;
+
+ struct btrfs_fs_devices *seed;
+ int seeding;
+
+ int opened;
+
+ /* set when we find or add a device that doesn't have the
+ * nonrot flag set
+ */
+ int rotating;
+};
+
+#define BTRFS_BIO_INLINE_CSUM_SIZE 64
+
+/*
+ * we need the mirror number and stripe index to be passed around
+ * the call chain while we are processing end_io (especially errors).
+ * Really, what we need is a btrfs_bio structure that has this info
+ * and is properly sized with its stripe array, but we're not there
+ * quite yet. We have our own btrfs bioset, and all of the bios
+ * we allocate are actually btrfs_io_bios. We'll cram as much of
+ * struct btrfs_bio as we can into this over time.
+ */
+typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
+struct btrfs_io_bio {
+ unsigned int mirror_num;
+ unsigned int stripe_index;
+ u64 logical;
+ u8 *csum;
+ u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+ u8 *csum_allocated;
+ btrfs_io_bio_end_io_t *end_io;
+ struct bio bio;
+};
+
+static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+{
+ return container_of(bio, struct btrfs_io_bio, bio);
+}
+
+struct btrfs_bio_stripe {
+ struct btrfs_device *dev;
+ u64 physical;
+ u64 length; /* only used for discard mappings */
+};
+
+struct btrfs_bio;
+typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
+
+struct btrfs_bio {
+ atomic_t refs;
+ atomic_t stripes_pending;
+ struct btrfs_fs_info *fs_info;
+ u64 map_type; /* get from map_lookup->type */
+ bio_end_io_t *end_io;
+ struct bio *orig_bio;
+ unsigned long flags;
+ void *private;
+ atomic_t error;
+ int max_errors;
+ int num_stripes;
+ int mirror_num;
+ int num_tgtdevs;
+ int *tgtdev_map;
+ /*
+ * logical block numbers for the start of each stripe
+ * The last one or two are p/q. These are sorted,
+ * so raid_map[0] is the start of our full stripe
+ */
+ u64 *raid_map;
+ struct btrfs_bio_stripe stripes[];
+};
+
+struct btrfs_device_info {
+ struct btrfs_device *dev;
+ u64 dev_offset;
+ u64 max_avail;
+ u64 total_avail;
+};
+
+struct btrfs_raid_attr {
+ int sub_stripes; /* sub_stripes info for map */
+ int dev_stripes; /* stripes per dev */
+ int devs_max; /* max devs to use */
+ int devs_min; /* min devs needed */
+ int devs_increment; /* ndevs has to be a multiple of this */
+ int ncopies; /* how many copies to data has */
+};
+
+struct map_lookup {
+ u64 type;
+ int io_align;
+ int io_width;
+ int stripe_len;
+ int sector_size;
+ int num_stripes;
+ int sub_stripes;
+ struct btrfs_bio_stripe stripes[];
+};
+
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+ (sizeof(struct btrfs_bio_stripe) * (n)))
+
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
+#define BTRFS_BALANCE_METADATA (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
+ BTRFS_BALANCE_SYSTEM | \
+ BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE (1ULL << 3)
+#define BTRFS_BALANCE_RESUME (1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
+#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
+
+/*
+ * Profile changing flags. When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+ struct btrfs_fs_info *fs_info;
+
+ struct btrfs_balance_args data;
+ struct btrfs_balance_args meta;
+ struct btrfs_balance_args sys;
+
+ u64 flags;
+
+ struct btrfs_balance_progress stat;
+};
+
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length);
+void btrfs_get_bbio(struct btrfs_bio *bbio);
+void btrfs_put_bbio(struct btrfs_bio *bbio);
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret, int mirror_num);
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret, int mirror_num,
+ int need_raid_map);
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+ u64 chunk_start, u64 physical, u64 devid,
+ u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_read_sys_array(struct btrfs_root *root);
+int btrfs_read_chunk_tree(struct btrfs_root *root);
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 type);
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+ int mirror_num, int async_submit);
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder);
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+ struct btrfs_fs_devices **fs_devices_ret);
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+ char *device_path,
+ struct btrfs_device **device);
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+ const u64 *devid,
+ const u8 *uuid);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path);
+void btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size);
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
+ u8 *uuid, u8 *fsid);
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
+int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+ struct btrfs_device *srcdev,
+ struct btrfs_device **device_out);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs);
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+ struct btrfs_ioctl_get_dev_stats *stats);
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev);
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev);
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *tgtdev);
+int btrfs_scratch_superblock(struct btrfs_device *device);
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+ u64 logical, u64 len, int mirror_num);
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+ struct btrfs_mapping_tree *map_tree,
+ u64 logical);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ u64 chunk_offset, u64 chunk_size);
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 chunk_offset);
+
+static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
+{
+ return atomic_read(&dev->dev_stats_ccnt);
+}
+
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+ int index)
+{
+ atomic_inc(dev->dev_stat_values + index);
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+ int index)
+{
+ return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+ int index)
+{
+ int ret;
+
+ ret = atomic_xchg(dev->dev_stat_values + index, 0);
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
+ return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+ int index, unsigned long val)
+{
+ atomic_set(dev->dev_stat_values + index, val);
+ smp_mb__before_atomic();
+ atomic_inc(&dev->dev_stats_ccnt);
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+ int index)
+{
+ btrfs_dev_stat_set(dev, index, 0);
+}
+
+void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
+void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+ struct btrfs_transaction *transaction);
+
+static inline void lock_chunks(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static inline void unlock_chunks(struct btrfs_root *root)
+{
+ mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
+
+#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000..6f518c90e
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "xattr.h"
+#include "disk-io.h"
+#include "props.h"
+#include "locking.h"
+
+
+ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret = 0;
+ unsigned long data_ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* lookup the xattr by name */
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name,
+ strlen(name), 0);
+ if (!di) {
+ ret = -ENODATA;
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ /* if size is 0, that means we want the size of the attr */
+ if (!size) {
+ ret = btrfs_dir_data_len(leaf, di);
+ goto out;
+ }
+
+ /* now get the data out of our dir_item */
+ if (btrfs_dir_data_len(leaf, di) > size) {
+ ret = -ERANGE;
+ goto out;
+ }
+
+ /*
+ * The way things are packed into the leaf is like this
+ * |struct btrfs_dir_item|name|data|
+ * where name is the xattr name, so security.foo, and data is the
+ * content of the xattr. data_ptr points to the location in memory
+ * where the data starts in the in memory leaf
+ */
+ data_ptr = (unsigned long)((char *)(di + 1) +
+ btrfs_dir_name_len(leaf, di));
+ read_extent_buffer(leaf, buffer, data_ptr,
+ btrfs_dir_data_len(leaf, di));
+ ret = btrfs_dir_data_len(leaf, di);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int do_setxattr(struct btrfs_trans_handle *trans,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct btrfs_dir_item *di = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ size_t name_len = strlen(name);
+ int ret = 0;
+
+ if (name_len + size > BTRFS_MAX_XATTR_SIZE(root))
+ return -ENOSPC;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->skip_release_on_error = 1;
+
+ if (!value) {
+ di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+ name, name_len, -1);
+ if (!di && (flags & XATTR_REPLACE))
+ ret = -ENODATA;
+ else if (IS_ERR(di))
+ ret = PTR_ERR(di);
+ else if (di)
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ goto out;
+ }
+
+ /*
+ * For a replace we can't just do the insert blindly.
+ * Do a lookup first (read-only btrfs_search_slot), and return if xattr
+ * doesn't exist. If it exists, fall down below to the insert/replace
+ * path - we can't race with a concurrent xattr delete, because the VFS
+ * locks the inode's i_mutex before calling setxattr or removexattr.
+ */
+ if (flags & XATTR_REPLACE) {
+ ASSERT(mutex_is_locked(&inode->i_mutex));
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+ name, name_len, 0);
+ if (!di)
+ ret = -ENODATA;
+ else if (IS_ERR(di))
+ ret = PTR_ERR(di);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ di = NULL;
+ }
+
+ ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+ name, name_len, value, size);
+ if (ret == -EOVERFLOW) {
+ /*
+ * We have an existing item in a leaf, split_leaf couldn't
+ * expand it. That item might have or not a dir_item that
+ * matches our target xattr, so lets check.
+ */
+ ret = 0;
+ btrfs_assert_tree_locked(path->nodes[0]);
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (!di && !(flags & XATTR_REPLACE)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ } else if (ret == -EEXIST) {
+ ret = 0;
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ ASSERT(di); /* logic error */
+ } else if (ret) {
+ goto out;
+ }
+
+ if (di && (flags & XATTR_CREATE)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ if (di) {
+ /*
+ * We're doing a replace, and it must be atomic, that is, at
+ * any point in time we have either the old or the new xattr
+ * value in the tree. We don't want readers (getxattr and
+ * listxattrs) to miss a value, this is specially important
+ * for ACLs.
+ */
+ const int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+ const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 data_size = sizeof(*di) + name_len + size;
+ struct btrfs_item *item;
+ unsigned long data_ptr;
+ char *ptr;
+
+ if (size > old_data_len) {
+ if (btrfs_leaf_free_space(root, leaf) <
+ (size - old_data_len)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+
+ if (old_data_len + name_len + sizeof(*di) == item_size) {
+ /* No other xattrs packed in the same leaf item. */
+ if (size > old_data_len)
+ btrfs_extend_item(root, path,
+ size - old_data_len);
+ else if (size < old_data_len)
+ btrfs_truncate_item(root, path, data_size, 1);
+ } else {
+ /* There are other xattrs packed in the same item. */
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto out;
+ btrfs_extend_item(root, path, data_size);
+ }
+
+ item = btrfs_item_nr(slot);
+ ptr = btrfs_item_ptr(leaf, slot, char);
+ ptr += btrfs_item_size(leaf, item) - data_size;
+ di = (struct btrfs_dir_item *)ptr;
+ btrfs_set_dir_data_len(leaf, di, size);
+ data_ptr = ((unsigned long)(di + 1)) + name_len;
+ write_extent_buffer(leaf, value, data_ptr, size);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ /*
+ * Insert, and we had space for the xattr, so path->slots[0] is
+ * where our xattr dir_item is and btrfs_insert_xattr_item()
+ * filled it.
+ */
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * @value: "" makes the attribute to empty, NULL removes it
+ */
+int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (trans)
+ return do_setxattr(trans, inode, name, value, size, flags);
+
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = do_setxattr(trans, inode, name, value, size, flags);
+ if (ret)
+ goto out;
+
+ inode_inc_iversion(inode);
+ inode->i_ctime = CURRENT_TIME;
+ set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+out:
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct btrfs_key key, found_key;
+ struct inode *inode = d_inode(dentry);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ int ret = 0, slot;
+ size_t total_size = 0, size_left = size;
+ unsigned long name_ptr;
+ size_t name_len;
+
+ /*
+ * ok we want all objects associated with this id.
+ * NOTE: we set key.offset = 0; because we want to start with the
+ * first xattr that we find and walk forward
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+
+ /* search for our xattrs */
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ /* this is where we start walking through the path */
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * if we've reached the last slot in this leaf we need
+ * to go to the next leaf and reset everything
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto err;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /* check to make sure this item is what we want */
+ if (found_key.objectid != key.objectid)
+ break;
+ if (found_key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ if (verify_dir_item(root, leaf, di))
+ goto next;
+
+ name_len = btrfs_dir_name_len(leaf, di);
+ total_size += name_len + 1;
+
+ /* we are just looking for how big our buffer needs to be */
+ if (!size)
+ goto next;
+
+ if (!buffer || (name_len + 1) > size_left) {
+ ret = -ERANGE;
+ goto err;
+ }
+
+ name_ptr = (unsigned long)(di + 1);
+ read_extent_buffer(leaf, buffer, name_ptr, name_len);
+ buffer[name_len] = '\0';
+
+ size_left -= name_len + 1;
+ buffer += name_len + 1;
+next:
+ path->slots[0]++;
+ }
+ ret = total_size;
+
+err:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+/*
+ * List of handlers for synthetic system.* attributes. All real ondisk
+ * attributes are handled directly.
+ */
+const struct xattr_handler *btrfs_xattr_handlers[] = {
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ NULL,
+};
+
+/*
+ * Check if the attribute is in a supported namespace.
+ *
+ * This is applied after the check for the synthetic attributes in the system
+ * namespace.
+ */
+static int btrfs_is_valid_xattr(const char *name)
+{
+ int len = strlen(name);
+ int prefixlen = 0;
+
+ if (!strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN))
+ prefixlen = XATTR_SECURITY_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ prefixlen = XATTR_SYSTEM_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+ prefixlen = XATTR_TRUSTED_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ prefixlen = XATTR_USER_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+ prefixlen = XATTR_BTRFS_PREFIX_LEN;
+ else
+ return -EOPNOTSUPP;
+
+ /*
+ * The name cannot consist of just prefix
+ */
+ if (len <= prefixlen)
+ return -EINVAL;
+
+ return 0;
+}
+
+ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ int ret;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, buffer, size);
+
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
+ return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
+}
+
+int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
+ int ret;
+
+ /*
+ * The permission on security.* and system.* is not checked
+ * in permission().
+ */
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, size, flags);
+
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
+
+ if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+ return btrfs_set_prop(d_inode(dentry), name,
+ value, size, flags);
+
+ if (size == 0)
+ value = ""; /* empty EA, do not remove */
+
+ return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
+ flags);
+}
+
+int btrfs_removexattr(struct dentry *dentry, const char *name)
+{
+ struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
+ int ret;
+
+ /*
+ * The permission on security.* and system.* is not checked
+ * in permission().
+ */
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
+
+ if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+ return btrfs_set_prop(d_inode(dentry), name,
+ NULL, 0, XATTR_REPLACE);
+
+ return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
+ XATTR_REPLACE);
+}
+
+static int btrfs_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array, void *fs_info)
+{
+ const struct xattr *xattr;
+ struct btrfs_trans_handle *trans = fs_info;
+ char *name;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
+ strcpy(name, XATTR_SECURITY_PREFIX);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ err = __btrfs_setxattr(trans, inode, name,
+ xattr->value, xattr->value_len, 0);
+ kfree(name);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &btrfs_initxattrs, trans);
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000..5049608d1
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __XATTR__
+#define __XATTR__
+
+#include <linux/xattr.h>
+
+extern const struct xattr_handler *btrfs_xattr_handlers[];
+
+extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+ void *buffer, size_t size);
+extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+ struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
+extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size);
+extern int btrfs_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+extern int btrfs_removexattr(struct dentry *dentry, const char *name);
+
+extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr);
+
+#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000..82990b8f8
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include "compression.h"
+
+struct workspace {
+ z_stream strm;
+ char *buf;
+ struct list_head list;
+};
+
+static void zlib_free_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ vfree(workspace->strm.workspace);
+ kfree(workspace->buf);
+ kfree(workspace);
+}
+
+static struct list_head *zlib_alloc_workspace(void)
+{
+ struct workspace *workspace;
+ int workspacesize;
+
+ workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ if (!workspace)
+ return ERR_PTR(-ENOMEM);
+
+ workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+ workspace->strm.workspace = vmalloc(workspacesize);
+ workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!workspace->strm.workspace || !workspace->buf)
+ goto fail;
+
+ INIT_LIST_HEAD(&workspace->list);
+
+ return &workspace->list;
+fail:
+ zlib_free_workspace(&workspace->list);
+ return ERR_PTR(-ENOMEM);
+}
+
+static int zlib_compress_pages(struct list_head *ws,
+ struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret;
+ char *data_in;
+ char *cpage_out;
+ int nr_pages = 0;
+ struct page *in_page = NULL;
+ struct page *out_page = NULL;
+ unsigned long bytes_left;
+
+ *out_pages = 0;
+ *total_out = 0;
+ *total_in = 0;
+
+ if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
+ printk(KERN_WARNING "BTRFS: deflateInit failed\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ workspace->strm.total_in = 0;
+ workspace->strm.total_out = 0;
+
+ in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = kmap(out_page);
+ pages[0] = out_page;
+ nr_pages = 1;
+
+ workspace->strm.next_in = data_in;
+ workspace->strm.next_out = cpage_out;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+ while (workspace->strm.total_in < len) {
+ ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
+ if (ret != Z_OK) {
+ printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
+ ret);
+ zlib_deflateEnd(&workspace->strm);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* we're making it bigger, give up */
+ if (workspace->strm.total_in > 8192 &&
+ workspace->strm.total_in <
+ workspace->strm.total_out) {
+ ret = -E2BIG;
+ goto out;
+ }
+ /* we need another page for writing out. Test this
+ * before the total_in so we will pull in a new page for
+ * the stream end if required
+ */
+ if (workspace->strm.avail_out == 0) {
+ kunmap(out_page);
+ if (nr_pages == nr_dest_pages) {
+ out_page = NULL;
+ ret = -E2BIG;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = kmap(out_page);
+ pages[nr_pages] = out_page;
+ nr_pages++;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.next_out = cpage_out;
+ }
+ /* we're all done */
+ if (workspace->strm.total_in >= len)
+ break;
+
+ /* we've read in a full page, get a new one */
+ if (workspace->strm.avail_in == 0) {
+ if (workspace->strm.total_out > max_out)
+ break;
+
+ bytes_left = len - workspace->strm.total_in;
+ kunmap(in_page);
+ page_cache_release(in_page);
+
+ start += PAGE_CACHE_SIZE;
+ in_page = find_get_page(mapping,
+ start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+ workspace->strm.avail_in = min(bytes_left,
+ PAGE_CACHE_SIZE);
+ workspace->strm.next_in = data_in;
+ }
+ }
+ workspace->strm.avail_in = 0;
+ ret = zlib_deflate(&workspace->strm, Z_FINISH);
+ zlib_deflateEnd(&workspace->strm);
+
+ if (ret != Z_STREAM_END) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (workspace->strm.total_out >= workspace->strm.total_in) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ ret = 0;
+ *total_out = workspace->strm.total_out;
+ *total_in = workspace->strm.total_in;
+out:
+ *out_pages = nr_pages;
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
+ page_cache_release(in_page);
+ }
+ return ret;
+}
+
+static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret = 0, ret2;
+ int wbits = MAX_WBITS;
+ char *data_in;
+ size_t total_out = 0;
+ unsigned long page_in_index = 0;
+ unsigned long page_out_index = 0;
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+ unsigned long buf_start;
+ unsigned long pg_offset;
+
+ data_in = kmap(pages_in[page_in_index]);
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+ workspace->strm.total_in = 0;
+
+ workspace->strm.total_out = 0;
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ pg_offset = 0;
+
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
+ printk(KERN_WARNING "BTRFS: inflateInit failed\n");
+ return -EIO;
+ }
+ while (workspace->strm.total_in < srclen) {
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+
+ buf_start = total_out;
+ total_out = workspace->strm.total_out;
+
+ /* we didn't make progress in this inflate call, we're done */
+ if (buf_start == total_out)
+ break;
+
+ ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+ total_out, disk_start,
+ bvec, vcnt,
+ &page_out_index, &pg_offset);
+ if (ret2 == 0) {
+ ret = 0;
+ goto done;
+ }
+
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+
+ if (workspace->strm.avail_in == 0) {
+ unsigned long tmp;
+ kunmap(pages_in[page_in_index]);
+ page_in_index++;
+ if (page_in_index >= total_pages_in) {
+ data_in = NULL;
+ break;
+ }
+ data_in = kmap(pages_in[page_in_index]);
+ workspace->strm.next_in = data_in;
+ tmp = srclen - workspace->strm.total_in;
+ workspace->strm.avail_in = min(tmp,
+ PAGE_CACHE_SIZE);
+ }
+ }
+ if (ret != Z_STREAM_END)
+ ret = -EIO;
+ else
+ ret = 0;
+done:
+ zlib_inflateEnd(&workspace->strm);
+ if (data_in)
+ kunmap(pages_in[page_in_index]);
+ if (!ret)
+ btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset);
+ return ret;
+}
+
+static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret = 0;
+ int wbits = MAX_WBITS;
+ unsigned long bytes_left;
+ unsigned long total_out = 0;
+ unsigned long pg_offset = 0;
+ char *kaddr;
+
+ destlen = min_t(unsigned long, destlen, PAGE_SIZE);
+ bytes_left = destlen;
+
+ workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = srclen;
+ workspace->strm.total_in = 0;
+
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.total_out = 0;
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->strm.next_in += 2;
+ workspace->strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
+ printk(KERN_WARNING "BTRFS: inflateInit failed\n");
+ return -EIO;
+ }
+
+ while (bytes_left > 0) {
+ unsigned long buf_start;
+ unsigned long buf_offset;
+ unsigned long bytes;
+
+ ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+
+ buf_start = total_out;
+ total_out = workspace->strm.total_out;
+
+ if (total_out == buf_start) {
+ ret = -EIO;
+ break;
+ }
+
+ if (total_out <= start_byte)
+ goto next;
+
+ if (total_out > start_byte && buf_start < start_byte)
+ buf_offset = start_byte - buf_start;
+ else
+ buf_offset = 0;
+
+ bytes = min(PAGE_CACHE_SIZE - pg_offset,
+ PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(bytes, bytes_left);
+
+ kaddr = kmap_atomic(dest_page);
+ memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+ kunmap_atomic(kaddr);
+
+ pg_offset += bytes;
+ bytes_left -= bytes;
+next:
+ workspace->strm.next_out = workspace->buf;
+ workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ }
+
+ if (ret != Z_STREAM_END && bytes_left != 0)
+ ret = -EIO;
+ else
+ ret = 0;
+
+ zlib_inflateEnd(&workspace->strm);
+
+ /*
+ * this should only happen if zlib returned fewer bytes than we
+ * expected. btrfs_get_block is responsible for zeroing from the
+ * end of the inline extent (destlen) to the end of the page
+ */
+ if (pg_offset < destlen) {
+ kaddr = kmap_atomic(dest_page);
+ memset(kaddr + pg_offset, 0, destlen - pg_offset);
+ kunmap_atomic(kaddr);
+ }
+ return ret;
+}
+
+const struct btrfs_compress_op btrfs_zlib_compress = {
+ .alloc_workspace = zlib_alloc_workspace,
+ .free_workspace = zlib_free_workspace,
+ .compress_pages = zlib_compress_pages,
+ .decompress_biovec = zlib_decompress_biovec,
+ .decompress = zlib_decompress,
+};