summary refs log tree commit diff
path: root/fs/btrfs
diff options
context:
space:
mode:
author André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-09-08 01:01:14 -0300
committer André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-09-08 01:01:14 -0300
commit e5fd91f1ef340da553f7a79da9540c3db711c937 (patch)
tree b11842027dc6641da63f4bcc524f8678263304a3 /fs/btrfs
parent 2a9b0348e685a63d97486f6749622b61e9e3292f (diff)
Linux-libre 4.2-gnu
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- fs/btrfs/async-thread.c 1
-rw-r--r-- fs/btrfs/async-thread.h 2
-rw-r--r-- fs/btrfs/backref.c 59
-rw-r--r-- fs/btrfs/btrfs_inode.h 2
-rw-r--r-- fs/btrfs/ctree.c 16
-rw-r--r-- fs/btrfs/ctree.h 29
-rw-r--r-- fs/btrfs/delayed-ref.c 372
-rw-r--r-- fs/btrfs/delayed-ref.h 29
-rw-r--r-- fs/btrfs/dev-replace.c 7
-rw-r--r-- fs/btrfs/disk-io.c 113
-rw-r--r-- fs/btrfs/extent-tree.c 332
-rw-r--r-- fs/btrfs/extent-tree.h 0
-rw-r--r-- fs/btrfs/extent_io.c 11
-rw-r--r-- fs/btrfs/file.c 11
-rw-r--r-- fs/btrfs/free-space-cache.c 14
-rw-r--r-- fs/btrfs/inode.c 116
-rw-r--r-- fs/btrfs/ioctl.c 281
-rw-r--r-- fs/btrfs/ordered-data.c 42
-rw-r--r-- fs/btrfs/ordered-data.h 6
-rw-r--r-- fs/btrfs/qgroup.c 1104
-rw-r--r-- fs/btrfs/qgroup.h 61
-rw-r--r-- fs/btrfs/relocation.c 21
-rw-r--r-- fs/btrfs/scrub.c 57
-rw-r--r-- fs/btrfs/send.c 129
-rw-r--r-- fs/btrfs/super.c 397
-rw-r--r-- fs/btrfs/sysfs.c 148
-rw-r--r-- fs/btrfs/sysfs.h 8
-rw-r--r-- fs/btrfs/tests/qgroup-tests.c 109
-rw-r--r-- fs/btrfs/transaction.c 82
-rw-r--r-- fs/btrfs/transaction.h 24
-rw-r--r-- fs/btrfs/tree-defrag.c 3
-rw-r--r-- fs/btrfs/tree-log.c 218
-rw-r--r-- fs/btrfs/ulist.c 47
-rw-r--r-- fs/btrfs/ulist.h 1
-rw-r--r-- fs/btrfs/volumes.c 254
-rw-r--r-- fs/btrfs/volumes.h 11
36 files changed, 2334 insertions, 1783 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index df9932b00..1ce06c849 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -85,6 +85,7 @@ BTRFS_WORK_HELPER(extent_refs_helper);
BTRFS_WORK_HELPER(scrub_helper);
BTRFS_WORK_HELPER(scrubwrc_helper);
BTRFS_WORK_HELPER(scrubnc_helper);
+BTRFS_WORK_HELPER(scrubparity_helper);
static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index ec2ee477f..b0b093b6a 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
BTRFS_WORK_HELPER_PROTO(scrub_helper);
BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
+
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
unsigned int flags,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 614aaa196..802fabb30 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* the first item to check. But sometimes, we may enter it with
* slot==nritems. In that case, go to the next leaf before we continue.
*/
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
- ret = btrfs_next_old_leaf(root, path, time_seq);
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ if (time_seq == (u64)-1)
+ ret = btrfs_next_leaf(root, path);
+ else
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ }
while (!ret && count < total_refs) {
eb = path->nodes[0];
@@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- ret = btrfs_next_old_item(root, path, time_seq);
+ if (time_seq == (u64)-1)
+ ret = btrfs_next_item(root, path);
+ else
+ ret = btrfs_next_old_item(root, path, time_seq);
}
if (ret > 0)
@@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
+ else if (time_seq == (u64)-1)
+ root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
path->lowest_level = level;
- ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+ if (time_seq == (u64)-1)
+ ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
+ 0, 0);
+ else
+ ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
+ time_seq);
/* root node has been locked, we can release @subvol_srcu safely here */
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -491,7 +505,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(!ref->wanted_disk_byte);
eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -507,7 +523,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
}
/*
- * merge two lists of backrefs and adjust counts accordingly
+ * merge backrefs and adjust counts accordingly
*
* mode = 1: merge identical keys, if key is set
* FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
@@ -535,9 +551,9 @@ static void __merge_refs(struct list_head *head, int mode)
ref2 = list_entry(pos2, struct __prelim_ref, list);
+ if (!ref_for_same_block(ref1, ref2))
+ continue;
if (mode == 1) {
- if (!ref_for_same_block(ref1, ref2))
- continue;
if (!ref1->parent && ref2->parent) {
xchg = ref1;
ref1 = ref2;
@@ -572,8 +588,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct list_head *prefs, u64 *total_refs,
u64 inum)
{
+ struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
- struct rb_node *n = &head->node.rb_node;
struct btrfs_key key;
struct btrfs_key op_key = {0};
int sgn;
@@ -583,12 +599,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
spin_lock(&head->lock);
- n = rb_first(&head->ref_root);
- while (n) {
- struct btrfs_delayed_ref_node *node;
- node = rb_entry(n, struct btrfs_delayed_ref_node,
- rb_node);
- n = rb_next(n);
+ list_for_each_entry(node, &head->ref_list, list) {
if (node->seq > seq)
continue;
@@ -882,6 +893,11 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*
* NOTE: This can return values > 0
*
+ * If time_seq is set to (u64)-1, delayed_refs are not searched and the
+ * behavior is much like the trans == NULL case; the only difference is that
+ * the commit root is not used for the search.
+ * This special case exists so qgroup can search roots in commit_transaction().
+ *
* FIXME some caching might speed things up
*/
static int find_parent_nodes(struct btrfs_trans_handle *trans,
@@ -920,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
+ if (time_seq == (u64)-1)
+ path->skip_locking = 1;
+
/*
* grab both a lock on the path and a lock on the delayed ref head.
* We need both to get a consistent picture of how the refs look
@@ -934,9 +953,10 @@ again:
BUG_ON(ret == 0);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (trans && likely(trans->type != __TRANS_DUMMY)) {
+ if (trans && likely(trans->type != __TRANS_DUMMY) &&
+ time_seq != (u64)-1) {
#else
- if (trans) {
+ if (trans && time_seq != (u64)-1) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1034,7 +1054,10 @@ again:
eb = read_tree_block(fs_info->extent_root,
ref->parent, 0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0ef5cc13f..81220b220 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,8 @@
#define BTRFS_INODE_IN_DELALLOC_LIST 9
#define BTRFS_INODE_READDIO_NEED_LOCK 10
#define BTRFS_INODE_HAS_PROPS 11
+/* DIO is ready to submit */
+#define BTRFS_INODE_DIO_READY 12
/*
* The following 3 bits are meant only for the btree inode.
* When any of them is set, it means an error happened while writing an
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0f11ebc92..54114b488 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1439,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
old = read_tree_block(root, logical, 0);
- if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
- free_extent_buffer(old);
+ if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+ if (!IS_ERR(old))
+ free_extent_buffer(old);
btrfs_warn(root->fs_info,
"failed to read tree block %llu from get_old_root", logical);
} else {
@@ -1685,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (!cur || !uptodate) {
if (!cur) {
cur = read_tree_block(root, blocknr, gen);
- if (!cur || !extent_buffer_uptodate(cur)) {
+ if (IS_ERR(cur)) {
+ return PTR_ERR(cur);
+ } else if (!extent_buffer_uptodate(cur)) {
free_extent_buffer(cur);
return -EIO;
}
@@ -1864,8 +1867,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
btrfs_node_ptr_generation(parent, slot));
- if (eb && !extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
+ if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
+ if (!IS_ERR(eb))
+ free_extent_buffer(eb);
eb = NULL;
}
@@ -2494,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
tmp = read_tree_block(root, blocknr, 0);
- if (tmp) {
+ if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
* it will never end up being up to date. Set ret to EIO now
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6f364e1d8..aac314e14 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -174,7 +174,7 @@ struct btrfs_ordered_sum;
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32 0
-static int btrfs_csum_sizes[] = { 4, 0 };
+static int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
@@ -1619,10 +1619,7 @@ struct btrfs_fs_info {
struct task_struct *cleaner_kthread;
int thread_pool_size;
- struct kobject super_kobj;
struct kobject *space_info_kobj;
- struct kobject *device_dir_kobj;
- struct completion kobj_unregister;
int do_barriers;
int closing;
int log_root_recovering;
@@ -1698,6 +1695,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_nocow_workers;
+ struct btrfs_workqueue *scrub_parity_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
@@ -1735,7 +1733,7 @@ struct btrfs_fs_info {
/* list of dirty qgroups to be written at next commit */
struct list_head dirty_qgroups;
- /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+ /* used by qgroup for an efficient tree traversal */
u64 qgroup_seq;
/* qgroup rescan items */
@@ -1780,6 +1778,7 @@ struct btrfs_fs_info {
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
+ struct mutex delete_unused_bgs_mutex;
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
@@ -3458,6 +3457,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
@@ -3515,6 +3515,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
int __get_raid_index(u64 flags);
int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const u64 type);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -4050,6 +4053,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#ifdef CONFIG_BTRFS_ASSERT
+__cold
static inline void assfail(char *expr, char *file, int line)
{
pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
@@ -4065,10 +4069,12 @@ static inline void assfail(char *expr, char *file, int line)
#define btrfs_assert()
__printf(5, 6)
+__cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno);
@@ -4111,11 +4117,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
*/
-
#define btrfs_abort_transaction(trans, root, errno) \
do { \
- __btrfs_abort_transaction(trans, root, __func__, \
- __LINE__, errno); \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((root)->fs_info->fs_state))) { \
+ WARN(1, KERN_DEBUG \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno)); \
+ } \
+ __btrfs_abort_transaction((trans), (root), __func__, \
+ __LINE__, (errno)); \
} while (0)
#define btrfs_std_error(fs_info, errno) \
@@ -4132,6 +4144,7 @@ do { \
} while (0)
__printf(5, 6)
+__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 8f8ed7d20..ac3e81da6 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -22,6 +22,7 @@
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
+#include "qgroup.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
return 0;
}
-/*
- * entries in the rb tree are ordered by the byte number of the extent,
- * type of the delayed backrefs and content of delayed backrefs.
- */
-static int comp_entry(struct btrfs_delayed_ref_node *ref2,
- struct btrfs_delayed_ref_node *ref1,
- bool compare_seq)
-{
- if (ref1->bytenr < ref2->bytenr)
- return -1;
- if (ref1->bytenr > ref2->bytenr)
- return 1;
- if (ref1->is_head && ref2->is_head)
- return 0;
- if (ref2->is_head)
- return -1;
- if (ref1->is_head)
- return 1;
- if (ref1->type < ref2->type)
- return -1;
- if (ref1->type > ref2->type)
- return 1;
- if (ref1->no_quota > ref2->no_quota)
- return 1;
- if (ref1->no_quota < ref2->no_quota)
- return -1;
- /* merging of sequenced refs is not allowed */
- if (compare_seq) {
- if (ref1->seq < ref2->seq)
- return -1;
- if (ref1->seq > ref2->seq)
- return 1;
- }
- if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
- btrfs_delayed_node_to_tree_ref(ref1),
- ref1->type);
- } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
- ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
- return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
- btrfs_delayed_node_to_data_ref(ref1));
- }
- BUG();
- return 0;
-}
-
-/*
- * insert a new ref into the rbtree. This returns any existing refs
- * for the same (bytenr,parent) tuple, or NULL if the new node was properly
- * inserted.
- */
-static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_delayed_ref_node *entry;
- struct btrfs_delayed_ref_node *ins;
- int cmp;
-
- ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
- rb_node);
-
- cmp = comp_entry(entry, ins, 1);
- if (cmp < 0)
- p = &(*p)->rb_left;
- else if (cmp > 0)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
struct rb_node *node)
@@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
rb_erase(&head->href_node, &delayed_refs->href_root);
} else {
assert_spin_locked(&head->lock);
- rb_erase(&ref->rb_node, &head->ref_root);
+ list_del(&ref->list);
}
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
@@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates--;
}
-static int merge_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head,
- struct btrfs_delayed_ref_node *ref, u64 seq)
-{
- struct rb_node *node;
- int mod = 0;
- int done = 0;
-
- node = rb_next(&ref->rb_node);
- while (!done && node) {
- struct btrfs_delayed_ref_node *next;
-
- next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
- if (seq && next->seq >= seq)
- break;
- if (comp_entry(ref, next, 0))
- continue;
-
- if (ref->action == next->action) {
- mod = next->ref_mod;
- } else {
- if (ref->ref_mod < next->ref_mod) {
- struct btrfs_delayed_ref_node *tmp;
-
- tmp = ref;
- ref = next;
- next = tmp;
- done = 1;
- }
- mod = -next->ref_mod;
- }
-
- drop_delayed_ref(trans, delayed_refs, head, next);
- ref->ref_mod += mod;
- if (ref->ref_mod == 0) {
- drop_delayed_ref(trans, delayed_refs, head, ref);
- done = 1;
- } else {
- /*
- * You can't have multiples of the same ref on a tree
- * block.
- */
- WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
- }
- }
- return done;
-}
-
-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
-{
- struct rb_node *node;
- u64 seq = 0;
-
- assert_spin_locked(&head->lock);
- /*
- * We don't have too much refs to merge in the case of delayed data
- * refs.
- */
- if (head->is_data)
- return;
-
- spin_lock(&fs_info->tree_mod_seq_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- struct seq_list *elem;
-
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- seq = elem->seq;
- }
- spin_unlock(&fs_info->tree_mod_seq_lock);
-
- node = rb_first(&head->ref_root);
- while (node) {
- struct btrfs_delayed_ref_node *ref;
-
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
- /* We can't merge refs that are outside of our seq count */
- if (seq && ref->seq >= seq)
- break;
- if (merge_ref(trans, delayed_refs, head, ref, seq))
- node = rb_first(&head->ref_root);
- else
- node = rb_next(&ref->rb_node);
- }
-}
-
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
u64 seq)
@@ -443,45 +270,71 @@ again:
}
/*
- * helper function to update an extent delayed ref in the
- * rbtree. existing and update must both have the same
- * bytenr and parent
+ * Helper to insert the ref_node to the tail or merge with tail.
*
- * This may free existing if the update cancels out whatever
- * operation it was doing.
+ * Return 0 for insert.
+ * Return >0 for merge.
*/
-static noinline void
-update_existing_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head,
- struct btrfs_delayed_ref_node *existing,
- struct btrfs_delayed_ref_node *update)
+static int
+add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *root,
+ struct btrfs_delayed_ref_head *href,
+ struct btrfs_delayed_ref_node *ref)
{
- if (update->action != existing->action) {
- /*
- * this is effectively undoing either an add or a
- * drop. We decrement the ref_mod, and if it goes
- * down to zero we just delete the entry without
- * every changing the extent allocation tree.
- */
- existing->ref_mod--;
- if (existing->ref_mod == 0)
- drop_delayed_ref(trans, delayed_refs, head, existing);
- else
- WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
- existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ struct btrfs_delayed_ref_node *exist;
+ int mod;
+ int ret = 0;
+
+ spin_lock(&href->lock);
+ /* Check whether we can merge the tail node with ref */
+ if (list_empty(&href->ref_list))
+ goto add_tail;
+ exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
+ list);
+ /* No need to compare bytenr nor is_head */
+ if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
+ exist->seq != ref->seq)
+ goto add_tail;
+
+ if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ exist->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+ comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist),
+ btrfs_delayed_node_to_tree_ref(ref),
+ ref->type))
+ goto add_tail;
+ if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ exist->type == BTRFS_SHARED_DATA_REF_KEY) &&
+ comp_data_refs(btrfs_delayed_node_to_data_ref(exist),
+ btrfs_delayed_node_to_data_ref(ref)))
+ goto add_tail;
+
+ /* Now we are sure we can merge */
+ ret = 1;
+ if (exist->action == ref->action) {
+ mod = ref->ref_mod;
} else {
- WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
- existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
- /*
- * the action on the existing ref matches
- * the action on the ref we're trying to add.
- * Bump the ref_mod by one so the backref that
- * is eventually added/removed has the correct
- * reference count
- */
- existing->ref_mod += update->ref_mod;
+ /* Need to change action */
+ if (exist->ref_mod < ref->ref_mod) {
+ exist->action = ref->action;
+ mod = -exist->ref_mod;
+ exist->ref_mod = ref->ref_mod;
+ } else
+ mod = -ref->ref_mod;
}
+ exist->ref_mod += mod;
+
+ /* remove existing tail if its ref_mod is zero */
+ if (exist->ref_mod == 0)
+ drop_delayed_ref(trans, root, href, exist);
+ spin_unlock(&href->lock);
+ return ret;
+
+add_tail:
+ list_add_tail(&ref->list, &href->ref_list);
+ atomic_inc(&root->num_entries);
+ trans->delayed_ref_updates++;
+ spin_unlock(&href->lock);
+ return ret;
}
/*
@@ -568,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref, u64 bytenr,
- u64 num_bytes, int action, int is_data)
+ struct btrfs_delayed_ref_node *ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr, u64 num_bytes, int action, int is_data)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *qexisting;
int count_mod = 1;
int must_insert_reserved = 0;
@@ -618,10 +473,22 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
- head_ref->ref_root = RB_ROOT;
+ INIT_LIST_HEAD(&head_ref->ref_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
+ /* Record qgroup extent info if provided */
+ if (qrecord) {
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+
+ qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+ qrecord);
+ if (qexisting)
+ kfree(qrecord);
+ }
+
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
@@ -659,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 parent, u64 ref_root, int level,
int action, int no_quota)
{
- struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
+ int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -693,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_tree_ref(ref, full_ref, action);
- spin_lock(&head_ref->lock);
- existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
- if (existing) {
- update_existing_ref(trans, delayed_refs, head_ref, existing,
- ref);
- /*
- * we've updated the existing ref, free the newly
- * allocated ref
- */
+ ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+ /*
+ * XXX: memory should be freed at the same level where it was allocated.
+ * But this bad practice is everywhere... Follow it for now. Needs cleanup.
+ */
+ if (ret > 0)
kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
- } else {
- atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
- }
- spin_unlock(&head_ref->lock);
}
/*
@@ -721,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
u64 offset, int action, int no_quota)
{
- struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
+ int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -758,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_data_ref(ref, full_ref, action);
- spin_lock(&head_ref->lock);
- existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
- if (existing) {
- update_existing_ref(trans, delayed_refs, head_ref, existing,
- ref);
- /*
- * we've updated the existing ref, free the newly
- * allocated ref
- */
+ ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+ if (ret > 0)
kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
- } else {
- atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
- }
- spin_unlock(&head_ref->lock);
}
/*
@@ -790,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
if (!is_fstree(ref_root) || !fs_info->quota_enabled)
no_quota = 0;
@@ -800,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
- if (!head_ref) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
- return -ENOMEM;
+ if (!head_ref)
+ goto free_ref;
+
+ if (fs_info->quota_enabled && is_fstree(ref_root)) {
+ record = kmalloc(sizeof(*record), GFP_NOFS);
+ if (!record)
+ goto free_head_ref;
}
head_ref->extent_op = extent_op;
@@ -814,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, action, 0);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -823,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
spin_unlock(&delayed_refs->lock);
return 0;
+
+free_head_ref:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+free_ref:
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
+ return -ENOMEM;
}
/*
@@ -839,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
if (!is_fstree(ref_root) || !fs_info->quota_enabled)
no_quota = 0;
@@ -854,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
return -ENOMEM;
}
+ if (fs_info->quota_enabled && is_fstree(ref_root)) {
+ record = kmalloc(sizeof(*record), GFP_NOFS);
+ if (!record) {
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep,
+ head_ref);
+ return -ENOMEM;
+ }
+ }
+
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -863,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, action, 1);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -891,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
- num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
- extent_op->is_data);
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
+ num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ extent_op->is_data);
spin_unlock(&delayed_refs->lock);
return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5eb089239..13fb5e609 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -24,9 +24,25 @@
#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+/*
+ * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the
+ * same ref_node structure.
+ * Ref_head is in a higher logic level than tree/data ref, and duplicated
+ * bytenr/num_bytes in ref_node is really a waste of memory, they should be
+ * referred from ref_head.
+ * This gets more disgusting after we use list to store tree/data ref in
+ * ref_head. Must clean this mess up later.
+ */
struct btrfs_delayed_ref_node {
+ /*
+ * ref_head use rb tree, stored in ref_root->href.
+ * indexed by bytenr
+ */
struct rb_node rb_node;
+ /* data/tree refs use a list, stored in ref_head->ref_list. */
+ struct list_head list;
+
/* the starting bytenr of the extent */
u64 bytenr;
@@ -83,7 +99,7 @@ struct btrfs_delayed_ref_head {
struct mutex mutex;
spinlock_t lock;
- struct rb_root ref_root;
+ struct list_head ref_list;
struct rb_node href_node;
@@ -132,6 +148,9 @@ struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root href_root;
+ /* dirty extent records */
+ struct rb_root dirty_extent_root;
+
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
@@ -156,6 +175,14 @@ struct btrfs_delayed_ref_root {
int flushing;
u64 run_delayed_start;
+
+ /*
+ * Make qgroup skip the given root.
+ * This is for snapshot, as btrfs_qgroup_inherit() will manually
+ * modify counters for snapshot and its source, so we should skip
+ * the snapshot in new_root/old_roots or it will get calculated twice
+ */
+ u64 qgroup_to_skip;
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 0573848c7..564a7de17 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
+ ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
+ if (ret)
+ btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
printk_in_rcu(KERN_INFO
"BTRFS: dev_replace from %s (devid %llu) to %s started\n",
src_device->missing ? "<missing disk>" :
@@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info, src_device);
- btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
/* write back the superblocks */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2ef9a4b72..f556c3732 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1149,12 +1149,12 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret) {
free_extent_buffer(buf);
- return NULL;
+ return ERR_PTR(ret);
}
return buf;
@@ -1509,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
generation);
- if (!root->node) {
- ret = -ENOMEM;
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
goto find_fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- goto read_fail;
+ free_extent_buffer(root->node);
+ goto find_fail;
}
root->commit_root = btrfs_root_node(root);
out:
btrfs_free_path(path);
return root;
-read_fail:
- free_extent_buffer(root->node);
find_fail:
kfree(root);
alloc_fail:
@@ -1745,13 +1744,14 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
- bio_endio_nodec(bio, error);
+ bio_endio(bio, error);
}
static int cleaner_kthread(void *arg)
{
struct btrfs_root *root = arg;
int again;
+ struct btrfs_trans_handle *trans;
do {
again = 0;
@@ -1773,7 +1773,6 @@ static int cleaner_kthread(void *arg)
}
btrfs_run_delayed_iputs(root);
- btrfs_delete_unused_bgs(root->fs_info);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -1782,6 +1781,16 @@ static int cleaner_kthread(void *arg)
* needn't do anything special here.
*/
btrfs_run_defrag_inodes(root->fs_info);
+
+ /*
+ * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
+ * with relocation (btrfs_relocate_chunk) and relocation
+ * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
+ * after acquiring fs_info->delete_unused_bgs_mutex. So we
+ * can't hold, nor need to, fs_info->cleaner_mutex when deleting
+ * unused block groups.
+ */
+ btrfs_delete_unused_bgs(root->fs_info);
sleep:
if (!try_to_freeze() && !again) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -1790,6 +1799,34 @@ sleep:
__set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
+
+ /*
+ * Transaction kthread is stopped before us and wakes us up.
+ * However we might have started a new transaction and COWed some
+ * tree blocks when deleting unused block groups for example. So
+ * make sure we commit the transaction we started to have a clean
+ * shutdown when evicting the btree inode - if it has dirty pages
+ * when we do the final iput() on it, eviction will trigger a
+ * writeback for it which will fail with null pointer dereferences
+ * since work queues and other resources were already released and
+ * destroyed by the time the iput/eviction/writeback is made.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ btrfs_err(root->fs_info,
+ "cleaner transaction attach returned %ld",
+ PTR_ERR(trans));
+ } else {
+ int ret;
+
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ btrfs_err(root->fs_info,
+ "cleaner open transaction commit returned %d",
+ ret);
+ }
+
return 0;
}
@@ -2320,8 +2357,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = read_tree_block(tree_root, bytenr,
fs_info->generation + 1);
- if (!log_tree_root->node ||
- !extent_buffer_uptodate(log_tree_root->node)) {
+ if (IS_ERR(log_tree_root->node)) {
+ printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ ret = PTR_ERR(log_tree_root->node);
+ kfree(log_tree_root);
+ return ret;
+ } else if (!extent_buffer_uptodate(log_tree_root->node)) {
printk(KERN_ERR "BTRFS: failed to read log tree\n");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
@@ -2489,12 +2530,12 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
+ mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
seqlock_init(&fs_info->profiles_lock);
init_rwsem(&fs_info->delayed_iput_sem);
- init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -2797,10 +2838,11 @@ int open_ctree(struct super_block *sb,
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
generation);
- if (!chunk_root->node ||
- !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ if (IS_ERR(chunk_root->node) ||
+ !extent_buffer_uptodate(chunk_root->node)) {
printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
+ chunk_root->node = NULL;
goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
@@ -2834,11 +2876,11 @@ retry_root_backup:
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
generation);
- if (!tree_root->node ||
- !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ if (IS_ERR(tree_root->node) ||
+ !extent_buffer_uptodate(tree_root->node)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
sb->s_id);
-
+ tree_root->node = NULL;
goto recovery_tree_root;
}
@@ -2874,10 +2916,22 @@ retry_root_backup:
btrfs_close_extra_devices(fs_devices, 1);
+ ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_sysfs_add_device(fs_devices);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
+ goto fail_fsdev_sysfs;
+ }
+
ret = btrfs_sysfs_add_one(fs_info);
if (ret) {
pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
- goto fail_block_groups;
+ goto fail_fsdev_sysfs;
}
ret = btrfs_init_space_info(fs_info);
@@ -3055,6 +3109,9 @@ fail_cleaner:
fail_sysfs:
btrfs_sysfs_remove_one(fs_info);
+fail_fsdev_sysfs:
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
+
fail_block_groups:
btrfs_put_block_group_cache(fs_info);
btrfs_free_block_groups(fs_info);
@@ -3269,11 +3326,8 @@ static int write_dev_supers(struct btrfs_device *device,
*/
static void btrfs_end_empty_barrier(struct bio *bio, int err)
{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ if (err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
- }
if (bio->bi_private)
complete(bio->bi_private);
bio_put(bio);
@@ -3301,11 +3355,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
- if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
- rcu_str_deref(device->name));
- device->nobarriers = 1;
- } else if (!bio_flagged(bio, BIO_UPTODATE)) {
+ if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_FLUSH_ERRS);
@@ -3732,6 +3782,7 @@ void close_ctree(struct btrfs_root *root)
}
btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
btrfs_free_fs_roots(fs_info);
@@ -4060,6 +4111,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_node *tmp;
bool pin_bytes = false;
head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -4075,11 +4127,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
spin_lock(&head->lock);
- while ((node = rb_first(&head->ref_root)) != NULL) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
+ list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
+ list) {
ref->in_tree = 0;
- rb_erase(&ref->rb_node, &head->ref_root);
+ list_del(&ref->list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ec3acd14..07204bf60 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
+ struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extra_op,
- int no_quota);
+ struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
@@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_ref_node *node,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
- int no_quota,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1978,9 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
struct btrfs_key key;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
u64 refs;
int ret;
- enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
+ int no_quota = node->no_quota;
path = btrfs_alloc_path();
if (!path)
@@ -1996,26 +1996,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
bytenr, num_bytes, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
- if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+ if ((ret < 0 && ret != -EAGAIN) || !ret)
goto out;
- /*
- * Ok we were able to insert an inline extent and it appears to be a new
- * reference, deal with the qgroup accounting.
- */
- if (!ret && !no_quota) {
- ASSERT(root->fs_info->quota_enabled);
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- item = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
- type = BTRFS_QGROUP_OPER_ADD_SHARED;
- btrfs_release_path(path);
-
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- bytenr, num_bytes, type, 0);
- goto out;
- }
/*
* Ok we had -EAGAIN which means we didn't have space to insert and
@@ -2026,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, item);
- if (refs)
- type = BTRFS_QGROUP_OPER_ADD_SHARED;
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
@@ -2035,13 +2015,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- if (!no_quota) {
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- bytenr, num_bytes, type, 0);
- if (ret)
- goto out;
- }
-
path->reada = 1;
path->leave_spinning = 1;
/* now insert the actual backref */
@@ -2087,17 +2060,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
ref->objectid, ref->offset,
&ins, node->ref_mod);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
- node->num_bytes, parent,
+ ret = __btrfs_inc_extent_ref(trans, root, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- node->no_quota, extent_op);
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node->bytenr,
- node->num_bytes, parent,
+ ret = __btrfs_free_extent(trans, root, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op, node->no_quota);
+ extent_op);
} else {
BUG();
}
@@ -2255,15 +2226,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref->level, &ins,
node->no_quota);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
- node->num_bytes, parent, ref_root,
- ref->level, 0, 1, node->no_quota,
+ ret = __btrfs_inc_extent_ref(trans, root, node,
+ parent, ref_root,
+ ref->level, 0, 1,
extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node->bytenr,
- node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op,
- node->no_quota);
+ ret = __btrfs_free_extent(trans, root, node,
+ parent, ref_root,
+ ref->level, 0, 1, extent_op);
} else {
BUG();
}
@@ -2323,28 +2293,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline struct btrfs_delayed_ref_node *
+static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
- struct rb_node *node;
- struct btrfs_delayed_ref_node *ref, *last = NULL;;
+ struct btrfs_delayed_ref_node *ref;
+
+ if (list_empty(&head->ref_list))
+ return NULL;
/*
- * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
- * this prevents ref count from going down to zero when
- * there still are pending delayed ref.
+ * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * This is to prevent a ref count from going down to zero, which deletes
+ * the extent item from the extent tree, when there still are references
+ * to add, which would fail because they would not find the extent item.
*/
- node = rb_first(&head->ref_root);
- while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
+ list_for_each_entry(ref, &head->ref_list, list) {
if (ref->action == BTRFS_ADD_DELAYED_REF)
return ref;
- else if (last == NULL)
- last = ref;
- node = rb_next(node);
}
- return last;
+
+ return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
+ list);
}
/*
@@ -2396,16 +2365,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
}
- /*
- * We need to try and merge add/drops of the same ref since we
- * can run into issues with relocate dropping the implicit ref
- * and then it being added back again before the drop can
- * finish. If we merged anything we need to re-loop so we can
- * get a good ref.
- */
spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
- locked_ref);
/*
* locked_ref is the head node, so we have to go one
@@ -2482,7 +2442,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&locked_ref->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&locked_ref->lock);
- if (rb_first(&locked_ref->ref_root) ||
+ if (!list_empty(&locked_ref->ref_list) ||
locked_ref->extent_op) {
spin_unlock(&locked_ref->lock);
spin_unlock(&delayed_refs->lock);
@@ -2496,7 +2456,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
} else {
actual_count++;
ref->in_tree = 0;
- rb_erase(&ref->rb_node, &locked_ref->ref_root);
+ list_del(&ref->list);
}
atomic_dec(&delayed_refs->num_entries);
@@ -2864,9 +2824,6 @@ again:
goto again;
}
out:
- ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
- if (ret)
- return ret;
assert_qgroups_uptodate(trans);
return 0;
}
@@ -2905,7 +2862,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *node;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
@@ -2934,11 +2890,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
spin_lock(&head->lock);
- node = rb_first(&head->ref_root);
- while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
-
+ list_for_each_entry(ref, &head->ref_list, list) {
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
ret = 1;
@@ -3693,7 +3645,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
- found->full = 0;
+ if (total_bytes > 0)
+ found->full = 0;
spin_unlock(&found->lock);
*space_info = found;
return 0;
@@ -3721,7 +3674,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_reserved = 0;
found->bytes_readonly = 0;
found->bytes_may_use = 0;
- found->full = 0;
+ if (total_bytes > 0)
+ found->full = 0;
+ else
+ found->full = 1;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
@@ -3975,6 +3931,9 @@ commit_trans:
!atomic_read(&root->fs_info->open_ioctl_trans)) {
need_commit--;
+ if (need_commit > 0)
+ btrfs_wait_ordered_roots(fs_info, -1);
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -4088,7 +4047,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
return 1;
}
-static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
{
u64 num_dev;
@@ -4102,24 +4061,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
else
num_dev = 1; /* DUP or single */
- /* metadata for updaing devices and chunk tree */
- return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+ return num_dev;
}
-static void check_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 type)
+/*
+ * Reserve, in the chunk block reserve, the metadata space needed to update the
+ * device items and to add or remove the chunk item for a chunk of profile
+ * @type. The same amount is reserved for chunk allocation and chunk removal.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 type)
{
struct btrfs_space_info *info;
u64 left;
u64 thresh;
+ int ret = 0;
+ u64 num_devs;
+
+ /*
+ * Needed because we can end up allocating a system chunk and for an
+ * atomic and race free space reservation in the chunk block reserve.
+ */
+ ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
left = info->total_bytes - info->bytes_used - info->bytes_pinned -
- info->bytes_reserved - info->bytes_readonly;
+ info->bytes_reserved - info->bytes_readonly -
+ info->bytes_may_use;
spin_unlock(&info->lock);
- thresh = get_system_chunk_thresh(root, type);
+ num_devs = get_profile_num_devs(root, type);
+
+ /* num_devs device items to update and 1 chunk item to add or remove */
+ thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
+ btrfs_calc_trans_metadata_size(root, 1);
+
if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
left, thresh, type);
@@ -4130,7 +4108,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
u64 flags;
flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
- btrfs_alloc_chunk(trans, root, flags);
+ /*
+ * Ignore failure to create system chunk. We might end up not
+ * needing it, as we might not need to COW all nodes/leafs from
+ * the paths we visit in the chunk tree (they were already COWed
+ * or created in the current transaction for example).
+ */
+ ret = btrfs_alloc_chunk(trans, root, flags);
+ }
+
+ if (!ret) {
+ ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
+ &root->fs_info->chunk_block_rsv,
+ thresh, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->chunk_bytes_reserved += thresh;
}
}
@@ -4235,6 +4227,24 @@ out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
+ /*
+ * When we allocate a new chunk we reserve space in the chunk block
+ * reserve to make sure we can COW nodes/leafs in the chunk tree or
+ * add new nodes/leafs to it if we end up needing to do it when
+ * inserting the chunk item and updating device items as part of the
+ * second phase of chunk allocation, performed by
+ * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+ * large number of new block groups to create in our transaction
+ * handle's new_bgs list to avoid exhausting the chunk block reserve
+ * in extreme cases - like having a single transaction create many new
+ * block groups when starting to write out the free space caches of all
+ * the block groups that were made dirty during the lifetime of the
+ * transaction.
+ */
+ if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+ btrfs_create_pending_block_groups(trans, trans->root);
+ btrfs_trans_release_chunk_metadata(trans);
+ }
return ret;
}
@@ -5188,6 +5198,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
trans->bytes_reserved = 0;
}
+/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->root->fs_info;
+
+ if (!trans->chunk_bytes_reserved)
+ return;
+
+ WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+ block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
+ trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved = 0;
+}
+
/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
@@ -6092,11 +6120,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
+ struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -6110,10 +6137,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
+ int no_quota = node->no_quota;
u32 item_size;
u64 refs;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
int last_ref = 0;
- enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
@@ -6294,7 +6323,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs -= refs_to_drop;
if (refs > 0) {
- type = BTRFS_QGROUP_OPER_SUB_SHARED;
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
/*
@@ -6356,18 +6384,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- /* Deal with the quota accounting */
- if (!ret && last_ref && !no_quota) {
- int mod_seq = 0;
-
- if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
- type == BTRFS_QGROUP_OPER_SUB_SHARED)
- mod_seq = 1;
-
- ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
- bytenr, num_bytes, type,
- mod_seq);
- }
out:
btrfs_free_path(path);
return ret;
@@ -6393,7 +6409,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (rb_first(&head->ref_root))
+ if (!list_empty(&head->ref_list))
goto out;
if (head->extent_op) {
@@ -7303,13 +7319,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- /* Always set parent to 0 here since its exclusive anyway. */
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- ins->objectid, ins->offset,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
- if (ret)
- return ret;
-
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7391,14 +7400,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- if (!no_quota) {
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- ins->objectid, num_bytes,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
- if (ret)
- return ret;
- }
-
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
@@ -7755,12 +7756,18 @@ reada:
wc->reada_slot = slot;
}
+/*
+ * TODO: Modify related function to add related node/leaf to dirty_extent_root,
+ * for later qgroup accounting.
+ *
+ * Currently, this function does nothing.
+ */
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
- int i, extent_type, ret;
+ int i, extent_type;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
@@ -7783,13 +7790,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-
- ret = btrfs_qgroup_record_ref(trans, root->fs_info,
- root->objectid,
- bytenr, num_bytes,
- BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
- if (ret)
- return ret;
}
return 0;
}
@@ -7858,6 +7858,8 @@ static int adjust_slots_upwards(struct btrfs_root *root,
/*
* root_eb is the subtree root and is locked before this function is called.
+ * TODO: Modify this function to add all nodes (including completely shared
+ * nodes) to dirty_extent_root so that they get accounted in qgroup.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -7920,7 +7922,11 @@ walk_down:
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
eb = read_tree_block(root, child_bytenr, child_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
ret = -EIO;
goto out;
}
@@ -7931,16 +7937,6 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-
- ret = btrfs_qgroup_record_ref(trans, root->fs_info,
- root->objectid,
- child_bytenr,
- root->nodesize,
- BTRFS_QGROUP_OPER_SUB_SUBTREE,
- 0);
- if (ret)
- goto out;
-
}
if (level == 0) {
@@ -8151,7 +8147,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(root, bytenr, generation);
- if (!next || !extent_buffer_uptodate(next)) {
+ if (IS_ERR(next)) {
+ return PTR_ERR(next);
+ } else if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
}
@@ -8533,24 +8531,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
- /*
- * Qgroup update accounting is run from
- * delayed ref handling. This usually works
- * out because delayed refs are normally the
- * only way qgroup updates are added. However,
- * we may have added updates during our tree
- * walk so run qgroups here to make sure we
- * don't lose any updates.
- */
- ret = btrfs_delayed_qgroup_accounting(trans,
- root->fs_info);
- if (ret)
- printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
- "running qgroup updates "
- "during snapshot delete. "
- "Quota is out of sync, "
- "rescan required.\n", ret);
-
btrfs_end_transaction_throttle(trans, tree_root);
if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8604,14 +8584,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
root_dropped = true;
out_end_trans:
- ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
- if (ret)
- printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
- "running qgroup updates "
- "during snapshot delete. "
- "Quota is out of sync, "
- "rescan required.\n", ret);
-
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
@@ -9562,6 +9534,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+ /*
+ * Call to ensure the corresponding space_info object is created and
+ * assigned to our block group, but don't update its counters just yet.
+ * We want our bg to be added to the rbtree with its ->space_info set.
+ */
+ ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+ &cache->space_info);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
ret = btrfs_add_block_group_cache(root->fs_info, cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
@@ -9569,6 +9554,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return ret;
}
+ /*
+ * Now that our block group has its ->space_info set and is inserted in
+ * the rbtree, update the space info's counters.
+ */
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
if (ret) {
@@ -9931,6 +9920,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
@@ -10025,6 +10016,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
end_trans:
btrfs_end_transaction(trans, root);
next:
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/fs/btrfs/extent-tree.h
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c32d226bf..02d05817c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
}
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
@@ -2767,8 +2772,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
else
btrfsic_submit_bio(rw, bio);
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
bio_put(bio);
return ret;
}
@@ -4492,6 +4495,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
free_extent_map(em);
em = NULL;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b072e1747..b823fac91 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1748,7 +1748,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
}
current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
@@ -1868,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_log_ctx ctx;
int ret = 0;
bool full_sync = 0;
+ const u64 len = end - start + 1;
trace_btrfs_sync_file(file, datasync);
@@ -1896,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* all extents are persisted and the respective file extent
* items are in the fs/subvol btree.
*/
- ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+ ret = btrfs_wait_ordered_range(inode, start, len);
} else {
/*
* Start any new ordered operations before starting to log the
@@ -1968,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- (full_sync && BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed)) {
+ (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed &&
+ (full_sync ||
+ !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
/*
* We'v had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9dbe5b548..fb5a6b1c6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -231,6 +231,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
{
int ret = 0;
struct btrfs_path *path = btrfs_alloc_path();
+ bool locked = false;
if (!path) {
ret = -ENOMEM;
@@ -238,6 +239,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
}
if (block_group) {
+ locked = true;
mutex_lock(&trans->transaction->cache_write_mutex);
if (!list_empty(&block_group->io_list)) {
list_del_init(&block_group->io_list);
@@ -269,18 +271,14 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
- if (ret) {
- mutex_unlock(&trans->transaction->cache_write_mutex);
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
+ if (ret)
+ goto fail;
ret = btrfs_update_inode(trans, root, inode);
- if (block_group)
- mutex_unlock(&trans->transaction->cache_write_mutex);
-
fail:
+ if (locked)
+ mutex_unlock(&trans->transaction->cache_write_mutex);
if (ret)
btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bb013672..e33dff356 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4209,7 +4209,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 extent_num_bytes = 0;
u64 extent_offset = 0;
u64 item_end = 0;
- u64 last_size = (u64)-1;
+ u64 last_size = new_size;
u32 found_type = (u8)-1;
int found_extent;
int del_item;
@@ -4493,8 +4493,7 @@ out:
btrfs_abort_transaction(trans, root, ret);
}
error:
- if (last_size != (u64)-1 &&
- root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
btrfs_ordered_update_i_size(inode, last_size, NULL);
btrfs_free_path(path);
@@ -4986,24 +4985,41 @@ static void evict_inode_truncate_pages(struct inode *inode)
}
write_unlock(&map_tree->lock);
+ /*
+ * Keep looping until we have no more ranges in the io tree.
+ * We can have ongoing bios started by readpages (called from readahead)
+ * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ * still in progress (unlocked the pages in the bio but did not yet
+ * unlock the ranges in the io tree). Therefore this means some
+ * ranges can still be locked and eviction started because before
+ * submitting those bios, which are executed by a separate task (work
+ * queue kthread), inode references (inode->i_count) were not taken
+ * (which would be dropped in the end io callback of each bio).
+ * Therefore here we effectively end up waiting for those bios and
+ * anyone else holding locked ranges without having bumped the inode's
+ * reference count - if we don't do it, when they access the inode's
+ * io_tree to unlock a range it may be too late, leading to an
+ * use-after-free issue.
+ */
spin_lock(&io_tree->lock);
while (!RB_EMPTY_ROOT(&io_tree->state)) {
struct extent_state *state;
struct extent_state *cached_state = NULL;
+ u64 start;
+ u64 end;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
- atomic_inc(&state->refs);
+ start = state->start;
+ end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, state->start, state->end,
- 0, &cached_state);
- clear_extent_bit(io_tree, state->start, state->end,
+ lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 1,
&cached_state, GFP_NOFS);
- free_extent_state(state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -7530,6 +7546,7 @@ unlock:
current->journal_info = outstanding_extents;
btrfs_free_reserved_data_space(inode, len);
+ set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags);
}
/*
@@ -7855,8 +7872,6 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
struct bio *dio_bio;
int ret;
- if (err)
- goto out_done;
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
@@ -7879,7 +7894,6 @@ out_test:
ordered = NULL;
goto again;
}
-out_done:
dio_bio = dip->dio_bio;
kfree(dip);
@@ -8147,9 +8161,8 @@ out_err:
static void btrfs_submit_direct(int rw, struct bio *dio_bio,
struct inode *inode, loff_t file_offset)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_dio_private *dip;
- struct bio *io_bio;
+ struct btrfs_dio_private *dip = NULL;
+ struct bio *io_bio = NULL;
struct btrfs_io_bio *btrfs_bio;
int skip_sum;
int write = rw & REQ_WRITE;
@@ -8166,7 +8179,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip = kzalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
ret = -ENOMEM;
- goto free_io_bio;
+ goto free_ordered;
}
dip->private = dio_bio->bi_private;
@@ -8194,25 +8207,55 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
if (btrfs_bio->end_io)
btrfs_bio->end_io(btrfs_bio, ret);
-free_io_bio:
- bio_put(io_bio);
free_ordered:
/*
- * If this is a write, we need to clean up the reserved space and kill
- * the ordered extent.
+ * If we arrived here it means we either failed to submit the dip,
+ * failed to clone the dio_bio or failed to allocate the
+ * dip. If we cloned the dio_bio and allocated the dip, we can just
+ * call bio_endio against our io_bio so that we get proper resource
+ * cleanup if we fail to submit the dip, otherwise, we must do the
+ * same as btrfs_endio_direct_[write|read] because we can't call these
+ * callbacks - they require an allocated dip and a clone of dio_bio.
*/
- if (write) {
- struct btrfs_ordered_extent *ordered;
- ordered = btrfs_lookup_ordered_extent(inode, file_offset);
- if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
- btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len, 1);
- btrfs_put_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
+ if (io_bio && dip) {
+ bio_endio(io_bio, ret);
+ /*
+ * The end io callbacks free our dip, do the final put on io_bio
+ * and all the cleanup and final put for dio_bio (through
+ * dio_end_io()).
+ */
+ dip = NULL;
+ io_bio = NULL;
+ } else {
+ if (write) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_ordered_extent(inode,
+ file_offset);
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+ /*
+ * Decrements our ref on the ordered extent and removes
+ * the ordered extent from the inode's ordered tree,
+ * doing all the proper resource cleanup such as for the
+ * reserved space and waking up any waiters for this
+ * ordered extent (through btrfs_remove_ordered_extent).
+ */
+ btrfs_finish_ordered_io(ordered);
+ } else {
+ unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+ file_offset + dio_bio->bi_iter.bi_size - 1);
+ }
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ /*
+ * Releases and cleans up our dio_bio, no need to bio_put()
+ * nor bio_endio()/bio_io_error() against dio_bio.
+ */
+ dio_end_io(dio_bio, ret);
}
- bio_endio(dio_bio, ret);
+ if (io_bio)
+ bio_put(io_bio);
+ kfree(dip);
}
static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
@@ -8314,9 +8357,18 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
btrfs_submit_direct, flags);
if (iov_iter_rw(iter) == WRITE) {
current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED)
- btrfs_delalloc_release_space(inode, count);
- else if (ret >= 0 && (size_t)ret < count)
+ if (ret < 0 && ret != -EIOCBQUEUED) {
+ /*
+ * If the error comes from submitting stage,
+ * btrfs_get_blocks_direct() has free'd data space,
+ * and metadata space will be handled by
+ * finish_ordered_fn, don't do that again to make
+ * sure bytes_may_use is correct.
+ */
+ if (!test_and_clear_bit(BTRFS_INODE_DIO_READY,
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_delalloc_release_space(inode, count);
+ } else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode,
count - (size_t)ret);
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 37d456a9a..0770c9158 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
static int btrfs_clone(struct inode *src, struct inode *inode,
- u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+ u64 off, u64 olen, u64 olen_aligned, u64 destoff,
+ int no_time_update);
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -553,8 +554,8 @@ static noinline int create_subvol(struct inode *dir,
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
if (IS_ERR(new_root)) {
- btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
ret = PTR_ERR(new_root);
+ btrfs_abort_transaction(trans, root, ret);
goto fail;
}
@@ -1318,7 +1319,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i = range->start >> PAGE_CACHE_SHIFT;
}
if (!max_to_defrag)
- max_to_defrag = last_index + 1;
+ max_to_defrag = last_index - i + 1;
/*
* make writeback starts from i, so the defrag range can be
@@ -1368,7 +1369,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index = max(i, ra_index);
btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
cluster);
- ra_index += max_cluster;
+ ra_index += cluster;
}
mutex_lock(&inode->i_mutex);
@@ -2271,10 +2272,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
{
struct btrfs_ioctl_ino_lookup_args *args;
struct inode *inode;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ int ret = 0;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
@@ -2282,13 +2280,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
inode = file_inode(file);
+ /*
+ * Unprivileged query to obtain the containing subvolume root id. The
+ * path is reset so it's consistent with btrfs_search_path_in_tree.
+ */
if (args->treeid == 0)
args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+ if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+ args->name[0] = 0;
+ goto out;
+ }
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
+ }
+
ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
args->treeid, args->objectid,
args->name);
+out:
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
@@ -2753,14 +2766,11 @@ out:
return ret;
}
-static struct page *extent_same_get_page(struct inode *inode, u64 off)
+static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
{
struct page *page;
- pgoff_t index;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- index = off >> PAGE_CACHE_SHIFT;
-
page = grab_cache_page(inode->i_mapping, index);
if (!page)
return NULL;
@@ -2781,6 +2791,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off)
return page;
}
+/*
+ * Fill pages[0..num_pages) with the result of extent_same_get_page() for
+ * consecutive page indexes of @inode, starting at the page that contains
+ * byte offset @off.
+ *
+ * Returns 0 when every slot was filled, or -ENOMEM as soon as one lookup
+ * returns NULL. On failure the slots filled so far are left in place and
+ * are not released here; cleanup is left to the caller.
+ */
+static int gather_extent_pages(struct inode *inode, struct page **pages,
+ int num_pages, u64 off)
+{
+ int i;
+ pgoff_t index = off >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = extent_same_get_page(inode, index + i);
+ if (!pages[i])
+ return -ENOMEM;
+ }
+ return 0;
+}
+
static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
{
/* do any pending delalloc/csum calc on src, one way or
@@ -2806,52 +2830,120 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
}
}
-static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+/*
+ * Release the i_mutex of both inodes. Both mutexes are unlocked
+ * unconditionally, so this must not be called with inode1 == inode2.
+ */
+static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-
mutex_unlock(&inode1->i_mutex);
mutex_unlock(&inode2->i_mutex);
}
-static void btrfs_double_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+/*
+ * Take the i_mutex of both inodes. The two pointers are ordered by address
+ * first, so the mutexes are always acquired in the same order regardless of
+ * argument order. When both arguments are the same inode only one mutex is
+ * taken.
+ */
+static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 < inode2)
+ swap(inode1, inode2);
+
+ mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+ if (inode1 != inode2)
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+}
+
+/*
+ * Unlock the byte range [loffN, loffN + len - 1] in the io_tree of each of
+ * the two inodes. Both ranges are unlocked unconditionally.
+ */
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+/*
+ * Lock @len bytes starting at @loff1 in @inode1 and at @loff2 in @inode2 via
+ * lock_extent_range(). The (inode, offset) pairs are ordered by inode
+ * address first so the ranges are always locked in the same order. When both
+ * arguments are the same inode only the first range is locked.
+ */
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
{
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
}
-
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
lock_extent_range(inode1, loff1, len);
- if (inode1 != inode2) {
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ if (inode1 != inode2)
lock_extent_range(inode2, loff2, len);
+}
+
+/*
+ * Page pointers gathered up front for comparing a source and a destination
+ * range. Filled by btrfs_cmp_data_prepare(), released by
+ * btrfs_cmp_data_free().
+ */
+struct cmp_pages {
+ int num_pages; /* number of entries in each of the two arrays */
+ struct page **src_pages; /* pages of the source range */
+ struct page **dst_pages; /* pages of the destination range */
+};
+
+/*
+ * Drop the page cache reference on every non-NULL page in both arrays of
+ * @cmp, then free the arrays themselves. The pointers and num_pages in @cmp
+ * are not reset, so @cmp must not be used or freed again afterwards.
+ */
+static void btrfs_cmp_data_free(struct cmp_pages *cmp)
+{
+ int i;
+ struct page *pg;
+
+ for (i = 0; i < cmp->num_pages; i++) {
+ pg = cmp->src_pages[i];
+ if (pg)
+ page_cache_release(pg);
+ pg = cmp->dst_pages[i];
+ if (pg)
+ page_cache_release(pg);
+ }
+ kfree(cmp->src_pages);
+ kfree(cmp->dst_pages);
+}
+
+/*
+ * Allocate the two page-pointer arrays of @cmp and gather the pages that
+ * cover @len bytes at @loff in @src and at @dst_loff in @dst.
+ *
+ * Returns 0 on success. Returns -ENOMEM if an array cannot be allocated, or
+ * the error from gather_extent_pages() if a page cannot be obtained. On any
+ * failure everything gathered so far has already been released, so the
+ * caller must neither use @cmp nor call btrfs_cmp_data_free() on it.
+ */
+static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
+ struct inode *dst, u64 dst_loff,
+ u64 len, struct cmp_pages *cmp)
+{
+ int ret;
+ int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+ struct page **src_pgarr, **dst_pgarr;
+
+ /*
+ * We must gather up all the pages before we initiate our
+ * extent locking. We use an array for the page pointers. Size
+ * of the array is bounded by len, which is in turn bounded by
+ * BTRFS_MAX_DEDUPE_LEN.
+ */
+ src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+ dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+ if (!src_pgarr || !dst_pgarr) {
+ kfree(src_pgarr);
+ kfree(dst_pgarr);
+ return -ENOMEM;
}
+ cmp->num_pages = num_pages;
+ cmp->src_pages = src_pgarr;
+ cmp->dst_pages = dst_pgarr;
+
+ ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff);
+ if (ret)
+ goto out;
+
+ ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff);
+
+out:
+ if (ret)
+ btrfs_cmp_data_free(cmp);
+ /*
+ * Propagate the failure: returning 0 here would let the caller go on
+ * to compare and then free page arrays that were just freed above.
+ */
+ return ret;
}
static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
- u64 dst_loff, u64 len)
+ u64 dst_loff, u64 len, struct cmp_pages *cmp)
{
int ret = 0;
+ int i;
struct page *src_page, *dst_page;
unsigned int cmp_len = PAGE_CACHE_SIZE;
void *addr, *dst_addr;
+ i = 0;
while (len) {
if (len < PAGE_CACHE_SIZE)
cmp_len = len;
- src_page = extent_same_get_page(src, loff);
- if (!src_page)
- return -EINVAL;
- dst_page = extent_same_get_page(dst, dst_loff);
- if (!dst_page) {
- page_cache_release(src_page);
- return -EINVAL;
- }
+ BUG_ON(i >= cmp->num_pages);
+
+ src_page = cmp->src_pages[i];
+ dst_page = cmp->dst_pages[i];
+
addr = kmap_atomic(src_page);
dst_addr = kmap_atomic(dst_page);
@@ -2863,26 +2955,30 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
kunmap_atomic(addr);
kunmap_atomic(dst_addr);
- page_cache_release(src_page);
- page_cache_release(dst_page);
if (ret)
break;
- loff += cmp_len;
- dst_loff += cmp_len;
len -= cmp_len;
+ i++;
}
return ret;
}
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
+ u64 olen)
{
+ u64 len = *plen;
u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
- if (off + len > inode->i_size || off + len < off)
+ if (off + olen > inode->i_size || off + olen < off)
return -EINVAL;
+
+ /* if we extend to eof, continue to block boundary */
+ if (off + len == inode->i_size)
+ *plen = len = ALIGN(inode->i_size, bs) - off;
+
/* Check that we are block aligned - btrfs_clone() requires this */
if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
return -EINVAL;
@@ -2890,31 +2986,67 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
return 0;
}
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
+ u64 len = olen;
+ struct cmp_pages cmp;
+ int same_inode = 0;
+ u64 same_lock_start = 0;
+ u64 same_lock_len = 0;
- /*
- * btrfs_clone() can't handle extents in the same file
- * yet. Once that works, we can drop this check and replace it
- * with a check for the same inode, but overlapping extents.
- */
if (src == dst)
- return -EINVAL;
+ same_inode = 1;
if (len == 0)
return 0;
- btrfs_double_lock(src, loff, dst, dst_loff, len);
+ if (same_inode) {
+ mutex_lock(&src->i_mutex);
- ret = extent_same_check_offsets(src, loff, len);
- if (ret)
- goto out_unlock;
+ ret = extent_same_check_offsets(src, loff, &len, olen);
+ if (ret)
+ goto out_unlock;
- ret = extent_same_check_offsets(dst, dst_loff, len);
- if (ret)
- goto out_unlock;
+ /*
+ * Single inode case wants the same checks, except we
+ * don't want our length pushed out past i_size as
+ * comparing that data range makes no sense.
+ *
+ * extent_same_check_offsets() will do this for an
+ * unaligned length at i_size, so catch it here and
+ * reject the request.
+ *
+ * This effectively means we require aligned extents
+ * for the single-inode case, whereas the other cases
+ * allow an unaligned length so long as it ends at
+ * i_size.
+ */
+ if (len != olen) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Check for overlapping ranges */
+ if (dst_loff + len > loff && dst_loff < loff + len) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ same_lock_start = min_t(u64, loff, dst_loff);
+ same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
+ } else {
+ btrfs_double_inode_lock(src, dst);
+
+ ret = extent_same_check_offsets(src, loff, &len, olen);
+ if (ret)
+ goto out_unlock;
+
+ ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
+ if (ret)
+ goto out_unlock;
+ }
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
@@ -2923,12 +3055,32 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
goto out_unlock;
}
- ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+ ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
+ if (ret)
+ goto out_unlock;
+
+ if (same_inode)
+ lock_extent_range(src, same_lock_start, same_lock_len);
+ else
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+
+ /* pass original length for comparison so we stay within i_size */
+ ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
if (ret == 0)
- ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+ ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+ if (same_inode)
+ unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
+ same_lock_start + same_lock_len - 1);
+ else
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+
+ btrfs_cmp_data_free(&cmp);
out_unlock:
- btrfs_double_unlock(src, loff, dst, dst_loff, len);
+ if (same_inode)
+ mutex_unlock(&src->i_mutex);
+ else
+ btrfs_double_inode_unlock(src, dst);
return ret;
}
@@ -3082,13 +3234,15 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
struct inode *inode,
u64 endoff,
const u64 destoff,
- const u64 olen)
+ const u64 olen,
+ int no_time_update)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ if (!no_time_update)
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
@@ -3173,13 +3327,13 @@ static void clone_update_extent_map(struct inode *inode,
* @inode: Inode to clone to
* @off: Offset within source to start clone from
* @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen, extent_same uses
- * identical values here
+ * @olen_aligned: Block-aligned value of olen
* @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff)
+ const u64 destoff, int no_time_update)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path = NULL;
@@ -3517,7 +3671,8 @@ process_slot:
root->sectorsize);
ret = clone_finish_inode_update(trans, inode,
last_dest_end,
- destoff, olen);
+ destoff, olen,
+ no_time_update);
if (ret)
goto out;
if (new_key.offset + datal >= destoff + len)
@@ -3555,7 +3710,7 @@ process_slot:
clone_update_extent_map(inode, trans, NULL, last_dest_end,
destoff + len - last_dest_end);
ret = clone_finish_inode_update(trans, inode, destoff + len,
- destoff, olen);
+ destoff, olen, no_time_update);
}
out:
@@ -3692,7 +3847,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
lock_extent_range(inode, destoff, len);
}
- ret = btrfs_clone(src, inode, off, olen, len, destoff);
+ ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
if (same_inode) {
u64 lock_start = min_t(u64, off, destoff);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 760c4a5e0..52170cf17 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
- !(type == BTRFS_ORDERED_NOCOW))
- entry->csum_bytes_left = disk_len;
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->inode = igrab(inode);
@@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode,
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- WARN_ON(entry->csum_bytes_left < sum->len);
- entry->csum_bytes_left -= sum->len;
- if (entry->csum_bytes_left == 0)
- wake_up(&entry->wait);
spin_unlock_irq(&tree->lock);
}
@@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
- list_add_tail(&ordered->trans_list, &trans->ordered);
+ /*
+ * If our ordered extent completed it means it updated the
+ * fs/subvol and csum trees already, so no need to make the
+ * current transaction's commit wait for it, as we end up
+ * holding memory unnecessarily and delaying the inode's iput
+ * until the transaction commit (we schedule an iput for the
+ * inode when the ordered extent's refcount drops to 0), which
+ * prevents it from being evictable until the transaction
+ * commits.
+ */
+ if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
+ btrfs_put_ordered_extent(ordered);
+ else
+ list_add_tail(&ordered->trans_list, &trans->ordered);
+
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -545,6 +552,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (atomic_dec_and_test(&entry->refs)) {
+ ASSERT(list_empty(&entry->log_list));
+ ASSERT(list_empty(&entry->trans_list));
+ ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
@@ -572,6 +583,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
+ RB_CLEAR_NODE(node);
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
@@ -844,6 +856,20 @@ out:
return entry;
}
+/*
+ * Report whether btrfs_lookup_ordered_range() finds an ordered extent for
+ * @inode in the range described by @file_offset and @len.
+ *
+ * Returns true if one was found, false otherwise. The reference obtained by
+ * the lookup is dropped before returning, so the caller holds nothing.
+ */
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_extent *oe;
+
+ oe = btrfs_lookup_ordered_range(inode, file_offset, len);
+ if (oe) {
+ btrfs_put_ordered_extent(oe);
+ return true;
+ }
+ return false;
+}
+
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e96cd4ccd..7176cc0fe 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -89,9 +89,6 @@ struct btrfs_ordered_extent {
/* number of bytes that still need writing */
u64 bytes_left;
- /* number of bytes that still need csumming */
- u64 csum_bytes_left;
-
/*
* the end of the ordered extent which is behind it but
* didn't update disk_i_size. Please see the comment of
@@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
u64 file_offset,
u64 len);
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3d6546581..8a8202956 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -34,6 +34,7 @@
#include "extent_io.h"
#include "qgroup.h"
+
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
* - reorganize keys
@@ -84,11 +85,42 @@ struct btrfs_qgroup {
/*
* temp variables for accounting operations
+ * Refer to qgroup_shared_accounting() for details.
*/
u64 old_refcnt;
u64 new_refcnt;
};
+/*
+ * Adjust qg->old_refcnt by @mod, counted relative to @seq: a stored value
+ * below @seq is first reset to @seq, then @mod is added. @mod is an int
+ * added to a u64, so a negative @mod subtracts through unsigned wraparound.
+ */
+static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq;
+ qg->old_refcnt += mod;
+}
+
+/*
+ * Adjust qg->new_refcnt by @mod, counted relative to @seq: a stored value
+ * below @seq is first reset to @seq, then @mod is added. @mod is an int
+ * added to a u64, so a negative @mod subtracts through unsigned wraparound.
+ */
+static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq;
+ qg->new_refcnt += mod;
+}
+
+/*
+ * Return qg->old_refcnt relative to @seq: 0 when the stored value is below
+ * @seq, otherwise the stored value minus @seq.
+ */
+static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->old_refcnt < seq)
+ return 0;
+ return qg->old_refcnt - seq;
+}
+
+/*
+ * Return qg->new_refcnt relative to @seq: 0 when the stored value is below
+ * @seq, otherwise the stored value minus @seq.
+ */
+static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->new_refcnt < seq)
+ return 0;
+ return qg->new_refcnt - seq;
+}
+
/*
* glue structure to represent the relations between qgroups.
*/
@@ -1115,14 +1147,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct ulist *tmp;
int ret = 0;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
-
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
if (!quota_root) {
@@ -1317,6 +1349,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
+ /* Sometimes we would want to clear the limit on this qgroup.
+ * To meet this requirement, we treat the -1 as a special value
+ * which tell kernel to clear the limit on this qgroup.
+ */
+ const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
@@ -1332,14 +1369,42 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
}
spin_lock(&fs_info->qgroup_lock);
- if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
- qgroup->max_rfer = limit->max_rfer;
- if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
- qgroup->max_excl = limit->max_excl;
- if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
- qgroup->rsv_rfer = limit->rsv_rfer;
- if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
- qgroup->rsv_excl = limit->rsv_excl;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
+ if (limit->max_rfer == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+ qgroup->max_rfer = 0;
+ } else {
+ qgroup->max_rfer = limit->max_rfer;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+ if (limit->max_excl == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+ qgroup->max_excl = 0;
+ } else {
+ qgroup->max_excl = limit->max_excl;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
+ if (limit->rsv_rfer == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+ qgroup->rsv_rfer = 0;
+ } else {
+ qgroup->rsv_rfer = limit->rsv_rfer;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
+ if (limit->rsv_excl == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+ qgroup->rsv_excl = 0;
+ } else {
+ qgroup->rsv_excl = limit->rsv_excl;
+ }
+ }
qgroup->lim_flags |= limit->flags;
spin_unlock(&fs_info->qgroup_lock);
@@ -1356,239 +1421,86 @@ out:
return ret;
}
-static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
- struct btrfs_qgroup_operation *oper2)
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
- /*
- * Ignore seq and type here, we're looking for any operation
- * at all related to this extent on that root.
- */
- if (oper1->bytenr < oper2->bytenr)
- return -1;
- if (oper1->bytenr > oper2->bytenr)
- return 1;
- if (oper1->ref_root < oper2->ref_root)
- return -1;
- if (oper1->ref_root > oper2->ref_root)
- return 1;
- return 0;
-}
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ u64 qgroup_to_skip;
+ int ret = 0;
-static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct rb_node *n;
- struct btrfs_qgroup_operation *cur;
- int cmp;
+ delayed_refs = &trans->transaction->delayed_refs;
+ qgroup_to_skip = delayed_refs->qgroup_to_skip;
- spin_lock(&fs_info->qgroup_op_lock);
- n = fs_info->qgroup_op_tree.rb_node;
- while (n) {
- cur = rb_entry(n, struct btrfs_qgroup_operation, n);
- cmp = comp_oper_exist(cur, oper);
- if (cmp < 0) {
- n = n->rb_right;
- } else if (cmp) {
- n = n->rb_left;
- } else {
- spin_unlock(&fs_info->qgroup_op_lock);
- return -EEXIST;
- }
+ /*
+ * No need to take a lock, since this function will only be called in
+ * btrfs_commit_transaction().
+ */
+ node = rb_first(&delayed_refs->dirty_extent_root);
+ while (node) {
+ record = rb_entry(node, struct btrfs_qgroup_extent_record,
+ node);
+ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0,
+ &record->old_roots);
+ if (ret < 0)
+ break;
+ if (qgroup_to_skip)
+ ulist_del(record->old_roots, qgroup_to_skip, 0);
+ node = rb_next(node);
}
- spin_unlock(&fs_info->qgroup_op_lock);
- return 0;
-}
-
-static int comp_oper(struct btrfs_qgroup_operation *oper1,
- struct btrfs_qgroup_operation *oper2)
-{
- if (oper1->bytenr < oper2->bytenr)
- return -1;
- if (oper1->bytenr > oper2->bytenr)
- return 1;
- if (oper1->ref_root < oper2->ref_root)
- return -1;
- if (oper1->ref_root > oper2->ref_root)
- return 1;
- if (oper1->seq < oper2->seq)
- return -1;
- if (oper1->seq > oper2->seq)
- return 1;
- if (oper1->type < oper2->type)
- return -1;
- if (oper1->type > oper2->type)
- return 1;
- return 0;
+ return ret;
}
-static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record)
{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct btrfs_qgroup_operation *cur;
- int cmp;
+ struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_qgroup_extent_record *entry;
+ u64 bytenr = record->bytenr;
- spin_lock(&fs_info->qgroup_op_lock);
- p = &fs_info->qgroup_op_tree.rb_node;
while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
- cmp = comp_oper(cur, oper);
- if (cmp < 0) {
- p = &(*p)->rb_right;
- } else if (cmp) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
+ node);
+ if (bytenr < entry->bytenr)
p = &(*p)->rb_left;
- } else {
- spin_unlock(&fs_info->qgroup_op_lock);
- return -EEXIST;
- }
- }
- rb_link_node(&oper->n, parent, p);
- rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
- spin_unlock(&fs_info->qgroup_op_lock);
- return 0;
-}
-
-/*
- * Record a quota operation for processing later on.
- * @trans: the transaction we are adding the delayed op to.
- * @fs_info: the fs_info for this fs.
- * @ref_root: the root of the reference we are acting on,
- * @bytenr: the bytenr we are acting on.
- * @num_bytes: the number of bytes in the reference.
- * @type: the type of operation this is.
- * @mod_seq: do we need to get a sequence number for looking up roots.
- *
- * We just add it to our trans qgroup_ref_list and carry on and process these
- * operations in order at some later point. If the reference root isn't a fs
- * root then we don't bother with doing anything.
- *
- * MUST BE HOLDING THE REF LOCK.
- */
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 ref_root,
- u64 bytenr, u64 num_bytes,
- enum btrfs_qgroup_operation_type type, int mod_seq)
-{
- struct btrfs_qgroup_operation *oper;
- int ret;
-
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- return 0;
-
- oper = kmalloc(sizeof(*oper), GFP_NOFS);
- if (!oper)
- return -ENOMEM;
-
- oper->ref_root = ref_root;
- oper->bytenr = bytenr;
- oper->num_bytes = num_bytes;
- oper->type = type;
- oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
- INIT_LIST_HEAD(&oper->elem.list);
- oper->elem.seq = 0;
-
- trace_btrfs_qgroup_record_ref(oper);
-
- if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
- /*
- * If any operation for this bytenr/ref_root combo
- * exists, then we know it's not exclusively owned and
- * shouldn't be queued up.
- *
- * This also catches the case where we have a cloned
- * extent that gets queued up multiple times during
- * drop snapshot.
- */
- if (qgroup_oper_exists(fs_info, oper)) {
- kfree(oper);
- return 0;
- }
- }
-
- ret = insert_qgroup_oper(fs_info, oper);
- if (ret) {
- /* Shouldn't happen so have an assert for developers */
- ASSERT(0);
- kfree(oper);
- return ret;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
}
- list_add_tail(&oper->list, &trans->qgroup_ref_list);
-
- if (mod_seq)
- btrfs_get_tree_mod_seq(fs_info, &oper->elem);
-
- return 0;
-}
-
-static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *tmp;
- int sign = 0;
- int ret = 0;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
-
- spin_lock(&fs_info->qgroup_lock);
- if (!fs_info->quota_root)
- goto out;
-
- switch (oper->type) {
- case BTRFS_QGROUP_OPER_ADD_EXCL:
- sign = 1;
- break;
- case BTRFS_QGROUP_OPER_SUB_EXCL:
- sign = -1;
- break;
- default:
- ASSERT(0);
- }
- ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
- oper->num_bytes, sign);
-out:
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(tmp);
- return ret;
+ rb_link_node(&record->node, parent_node, p);
+ rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
+ return NULL;
}
+#define UPDATE_NEW 0
+#define UPDATE_OLD 1
/*
- * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
- * properly.
+ * Walk all of the roots that point to the bytenr and adjust their refcnts.
*/
-static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
- u64 root_to_skip, struct ulist *tmp,
- struct ulist *roots, struct ulist *qgroups,
- u64 seq, int *old_roots, int rescan)
+static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct ulist *tmp,
+ struct ulist *qgroups, u64 seq, int update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct ulist_node *tmp_unode;
struct ulist_iterator tmp_uiter;
struct btrfs_qgroup *qg;
- int ret;
+ int ret = 0;
+ if (!roots)
+ return 0;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
- /* We don't count our current root here */
- if (unode->val == root_to_skip)
- continue;
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
- /*
- * We could have a pending removal of this same ref so we may
- * not have actually found our ref root when doing
- * btrfs_find_all_roots, so we need to keep track of how many
- * old roots we find in case we removed ours and added a
- * different one at the same time. I don't think this could
- * happen in practice but that sort of thinking leads to pain
- * and suffering and to the dark side.
- */
- (*old_roots)++;
ulist_reinit(tmp);
ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
@@ -1603,29 +1515,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_list *glist;
qg = u64_to_ptr(tmp_unode->aux);
- /*
- * We use this sequence number to keep from having to
- * run the whole list and 0 out the refcnt every time.
- * We basically use sequnce as the known 0 count and
- * then add 1 everytime we see a qgroup. This is how we
- * get how many of the roots actually point up to the
- * upper level qgroups in order to determine exclusive
- * counts.
- *
- * For rescan we want to set old_refcnt to seq so our
- * exclusive calculations end up correct.
- */
- if (rescan)
- qg->old_refcnt = seq;
- else if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
+ if (update_old)
+ btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
- qg->old_refcnt++;
-
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
+ btrfs_qgroup_update_new_refcnt(qg, seq, 1);
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(qgroups, glist->group->qgroupid,
ptr_to_u64(glist->group),
@@ -1644,161 +1537,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
}
/*
- * We need to walk forward in our operation tree and account for any roots that
- * were deleted after we made this operation.
- */
-static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper,
- struct ulist *tmp,
- struct ulist *qgroups, u64 seq,
- int *old_roots)
-{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_operation *tmp_oper;
- struct rb_node *n;
- int ret;
-
- ulist_reinit(tmp);
-
- /*
- * We only walk forward in the tree since we're only interested in
- * removals that happened _after_ our operation.
- */
- spin_lock(&fs_info->qgroup_op_lock);
- n = rb_next(&oper->n);
- spin_unlock(&fs_info->qgroup_op_lock);
- if (!n)
- return 0;
- tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
- while (tmp_oper->bytenr == oper->bytenr) {
- /*
- * If it's not a removal we don't care, additions work out
- * properly with our refcnt tracking.
- */
- if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
- tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
- goto next;
- qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
- if (!qg)
- goto next;
- ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
- GFP_ATOMIC);
- if (ret) {
- if (ret < 0)
- return ret;
- /*
- * We only want to increase old_roots if this qgroup is
- * not already in the list of qgroups. If it is already
- * there then that means it must have been re-added or
- * the delete will be discarded because we had an
- * existing ref that we haven't looked up yet. In this
- * case we don't want to increase old_roots. So if ret
- * == 1 then we know that this is the first time we've
- * seen this qgroup and we can bump the old_roots.
- */
- (*old_roots)++;
- ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
-next:
- spin_lock(&fs_info->qgroup_op_lock);
- n = rb_next(&tmp_oper->n);
- spin_unlock(&fs_info->qgroup_op_lock);
- if (!n)
- break;
- tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
- }
-
- /* Ok now process the qgroups we found */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = u64_to_ptr(unode->aux);
- if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
- else
- qg->old_refcnt++;
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(qgroups, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/* Add refcnt for the newly added reference. */
-static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper,
- struct btrfs_qgroup *qgroup,
- struct ulist *tmp, struct ulist *qgroups,
- u64 seq)
-{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup *qg;
- int ret;
-
- ulist_reinit(tmp);
- ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = u64_to_ptr(unode->aux);
- if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
- } else {
- if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
- else
- qg->old_refcnt++;
- }
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(qgroups, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/*
- * This adjusts the counters for all referenced qgroups if need be.
+ * Update qgroup rfer/excl counters.
+ * Rfer update is easy; the code is self-explanatory.
+ *
+ * Excl update is tricky; the update is split into 2 parts.
+ * Part 1: Possible exclusive <-> sharing detection:
+ * | A | !A |
+ * -------------------------------------
+ * B | * | - |
+ * -------------------------------------
+ * !B | + | ** |
+ * -------------------------------------
+ *
+ * Conditions:
+ * A: cur_old_roots < nr_old_roots (not exclusive before)
+ * !A: cur_old_roots == nr_old_roots (possible exclusive before)
+ * B: cur_new_roots < nr_new_roots (not exclusive now)
+ * !B: cur_new_roots == nr_new_roots (possible exclusive now)
+ *
+ * Results:
+ * +: Possible sharing -> exclusive -: Possible exclusive -> sharing
+ * *: Definitely not changed. **: Possible unchanged.
+ *
+ * For the !A and !B conditions, the exception is the cur_old/new_roots == 0 case.
+ *
+ * To make the logic clear, we first use condition A and B to split
+ * combination into 4 results.
+ *
+ * Then, for results "+" and "-", check the old/new_roots == 0 case, as in
+ * them only one variable may be 0.
+ *
+ * Lastly, check result **: since there are 2 variables that may be 0, split
+ * them again (2x2).
+ * But this time we don't need to consider other things; the code and logic
+ * are easy to understand now.
*/
-static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
- u64 root_to_skip, u64 num_bytes,
- struct ulist *qgroups, u64 seq,
- int old_roots, int new_roots, int rescan)
+static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
+ struct ulist *qgroups,
+ u64 nr_old_roots,
+ u64 nr_new_roots,
+ u64 num_bytes, u64 seq)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1810,423 +1588,196 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
bool dirty = false;
qg = u64_to_ptr(unode->aux);
- /*
- * Wasn't referenced before but is now, add to the reference
- * counters.
- */
- if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+ cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
+ cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+
+ /* Rfer update part */
+ if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
qg->rfer_cmpr += num_bytes;
dirty = true;
}
-
- /*
- * Was referenced before but isn't now, subtract from the
- * reference counters.
- */
- if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+ if (cur_old_count > 0 && cur_new_count == 0) {
qg->rfer -= num_bytes;
qg->rfer_cmpr -= num_bytes;
dirty = true;
}
- if (qg->old_refcnt < seq)
- cur_old_count = 0;
- else
- cur_old_count = qg->old_refcnt - seq;
- if (qg->new_refcnt < seq)
- cur_new_count = 0;
- else
- cur_new_count = qg->new_refcnt - seq;
+ /* Excl update part */
+ /* Exclusive/none -> shared case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count < nr_new_roots) {
+ /* Exclusive -> shared */
+ if (cur_old_count != 0) {
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+ }
- /*
- * If our refcount was the same as the roots previously but our
- * new count isn't the same as the number of roots now then we
- * went from having a exclusive reference on this range to not.
- */
- if (old_roots && cur_old_count == old_roots &&
- (cur_new_count != new_roots || new_roots == 0)) {
- WARN_ON(cur_new_count != new_roots && new_roots == 0);
- qg->excl -= num_bytes;
- qg->excl_cmpr -= num_bytes;
- dirty = true;
+ /* Shared -> exclusive/none case */
+ if (cur_old_count < nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ /* Shared->exclusive */
+ if (cur_new_count != 0) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
}
- /*
- * If we didn't reference all the roots before but now we do we
- * have an exclusive reference to this range.
- */
- if ((!old_roots || (old_roots && cur_old_count != old_roots))
- && cur_new_count == new_roots) {
- qg->excl += num_bytes;
- qg->excl_cmpr += num_bytes;
- dirty = true;
+ /* Exclusive/none -> exclusive/none case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ if (cur_old_count == 0) {
+ /* None -> exclusive/none */
+
+ if (cur_new_count != 0) {
+ /* None -> exclusive */
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
+ /* None -> none, nothing changed */
+ } else {
+ /* Exclusive -> exclusive/none */
+
+ if (cur_new_count == 0) {
+ /* Exclusive -> none */
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+ /* Exclusive -> exclusive, nothing changed */
+ }
}
+ /* For exclusive extent, free its reserved bytes too */
+ if (nr_old_roots == 0 && nr_new_roots == 1 &&
+ cur_new_count == nr_new_roots)
+ qg->reserved -= num_bytes;
if (dirty)
qgroup_dirty(fs_info, qg);
}
return 0;
}
-/*
- * If we removed a data extent and there were other references for that bytenr
- * then we need to lookup all referenced roots to make sure we still don't
- * reference this bytenr. If we do then we can just discard this operation.
- */
-static int check_existing_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
-
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
- oper->elem.seq, &roots);
- if (ret < 0)
- return ret;
- ret = 0;
-
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- if (unode->val == oper->ref_root) {
- ret = 1;
- break;
- }
- }
- ulist_free(roots);
- btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-
- return ret;
-}
-
-/*
- * If we share a reference across multiple roots then we may need to adjust
- * various qgroups referenced and exclusive counters. The basic premise is this
- *
- * 1) We have seq to represent a 0 count. Instead of looping through all of the
- * qgroups and resetting their refcount to 0 we just constantly bump this
- * sequence number to act as the base reference count. This means that if
- * anybody is equal to or below this sequence they were never referenced. We
- * jack this sequence up by the number of roots we found each time in order to
- * make sure we don't have any overlap.
- *
- * 2) We first search all the roots that reference the area _except_ the root
- * we're acting on currently. This makes up the old_refcnt of all the qgroups
- * before.
- *
- * 3) We walk all of the qgroups referenced by the root we are currently acting
- * on, and will either adjust old_refcnt in the case of a removal or the
- * new_refcnt in the case of an addition.
- *
- * 4) Finally we walk all the qgroups that are referenced by this range
- * including the root we are acting on currently. We will adjust the counters
- * based on the number of roots we had and will have after this operation.
- *
- * Take this example as an illustration
- *
- * [qgroup 1/0]
- * / | \
- * [qg 0/0] [qg 0/1] [qg 0/2]
- * \ | /
- * [ extent ]
- *
- * Say we are adding a reference that is covered by qg 0/0. The first step
- * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
- * old_roots being 2. Because it is adding new_roots will be 1. We then go
- * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
- * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
- * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
- * reference and thus must add the size to the referenced bytes. Everything
- * else is the same so nothing else changes.
- */
-static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 num_bytes,
+ struct ulist *old_roots, struct ulist *new_roots)
{
- struct ulist *roots = NULL;
- struct ulist *qgroups, *tmp;
- struct btrfs_qgroup *qgroup;
- struct seq_list elem = SEQ_LIST_INIT(elem);
+ struct ulist *qgroups = NULL;
+ struct ulist *tmp = NULL;
u64 seq;
- int old_roots = 0;
- int new_roots = 0;
+ u64 nr_new_roots = 0;
+ u64 nr_old_roots = 0;
int ret = 0;
- if (oper->elem.seq) {
- ret = check_existing_refs(trans, fs_info, oper);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
- }
+ if (new_roots)
+ nr_new_roots = new_roots->nnodes;
+ if (old_roots)
+ nr_old_roots = old_roots->nnodes;
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups)
- return -ENOMEM;
+ if (!fs_info->quota_enabled)
+ goto out_free;
+ BUG_ON(!fs_info->quota_root);
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
tmp = ulist_alloc(GFP_NOFS);
if (!tmp) {
- ulist_free(qgroups);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_free;
}
- btrfs_get_tree_mod_seq(fs_info, &elem);
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
- &roots);
- btrfs_put_tree_mod_seq(fs_info, &elem);
- if (ret < 0) {
- ulist_free(qgroups);
- ulist_free(tmp);
- return ret;
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ ret = 0;
+ goto out_free;
+ }
}
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
spin_lock(&fs_info->qgroup_lock);
- qgroup = find_qgroup_rb(fs_info, oper->ref_root);
- if (!qgroup)
- goto out;
seq = fs_info->qgroup_seq;
- /*
- * So roots is the list of all the roots currently pointing at the
- * bytenr, including the ref we are adding if we are adding, or not if
- * we are removing a ref. So we pass in the ref_root to skip that root
- * in our calculations. We set old_refnct and new_refcnt cause who the
- * hell knows what everything looked like before, and it doesn't matter
- * except...
- */
- ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
- seq, &old_roots, 0);
- if (ret < 0)
- goto out;
-
- /*
- * Now adjust the refcounts of the qgroups that care about this
- * reference, either the old_count in the case of removal or new_count
- * in the case of an addition.
- */
- ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
- seq);
+ /* Update old refcnts using old_roots */
+ ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
+ UPDATE_OLD);
if (ret < 0)
goto out;
- /*
- * ...in the case of removals. If we had a removal before we got around
- * to processing this operation then we need to find that guy and count
- * his references as if they really existed so we don't end up screwing
- * up the exclusive counts. Then whenever we go to process the delete
- * everything will be grand and we can account for whatever exclusive
- * changes need to be made there. We also have to pass in old_roots so
- * we have an accurate count of the roots as it pertains to this
- * operations view of the world.
- */
- ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
- &old_roots);
+ /* Update new refcnts using new_roots */
+ ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
+ UPDATE_NEW);
if (ret < 0)
goto out;
- /*
- * We are adding our root, need to adjust up the number of roots,
- * otherwise old_roots is the number of roots we want.
- */
- if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
- new_roots = old_roots + 1;
- } else {
- new_roots = old_roots;
- old_roots++;
- }
- fs_info->qgroup_seq += old_roots + 1;
-
+ qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+ num_bytes, seq);
/*
- * And now the magic happens, bless Arne for having a pretty elegant
- * solution for this.
+ * Bump qgroup_seq to avoid seq overlap
*/
- qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
- qgroups, seq, old_roots, new_roots, 0);
+ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(qgroups);
- ulist_free(roots);
+out_free:
ulist_free(tmp);
+ ulist_free(qgroups);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
return ret;
}
-/*
- * Process a reference to a shared subtree. This type of operation is
- * queued during snapshot removal when we encounter extents which are
- * shared between more than one root.
- */
-static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup_list *glist;
- struct ulist *parents;
- int ret = 0;
- int err;
- struct btrfs_qgroup *qg;
- u64 root_obj = 0;
- struct seq_list elem = SEQ_LIST_INIT(elem);
-
- parents = ulist_alloc(GFP_NOFS);
- if (!parents)
- return -ENOMEM;
-
- btrfs_get_tree_mod_seq(fs_info, &elem);
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
- elem.seq, &roots);
- btrfs_put_tree_mod_seq(fs_info, &elem);
- if (ret < 0)
- goto out;
-
- if (roots->nnodes != 1)
- goto out;
-
- ULIST_ITER_INIT(&uiter);
- unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
- /*
- * If we find our ref root then that means all refs
- * this extent has to the root have not yet been
- * deleted. In that case, we do nothing and let the
- * last ref for this bytenr drive our update.
- *
- * This can happen for example if an extent is
- * referenced multiple times in a snapshot (clone,
- * etc). If we are in the middle of snapshot removal,
- * queued updates for such an extent will find the
- * root if we have not yet finished removing the
- * snapshot.
- */
- if (unode->val == oper->ref_root)
- goto out;
-
- root_obj = unode->val;
- BUG_ON(!root_obj);
-
- spin_lock(&fs_info->qgroup_lock);
- qg = find_qgroup_rb(fs_info, root_obj);
- if (!qg)
- goto out_unlock;
-
- qg->excl += oper->num_bytes;
- qg->excl_cmpr += oper->num_bytes;
- qgroup_dirty(fs_info, qg);
-
- /*
- * Adjust counts for parent groups. First we find all
- * parents, then in the 2nd loop we do the adjustment
- * while adding parents of the parents to our ulist.
- */
- list_for_each_entry(glist, &qg->groups, next_group) {
- err = ulist_add(parents, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (err < 0) {
- ret = err;
- goto out_unlock;
- }
- }
-
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(parents, &uiter))) {
- qg = u64_to_ptr(unode->aux);
- qg->excl += oper->num_bytes;
- qg->excl_cmpr += oper->num_bytes;
- qgroup_dirty(fs_info, qg);
-
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qg->groups, next_group) {
- err = ulist_add(parents, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (err < 0) {
- ret = err;
- goto out_unlock;
- }
- }
- }
-
-out_unlock:
- spin_unlock(&fs_info->qgroup_lock);
-
-out:
- ulist_free(roots);
- ulist_free(parents);
- return ret;
-}
-
-/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
- */
-static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct ulist *new_roots = NULL;
+ struct rb_node *node;
+ u64 qgroup_to_skip;
int ret = 0;
- if (!fs_info->quota_enabled)
- return 0;
-
- BUG_ON(!fs_info->quota_root);
+ delayed_refs = &trans->transaction->delayed_refs;
+ qgroup_to_skip = delayed_refs->qgroup_to_skip;
+ while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
+ record = rb_entry(node, struct btrfs_qgroup_extent_record,
+ node);
- mutex_lock(&fs_info->qgroup_rescan_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- return 0;
+ if (!ret) {
+ /*
+ * Use (u64)-1 as time_seq to do a special search, which
+ * doesn't lock the tree or delayed_refs and searches the
+ * current root. It's safe inside commit_transaction().
+ */
+ ret = btrfs_find_all_roots(trans, fs_info,
+ record->bytenr, (u64)-1, &new_roots);
+ if (ret < 0)
+ goto cleanup;
+ if (qgroup_to_skip)
+ ulist_del(new_roots, qgroup_to_skip, 0);
+ ret = btrfs_qgroup_account_extent(trans, fs_info,
+ record->bytenr, record->num_bytes,
+ record->old_roots, new_roots);
+ record->old_roots = NULL;
+ new_roots = NULL;
}
- }
- mutex_unlock(&fs_info->qgroup_rescan_lock);
-
- ASSERT(is_fstree(oper->ref_root));
-
- trace_btrfs_qgroup_account(oper);
-
- switch (oper->type) {
- case BTRFS_QGROUP_OPER_ADD_EXCL:
- case BTRFS_QGROUP_OPER_SUB_EXCL:
- ret = qgroup_excl_accounting(fs_info, oper);
- break;
- case BTRFS_QGROUP_OPER_ADD_SHARED:
- case BTRFS_QGROUP_OPER_SUB_SHARED:
- ret = qgroup_shared_accounting(trans, fs_info, oper);
- break;
- case BTRFS_QGROUP_OPER_SUB_SUBTREE:
- ret = qgroup_subtree_accounting(trans, fs_info, oper);
- break;
- default:
- ASSERT(0);
- }
- return ret;
-}
-
-/*
- * Needs to be called everytime we run delayed refs, even if there is an error
- * in order to cleanup outstanding operations.
- */
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_qgroup_operation *oper;
- int ret = 0;
+cleanup:
+ ulist_free(record->old_roots);
+ ulist_free(new_roots);
+ new_roots = NULL;
+ rb_erase(node, &delayed_refs->dirty_extent_root);
+ kfree(record);
- while (!list_empty(&trans->qgroup_ref_list)) {
- oper = list_first_entry(&trans->qgroup_ref_list,
- struct btrfs_qgroup_operation, list);
- list_del_init(&oper->list);
- if (!ret || !trans->aborted)
- ret = btrfs_qgroup_account(trans, fs_info, oper);
- spin_lock(&fs_info->qgroup_op_lock);
- rb_erase(&oper->n, &fs_info->qgroup_op_tree);
- spin_unlock(&fs_info->qgroup_op_lock);
- btrfs_put_tree_mod_seq(fs_info, &oper->elem);
- kfree(oper);
}
return ret;
}
@@ -2637,15 +2188,13 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans, struct ulist *qgroups,
- struct ulist *tmp, struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans,
+ struct extent_buffer *scratch_leaf)
{
struct btrfs_key found;
struct ulist *roots = NULL;
struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
- u64 seq;
- int new_roots;
int slot;
int ret;
@@ -2695,33 +2244,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
else
num_bytes = found.offset;
- ulist_reinit(qgroups);
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
&roots);
if (ret < 0)
goto out;
- spin_lock(&fs_info->qgroup_lock);
- seq = fs_info->qgroup_seq;
- fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
-
- new_roots = 0;
- ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
- seq, &new_roots, 1);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
-
- ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
- seq, 0, new_roots, 1);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
+ /* For rescan, just pass old_roots as NULL */
+ ret = btrfs_qgroup_account_extent(trans, fs_info,
+ found.objectid, num_bytes, NULL, roots);
+ if (ret < 0)
goto out;
- }
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
}
out:
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
@@ -2735,7 +2266,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *tmp = NULL, *qgroups = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
int ret = 0;
@@ -2743,12 +2273,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path = btrfs_alloc_path();
if (!path)
goto out;
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups)
- goto out;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- goto out;
scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
if (!scratch_leaf)
goto out;
@@ -2764,7 +2288,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = -EINTR;
} else {
err = qgroup_rescan_leaf(fs_info, path, trans,
- qgroups, tmp, scratch_leaf);
+ scratch_leaf);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2774,8 +2298,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
out:
kfree(scratch_leaf);
- ulist_free(qgroups);
- ulist_free(tmp);
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c5242aa9a..6387dcfa3 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -19,43 +19,18 @@
#ifndef __BTRFS_QGROUP__
#define __BTRFS_QGROUP__
+#include "ulist.h"
+#include "delayed-ref.h"
+
/*
- * A description of the operations, all of these operations only happen when we
- * are adding the 1st reference for that subvolume in the case of adding space
- * or on the last reference delete in the case of subtraction. The only
- * exception is the last one, which is added for confusion.
- *
- * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
- * one pointing at the bytes we are adding. This is called on the first
- * allocation.
- *
- * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
- * shared between subvols. This is called on the creation of a ref that already
- * has refs from a different subvolume, so basically reflink.
- *
- * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
- * one referencing the range.
- *
- * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
- * refs with other subvolumes.
+ * Record a dirty extent, and inform qgroup to update quota on it
+ * TODO: Use kmem cache to alloc it.
*/
-enum btrfs_qgroup_operation_type {
- BTRFS_QGROUP_OPER_ADD_EXCL,
- BTRFS_QGROUP_OPER_ADD_SHARED,
- BTRFS_QGROUP_OPER_SUB_EXCL,
- BTRFS_QGROUP_OPER_SUB_SHARED,
- BTRFS_QGROUP_OPER_SUB_SUBTREE,
-};
-
-struct btrfs_qgroup_operation {
- u64 ref_root;
+struct btrfs_qgroup_extent_record {
+ struct rb_node node;
u64 bytenr;
u64 num_bytes;
- u64 seq;
- enum btrfs_qgroup_operation_type type;
- struct seq_list elem;
- struct rb_node n;
- struct list_head list;
+ struct ulist *old_roots;
};
int btrfs_quota_enable(struct btrfs_trans_handle *trans,
@@ -79,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 ref_root,
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record);
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes,
- enum btrfs_qgroup_operation_type type,
- int mod_seq);
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper);
+ struct ulist *old_roots, struct ulist *new_roots);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74b24b01d..88cbb5995 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1847,8 +1847,10 @@ again:
}
eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
- ret = (!eb) ? -ENOMEM : -EIO;
+ if (IS_ERR(eb)) {
+ /*
+ * eb is an ERR_PTR here and must not be used any further:
+ * leave the loop the same way the -EIO path below does.
+ */
+ ret = PTR_ERR(eb);
+ break;
+ } else if (!extent_buffer_uptodate(eb)) {
+ ret = -EIO;
free_extent_buffer(eb);
break;
}
@@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
eb = read_tree_block(root, bytenr, ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
blocksize = root->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
eb = read_tree_block(root, bytenr, generation);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ err = PTR_ERR(eb);
+ goto next;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
err = -EIO;
goto next;
@@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
block->key.offset);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -4040,7 +4049,7 @@ restart:
if (trans && progress && err == -ENOSPC) {
ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
rc->block_group->flags);
- if (ret == 0) {
+ if (ret == 1) {
err = 0;
progress = 0;
goto restart;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ab5811545..94db0fa52 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2662,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity)
kfree(sparity);
}
+/*
+ * Work callback queued by scrub_parity_bio_endio(): releases the parity
+ * descriptor and drops the scrub context's pending bio count.
+ */
+static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
+{
+	struct scrub_parity *sparity;
+	struct scrub_ctx *sctx;
+
+	sparity = container_of(work, struct scrub_parity, work);
+	/* Read sctx before scrub_free_parity() is handed sparity. */
+	sctx = sparity->sctx;
+
+	scrub_free_parity(sparity);
+	scrub_pending_bio_dec(sctx);
+}
+
static void scrub_parity_bio_endio(struct bio *bio, int error)
{
struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
- struct scrub_ctx *sctx = sparity->sctx;
if (error)
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
- scrub_free_parity(sparity);
- scrub_pending_bio_dec(sctx);
bio_put(bio);
+
+ btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
+ scrub_parity_bio_endio_worker, NULL, NULL);
+ btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers,
+ &sparity->work);
}
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -3559,7 +3571,6 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
- int ret = 0;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
@@ -3572,27 +3583,36 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
fs_info->scrub_workers =
btrfs_alloc_workqueue("btrfs-scrub", flags,
max_active, 4);
- if (!fs_info->scrub_workers) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!fs_info->scrub_workers)
+ goto fail_scrub_workers;
+
fs_info->scrub_wr_completion_workers =
btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
max_active, 2);
- if (!fs_info->scrub_wr_completion_workers) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!fs_info->scrub_wr_completion_workers)
+ goto fail_scrub_wr_completion_workers;
+
fs_info->scrub_nocow_workers =
btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
- if (!fs_info->scrub_nocow_workers) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!fs_info->scrub_nocow_workers)
+ goto fail_scrub_nocow_workers;
+ fs_info->scrub_parity_workers =
+ btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+ max_active, 2);
+ if (!fs_info->scrub_parity_workers)
+ goto fail_scrub_parity_workers;
}
++fs_info->scrub_workers_refcnt;
-out:
- return ret;
+ return 0;
+
+fail_scrub_parity_workers:
+ btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+fail_scrub_nocow_workers:
+ btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+fail_scrub_wr_completion_workers:
+ btrfs_destroy_workqueue(fs_info->scrub_workers);
+fail_scrub_workers:
+ return -ENOMEM;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
@@ -3601,6 +3621,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->scrub_workers);
btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 5cf7838fb..aa72bfd28 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -243,6 +243,7 @@ struct waiting_dir_move {
* after this directory is moved, we can try to rmdir the ino rmdir_ino.
*/
u64 rmdir_ino;
+ bool orphanized;
};
struct orphan_dir_info {
@@ -1916,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
}
- /* we know that it is or will be overwritten. check this now */
- if (ow_inode < sctx->send_progress)
+ /*
+ * We know that it is or will be overwritten. Check this now.
+ * The current inode being processed might have been the one that caused
+ * inode 'ino' to be orphanized, therefore ow_inode can actually be the
+ * same as sctx->send_progress.
+ */
+ if (ow_inode <= sctx->send_progress)
ret = 1;
else
ret = 0;
@@ -2239,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
fs_path_reset(dest);
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+ struct waiting_dir_move *wdm;
+
fs_path_reset(name);
if (is_waiting_for_rm(sctx, ino)) {
@@ -2249,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
break;
}
- if (is_waiting_for_move(sctx, ino)) {
+ wdm = get_waiting_dir_move(sctx, ino);
+ if (wdm && wdm->orphanized) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ stop = 1;
+ } else if (wdm) {
ret = get_first_ref(sctx->parent_root, ino,
&parent_inode, &parent_gen, name);
} else {
@@ -2344,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
le64_to_cpu(sctx->send_root->root_item.ctransid));
if (parent_root) {
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- sctx->parent_root->root_item.uuid);
+ if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(sctx->parent_root->root_item.ctransid));
}
@@ -2939,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
return entry != NULL;
}
-static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
{
struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
struct rb_node *parent = NULL;
@@ -2950,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
return -ENOMEM;
dm->ino = ino;
dm->rmdir_ino = 0;
+ dm->orphanized = orphanized;
while (*p) {
parent = *p;
@@ -3046,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
goto out;
}
- ret = add_waiting_dir_move(sctx, pm->ino);
+ ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
if (ret)
goto out;
@@ -3369,8 +3386,40 @@ out:
return ret;
}
+/*
+ * Check whether inode ino1 (generation ino1_gen) is an ancestor of inode
+ * ino2 in the given root, by following first refs upwards from ino2.
+ *
+ * fs_path is used as scratch space and is reset on every step.
+ *
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int is_ancestor(struct btrfs_root *root,
+		       const u64 ino1,
+		       const u64 ino1_gen,
+		       const u64 ino2,
+		       struct fs_path *fs_path)
+{
+	u64 cur;
+	u64 up;
+	u64 up_gen;
+	int err;
+
+	for (cur = ino2; cur > BTRFS_FIRST_FREE_OBJECTID; cur = up) {
+		fs_path_reset(fs_path);
+		err = get_first_ref(root, cur, &up, &up_gen, fs_path);
+		if (err < 0) {
+			/*
+			 * -ENOENT for ino2 itself is reported as "not an
+			 * ancestor"; any other failure is passed on.
+			 */
+			if (err == -ENOENT && cur == ino2)
+				return 0;
+			return err;
+		}
+		if (up != ino1)
+			continue;
+		/* Same inode number: the generation must match as well. */
+		return up_gen == ino1_gen;
+	}
+	return 0;
+}
+
static int wait_for_parent_move(struct send_ctx *sctx,
- struct recorded_ref *parent_ref)
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
{
int ret = 0;
u64 ino = parent_ref->dir;
@@ -3390,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx,
* Our current directory inode may not yet be renamed/moved because some
* ancestor (immediate or not) has to be renamed/moved first. So find if
* such ancestor exists and make sure our own rename/move happens after
- * that ancestor is processed.
+ * that ancestor is processed to avoid path build infinite loops (done
+ * at get_cur_path()).
*/
while (ino > BTRFS_FIRST_FREE_OBJECTID) {
if (is_waiting_for_move(sctx, ino)) {
- ret = 1;
+ /*
+ * If the current inode is an ancestor of ino in the
+ * parent root, we need to delay the rename of the
+ * current inode, otherwise don't delay the rename
+ * because we can end up with a circular dependency
+ * of renames, resulting in some directories never
+ * getting the respective rename operations issued in
+ * the send stream or getting into infinite path build
+ * loops.
+ */
+ ret = is_ancestor(sctx->parent_root,
+ sctx->cur_ino, sctx->cur_inode_gen,
+ ino, path_before);
break;
}
@@ -3436,7 +3498,7 @@ out:
ino,
&sctx->new_refs,
&sctx->deleted_refs,
- false);
+ is_orphan);
if (!ret)
ret = 1;
}
@@ -3605,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
}
}
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
+ can_rename) {
+ ret = wait_for_parent_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
/*
* link/move the ref to the new place. If we have an orphan
* inode, move it and update valid_path. If not, link or move
@@ -3625,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = wait_for_parent_move(sctx, cur);
- if (ret < 0)
- goto out;
- if (ret) {
- *pending_move = 1;
- } else {
- ret = send_rename(sctx, valid_path,
- cur->full_path);
- if (!ret)
- ret = fs_path_copy(valid_path,
- cur->full_path);
- }
+ ret = send_rename(sctx, valid_path,
+ cur->full_path);
+ if (!ret)
+ ret = fs_path_copy(valid_path,
+ cur->full_path);
if (ret < 0)
goto out;
} else {
@@ -4524,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
if (ret < 0)
goto out;
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- clone_root->root->root_item.uuid);
+ /*
+ * If the parent we're using has a received_uuid set then use that as
+ * our clone source as that is what we will look for when doing a
+ * receive.
+ *
+ * This covers the case that we create a snapshot off of a received
+ * subvolume and then use that as the parent and try to receive on a
+ * different host.
+ */
+ if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(clone_root->root->root_item.ctransid));
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9e66f5e72..cd7ef34d2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
* __btrfs_std_error decodes expected errors from the caller and
* invokes the appropriate error response.
*/
+__cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
@@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
* We'll complete the cleanup in btrfs_end_transaction and
* btrfs_commit_transaction.
*/
+__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno)
{
- /*
- * Report first abort since mount
- */
- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
- &root->fs_info->fs_state)) {
- WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
- errno);
- }
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
@@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
* __btrfs_panic decodes unexpected, fatal errors from the caller,
* issues an alert, and either panics or BUGs, depending on mount options.
*/
+__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
@@ -841,33 +836,153 @@ out:
return error;
}
-static struct dentry *get_default_root(struct super_block *sb,
- u64 subvol_objectid)
+static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_root *new_root;
- struct btrfs_dir_item *di;
- struct btrfs_path *path;
- struct btrfs_key location;
- struct inode *inode;
- u64 dir_id;
- int new = 0;
+ struct btrfs_root *fs_root;
+ struct btrfs_root_ref *root_ref;
+ struct btrfs_inode_ref *inode_ref;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ char *name = NULL, *ptr;
+ u64 dirid;
+ int len;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ path->leave_spinning = 1;
+
+ name = kmalloc(PATH_MAX, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ ptr = name + PATH_MAX - 1;
+ ptr[0] = '\0';
/*
- * We have a specific subvol we want to mount, just setup location and
- * go look up the root.
+ * Walk up the subvolume trees in the tree of tree roots by root
+ * backrefs until we hit the top-level subvolume.
*/
- if (subvol_objectid) {
- location.objectid = subvol_objectid;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
- goto find_root;
+ while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+ key.objectid = subvol_objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(root, path, subvol_objectid,
+ BTRFS_ROOT_BACKREF_KEY);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ subvol_objectid = key.offset;
+
+ root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_root_ref);
+ len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(root_ref + 1), len);
+ ptr[0] = '/';
+ dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
+ btrfs_release_path(path);
+
+ key.objectid = subvol_objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(fs_root)) {
+ ret = PTR_ERR(fs_root);
+ goto err;
+ }
+
+ /*
+ * Walk up the filesystem tree by inode refs until we hit the
+ * root directory.
+ */
+ while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(fs_root, path, dirid,
+ BTRFS_INODE_REF_KEY);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ dirid = key.offset;
+
+ inode_ref = btrfs_item_ptr(path->nodes[0],
+ path->slots[0],
+ struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(path->nodes[0],
+ inode_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(inode_ref + 1), len);
+ ptr[0] = '/';
+ btrfs_release_path(path);
+ }
}
+ btrfs_free_path(path);
+ if (ptr == name + PATH_MAX - 1) {
+ name[0] = '/';
+ name[1] = '\0';
+ } else {
+ memmove(name, ptr, name + PATH_MAX - ptr);
+ }
+ return name;
+
+err:
+ btrfs_free_path(path);
+ kfree(name);
+ return ERR_PTR(ret);
+}
+
+static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_key location;
+ u64 dir_id;
+
path = btrfs_alloc_path();
if (!path)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
path->leave_spinning = 1;
/*
@@ -879,58 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb,
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
- return ERR_CAST(di);
+ return PTR_ERR(di);
}
if (!di) {
/*
* Ok the default dir item isn't there. This is weird since
* it's always been there, but don't freak out, just try and
- * mount to root most subvolume.
+ * mount the top-level subvolume.
*/
btrfs_free_path(path);
- dir_id = BTRFS_FIRST_FREE_OBJECTID;
- new_root = fs_info->fs_root;
- goto setup_root;
+ *objectid = BTRFS_FS_TREE_OBJECTID;
+ return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
btrfs_free_path(path);
-
-find_root:
- new_root = btrfs_read_fs_root_no_name(fs_info, &location);
- if (IS_ERR(new_root))
- return ERR_CAST(new_root);
-
- if (!(sb->s_flags & MS_RDONLY)) {
- int ret;
- down_read(&fs_info->cleanup_work_sem);
- ret = btrfs_orphan_cleanup(new_root);
- up_read(&fs_info->cleanup_work_sem);
- if (ret)
- return ERR_PTR(ret);
- }
-
- dir_id = btrfs_root_dirid(&new_root->root_item);
-setup_root:
- location.objectid = dir_id;
- location.type = BTRFS_INODE_ITEM_KEY;
- location.offset = 0;
-
- inode = btrfs_iget(sb, &location, new_root, &new);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- /*
- * If we're just mounting the root most subvol put the inode and return
- * a reference to the dentry. We will have already gotten a reference
- * to the inode in btrfs_fill_super so we're good to go.
- */
- if (!new && d_inode(sb->s_root) == inode) {
- iput(inode);
- return dget(sb->s_root);
- }
-
- return d_obtain_root(inode);
+ *objectid = location.objectid;
+ return 0;
}
static int btrfs_fill_super(struct super_block *sb,
@@ -1108,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+ seq_printf(seq, ",subvolid=%llu",
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ seq_puts(seq, ",subvol=");
+ seq_dentry(seq, dentry, " \t\n\\");
return 0;
}
@@ -1138,107 +1222,139 @@ static inline int is_subvolume_inode(struct inode *inode)
}
/*
- * This will strip out the subvol=%s argument for an argument string and add
- * subvolid=0 to make sure we get the actual tree root for path walking to the
- * subvol we want.
+ * This will add subvolid=0 to the argument string while removing any subvol=
+ * and subvolid= arguments to make sure we get the top-level root for path
+ * walking to the subvol we want.
*/
static char *setup_root_args(char *args)
{
- unsigned len = strlen(args) + 2 + 1;
- char *src, *dst, *buf;
+ char *buf, *dst, *sep;
- /*
- * We need the same args as before, but with this substitution:
- * s!subvol=[^,]+!subvolid=0!
- *
- * Since the replacement string is up to 2 bytes longer than the
- * original, allocate strlen(args) + 2 + 1 bytes.
- */
+ if (!args)
+ return kstrdup("subvolid=0", GFP_NOFS);
- src = strstr(args, "subvol=");
- /* This shouldn't happen, but just in case.. */
- if (!src)
- return NULL;
-
- buf = dst = kmalloc(len, GFP_NOFS);
+ /* The worst case is that we add ",subvolid=0" to the end. */
+ buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS);
if (!buf)
return NULL;
- /*
- * If the subvol= arg is not at the start of the string,
- * copy whatever precedes it into buf.
- */
- if (src != args) {
- *src++ = '\0';
- strcpy(buf, args);
- dst += strlen(args);
+ while (1) {
+ sep = strchrnul(args, ',');
+ if (!strstarts(args, "subvol=") &&
+ !strstarts(args, "subvolid=")) {
+ memcpy(dst, args, sep - args);
+ dst += sep - args;
+ *dst++ = ',';
+ }
+ if (*sep)
+ args = sep + 1;
+ else
+ break;
}
-
strcpy(dst, "subvolid=0");
- dst += strlen("subvolid=0");
-
- /*
- * If there is a "," after the original subvol=... string,
- * copy that suffix into our buffer. Otherwise, we're done.
- */
- src = strchr(src, ',');
- if (src)
- strcpy(dst, src);
return buf;
}
-static struct dentry *mount_subvol(const char *subvol_name, int flags,
- const char *device_name, char *data)
+static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
+ int flags, const char *device_name,
+ char *data)
{
struct dentry *root;
- struct vfsmount *mnt;
+ struct vfsmount *mnt = NULL;
char *newargs;
+ int ret;
newargs = setup_root_args(data);
- if (!newargs)
- return ERR_PTR(-ENOMEM);
- mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
- newargs);
+ if (!newargs) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
- if (PTR_RET(mnt) == -EBUSY) {
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
+ if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
if (flags & MS_RDONLY) {
- mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
- newargs);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY,
+ device_name, newargs);
} else {
- int r;
- mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
- newargs);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY,
+ device_name, newargs);
if (IS_ERR(mnt)) {
- kfree(newargs);
- return ERR_CAST(mnt);
+ root = ERR_CAST(mnt);
+ mnt = NULL;
+ goto out;
}
- r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
- if (r < 0) {
- /* FIXME: release vfsmount mnt ??*/
- kfree(newargs);
- return ERR_PTR(r);
+ down_write(&mnt->mnt_sb->s_umount);
+ ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+ up_write(&mnt->mnt_sb->s_umount);
+ if (ret < 0) {
+ root = ERR_PTR(ret);
+ goto out;
}
}
}
+ if (IS_ERR(mnt)) {
+ root = ERR_CAST(mnt);
+ mnt = NULL;
+ goto out;
+ }
- kfree(newargs);
+ if (!subvol_name) {
+ if (!subvol_objectid) {
+ ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
+ &subvol_objectid);
+ if (ret) {
+ root = ERR_PTR(ret);
+ goto out;
+ }
+ }
+ subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
+ subvol_objectid);
+ if (IS_ERR(subvol_name)) {
+ root = ERR_CAST(subvol_name);
+ subvol_name = NULL;
+ goto out;
+ }
- if (IS_ERR(mnt))
- return ERR_CAST(mnt);
+ }
root = mount_subtree(mnt, subvol_name);
+ /* mount_subtree() drops our reference on the vfsmount. */
+ mnt = NULL;
- if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
+ if (!IS_ERR(root)) {
struct super_block *s = root->d_sb;
- dput(root);
- root = ERR_PTR(-EINVAL);
- deactivate_locked_super(s);
- printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
- subvol_name);
+ struct inode *root_inode = d_inode(root);
+ u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+
+ ret = 0;
+ if (!is_subvolume_inode(root_inode)) {
+ pr_err("BTRFS: '%s' is not a valid subvolume\n",
+ subvol_name);
+ ret = -EINVAL;
+ }
+ if (subvol_objectid && root_objectid != subvol_objectid) {
+ /*
+ * This will also catch a race condition where a
+ * subvolume which was passed by ID is renamed and
+ * another subvolume is renamed over the old location.
+ */
+ pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n",
+ subvol_name, subvol_objectid);
+ ret = -EINVAL;
+ }
+ if (ret) {
+ dput(root);
+ root = ERR_PTR(ret);
+ deactivate_locked_super(s);
+ }
}
+out:
+ mntput(mnt);
+ kfree(newargs);
+ kfree(subvol_name);
return root;
}
@@ -1303,7 +1419,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
{
struct block_device *bdev = NULL;
struct super_block *s;
- struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
struct security_mnt_opts new_sec_opts;
@@ -1323,10 +1438,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
return ERR_PTR(error);
}
- if (subvol_name) {
- root = mount_subvol(subvol_name, flags, device_name, data);
- kfree(subvol_name);
- return root;
+ if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+ /* mount_subvol() will free subvol_name. */
+ return mount_subvol(subvol_name, subvol_objectid, flags,
+ device_name, data);
}
security_init_mnt_opts(&new_sec_opts);
@@ -1392,23 +1507,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
}
-
- root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
- if (IS_ERR(root)) {
+ if (error) {
deactivate_locked_super(s);
- error = PTR_ERR(root);
goto error_sec_opts;
}
fs_info = btrfs_sb(s);
error = setup_security_options(fs_info, s, &new_sec_opts);
if (error) {
- dput(root);
deactivate_locked_super(s);
goto error_sec_opts;
}
- return root;
+ return dget(s->s_root);
error_close_devices:
btrfs_close_devices(fs_devices);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e8a4c86d2..603b0cc2b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -33,6 +33,7 @@
#include "volumes.h"
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
static u64 get_features(struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set)
@@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
-static struct attribute *btrfs_attrs[] = {
+static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
BTRFS_ATTR_PTR(nodesize),
BTRFS_ATTR_PTR(sectorsize),
@@ -438,21 +439,29 @@ static struct attribute *btrfs_attrs[] = {
static void btrfs_release_super_kobj(struct kobject *kobj)
{
- struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- complete(&fs_info->kobj_unregister);
+ struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
+
+ memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+ complete(&fs_devs->kobj_unregister);
}
static struct kobj_type btrfs_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = btrfs_release_super_kobj,
- .default_attrs = btrfs_attrs,
};
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
+{
+ if (kobj->ktype != &btrfs_ktype)
+ return NULL;
+ return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+}
+
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
{
if (kobj->ktype != &btrfs_ktype)
return NULL;
- return container_of(kobj, struct btrfs_fs_info, super_kobj);
+ return to_fs_devs(kobj)->fs_info;
}
#define NUM_FEATURE_BITS 64
@@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
attrs[0] = &fa->kobj_attr.attr;
if (add) {
int ret;
- ret = sysfs_merge_group(&fs_info->super_kobj,
+ ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
&agroup);
if (ret)
return ret;
} else
- sysfs_unmerge_group(&fs_info->super_kobj,
+ sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
&agroup);
}
@@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
return 0;
}
-static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+{
+ if (fs_devs->device_dir_kobj) {
+ kobject_del(fs_devs->device_dir_kobj);
+ kobject_put(fs_devs->device_dir_kobj);
+ fs_devs->device_dir_kobj = NULL;
+ }
+
+ if (fs_devs->super_kobj.state_initialized) {
+ kobject_del(&fs_devs->super_kobj);
+ kobject_put(&fs_devs->super_kobj);
+ wait_for_completion(&fs_devs->kobj_unregister);
+ }
+}
+
+/* when fs_devs is NULL it will remove all fsid kobject */
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
{
- kobject_del(&fs_info->super_kobj);
- kobject_put(&fs_info->super_kobj);
- wait_for_completion(&fs_info->kobj_unregister);
+ struct list_head *fs_uuids = btrfs_get_fs_uuids();
+
+ if (fs_devs) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ return;
+ }
+
+ list_for_each_entry(fs_devs, fs_uuids, list) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ }
}
void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
{
+ btrfs_reset_fs_info_ptr(fs_info);
+
if (fs_info->space_info_kobj) {
sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
- kobject_del(fs_info->device_dir_kobj);
- kobject_put(fs_info->device_dir_kobj);
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
- __btrfs_sysfs_remove_one(fs_info);
+ sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
+ btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
}
const char * const btrfs_feature_set_names[3] = {
@@ -602,40 +635,60 @@ static void init_feature_attrs(void)
}
}
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+/* when one_device is NULL, it removes all device links */
+
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
- if (!fs_info->device_dir_kobj)
+ if (!fs_devices->device_dir_kobj)
return -EINVAL;
if (one_device && one_device->bdev) {
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_info->device_dir_kobj,
+ sysfs_remove_link(fs_devices->device_dir_kobj,
+ disk_kobj->name);
+ }
+
+ if (one_device)
+ return 0;
+
+ list_for_each_entry(one_device,
+ &fs_devices->devices, dev_list) {
+ if (!one_device->bdev)
+ continue;
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_devices->device_dir_kobj,
disk_kobj->name);
}
return 0;
}
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
{
- int error = 0;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- struct btrfs_device *dev;
-
- if (!fs_info->device_dir_kobj)
- fs_info->device_dir_kobj = kobject_create_and_add("devices",
- &fs_info->super_kobj);
+ if (!fs_devs->device_dir_kobj)
+ fs_devs->device_dir_kobj = kobject_create_and_add("devices",
+ &fs_devs->super_kobj);
- if (!fs_info->device_dir_kobj)
+ if (!fs_devs->device_dir_kobj)
return -ENOMEM;
+ return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device *one_device)
+{
+ int error = 0;
+ struct btrfs_device *dev;
+
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
struct hd_struct *disk;
struct kobject *disk_kobj;
@@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
disk = dev->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_info->device_dir_kobj,
+ error = sysfs_create_link(fs_devices->device_dir_kobj,
disk_kobj, disk_kobj->name);
if (error)
break;
@@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry;
/* Debugging tunables and exported data */
u64 btrfs_debugfs_test;
+/*
+ * Can be called by the device discovery thread.
+ * And parent can be specified for seed device
+ */
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+ struct kobject *parent)
+{
+ int error;
+
+ init_completion(&fs_devs->kobj_unregister);
+ fs_devs->super_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_devs->super_kobj,
+ &btrfs_ktype, parent, "%pU", fs_devs->fsid);
+ return error;
+}
+
int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
{
int error;
+ struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
+ struct kobject *super_kobj = &fs_devs->super_kobj;
+
+ btrfs_set_fs_info_ptr(fs_info);
- init_completion(&fs_info->kobj_unregister);
- fs_info->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
- "%pU", fs_info->fsid);
+ error = btrfs_kobj_add_device(fs_devs, NULL);
if (error)
return error;
- error = sysfs_create_group(&fs_info->super_kobj,
- &btrfs_feature_attr_group);
+ error = sysfs_create_files(super_kobj, btrfs_attrs);
if (error) {
- __btrfs_sysfs_remove_one(fs_info);
+ btrfs_kobj_rm_device(fs_devs, NULL);
return error;
}
- error = addrm_unknown_feature_attrs(fs_info, true);
+ error = sysfs_create_group(super_kobj,
+ &btrfs_feature_attr_group);
if (error)
goto failure;
- error = btrfs_kobj_add_device(fs_info, NULL);
+ error = addrm_unknown_feature_attrs(fs_info, true);
if (error)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
- &fs_info->super_kobj);
+ super_kobj);
if (!fs_info->space_info_kobj) {
error = -ENOMEM;
goto failure;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 3a4bbed72..6392527bc 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,8 +82,12 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+ struct kobject *parent);
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index c32a7ba76..846d277b1 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -21,6 +21,7 @@
#include "../transaction.h"
#include "../disk-io.h"
#include "../qgroup.h"
+#include "../backref.h"
static void init_dummy_trans(struct btrfs_trans_handle *trans)
{
@@ -227,6 +228,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
int ret;
init_dummy_trans(&trans);
@@ -238,10 +241,15 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
return ret;
}
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ /*
+ * Since the test trans doesn't have the complicated delayed refs,
+ * we can only call btrfs_qgroup_account_extent() directly to test
+ * quota.
+ */
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
if (ret) {
- test_msg("Couldn't add space to a qgroup %d\n", ret);
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
@@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
if (ret)
return ret;
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Delayed qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
+ old_roots = NULL;
+ new_roots = NULL;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
ret = remove_extent_item(root, 4096, 4096);
if (ret)
return -EINVAL;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Couldn't remove space from the qgroup %d\n", ret);
- return -EINVAL;
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return -EINVAL;
}
@@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
int ret;
init_dummy_trans(&trans);
@@ -307,20 +337,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return ret;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Couldn't add space to a qgroup %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Delayed qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return -EINVAL;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = add_tree_ref(root, 4096, 4096, 0, 256);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Qgroup record ref failed %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return -EINVAL;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = remove_extent_ref(root, 4096, 4096, 0, 256);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
- BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Qgroup record ref failed %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 94e909c5a..f5021fcb1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -225,12 +225,14 @@ loop:
cur_trans->dirty_bg_run = 0;
cur_trans->delayed_refs.href_root = RB_ROOT;
+ cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
cur_trans->delayed_refs.num_heads_ready = 0;
cur_trans->delayed_refs.pending_csums = 0;
cur_trans->delayed_refs.num_heads = 0;
cur_trans->delayed_refs.flushing = 0;
cur_trans->delayed_refs.run_delayed_start = 0;
+ cur_trans->delayed_refs.qgroup_to_skip = 0;
/*
* although the tree mod log is per file system and not per transaction,
@@ -509,6 +511,7 @@ again:
h->transaction = cur_trans;
h->blocks_used = 0;
h->bytes_reserved = 0;
+ h->chunk_bytes_reserved = 0;
h->root = root;
h->delayed_ref_updates = 0;
h->use_count = 1;
@@ -792,6 +795,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
+
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root) &&
ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -1290,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (pending->error)
goto no_free_objectid;
+ /*
+ * Make qgroup accounting skip the new snapshot's qgroupid, as it is
+ * accounted for later by btrfs_qgroup_inherit().
+ */
+ btrfs_set_skip_qgroup(trans, objectid);
+
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
if (to_reserve > 0) {
@@ -1298,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
to_reserve,
BTRFS_RESERVE_NO_FLUSH);
if (pending->error)
- goto no_free_objectid;
+ goto clear_skip_qgroup;
}
key.objectid = objectid;
@@ -1396,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, root, ret);
goto fail;
}
-
- /*
- * We need to flush delayed refs in order to make sure all of our quota
- * operations have been done before we call btrfs_qgroup_inherit.
- */
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
- ret = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_wmb();
@@ -1497,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
}
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /*
+ * account qgroup counters before qgroup_inherit()
+ */
+ ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+ if (ret)
+ goto fail;
+ ret = btrfs_qgroup_account_extents(trans, fs_info);
+ if (ret)
+ goto fail;
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
fail:
pending->error = ret;
dir_item_existed:
trans->block_rsv = rsv;
trans->bytes_reserved = 0;
+clear_skip_qgroup:
+ btrfs_clear_skip_qgroup(trans);
no_free_objectid:
kfree(new_root_item);
root_item_alloc_fail:
@@ -1963,6 +1981,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto scrub_continue;
}
+ /* Record old roots for later qgroup accounting */
+ ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
/*
* make sure none of the code above managed to slip in a
* delayed item
@@ -2004,6 +2029,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
btrfs_free_log_root_tree(trans, root->fs_info);
+ /*
+ * Since fs roots are all committed, we can get a quite accurate
+ * new_roots. So let's do quota accounting.
+ */
+ ret = btrfs_qgroup_account_extents(trans, root->fs_info);
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
ret = commit_cowonly_roots(trans, root);
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
@@ -2054,6 +2090,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+ btrfs_trans_release_chunk_metadata(trans);
+
spin_lock(&root->fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
@@ -2114,7 +2152,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- if (current != root->fs_info->transaction_kthread)
+ if (current != root->fs_info->transaction_kthread &&
+ current != root->fs_info->cleaner_kthread)
btrfs_run_delayed_iputs(root);
return ret;
@@ -2123,6 +2162,7 @@ scrub_continue:
btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
if (trans->qgroup_reserved) {
btrfs_qgroup_free(root, trans->qgroup_reserved);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0b2475559..eb09c2067 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -102,6 +102,7 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
+ u64 chunk_bytes_reserved;
u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
@@ -153,6 +154,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
spin_unlock(&BTRFS_I(inode)->lock);
}
+/*
+ * Make the qgroup code skip the given qgroupid, meaning the old/new_roots
+ * used for qgroup accounting won't contain that qgroupid.
+ */
+static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
+ u64 qgroupid)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = qgroupid;
+}
+
+static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(!delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = 0;
+}
+
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a63719cc9..a4b9c8b2d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
goto out;
- if (btrfs_test_opt(root, SSD))
- goto out;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4920fceff..9c45431e6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3881,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
&ordered->flags))
continue;
- if (ordered->csum_bytes_left) {
- btrfs_start_ordered_extent(inode, ordered, 0);
- wait_event(ordered->wait,
- ordered->csum_bytes_left == 0);
- }
-
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
if (ret)
@@ -4123,6 +4117,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
return 0;
}
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If a xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if a xattr is deleted, the inode is fsynced and a power failure
+ * happens, causing the log to be replayed the next time the fs is mounted,
+ * we want the xattr to not exist anymore (same behaviour as other filesystems
+ * with a journal, ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path)
+{
+ int ret;
+ struct btrfs_key key;
+ const u64 ino = btrfs_ino(inode);
+ int ins_nr = 0;
+ int start_slot = 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ int nritems = btrfs_header_nritems(leaf);
+
+ if (slot >= nritems) {
+ if (ins_nr > 0) {
+ u64 last_extent = 0;
+
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ cond_resched();
+ }
+ if (ins_nr > 0) {
+ u64 last_extent = 0;
+
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * If the no holes feature is enabled we need to make sure any hole between the
+ * last extent and the i_size of our inode is explicitly marked in the log. This
+ * is to make sure that doing something like:
+ *
+ * 1) create file with 128Kb of data
+ * 2) truncate file to 64Kb
+ * 3) truncate file to 256Kb
+ * 4) fsync file
+ * 5) <crash/power failure>
+ * 6) mount fs and trigger log replay
+ *
+ * Will give us a file with a size of 256Kb, the first 64Kb of data match what
+ * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
+ * file correspond to a hole. The presence of explicit holes in a log tree is
+ * what guarantees that log replay will remove/adjust file extent items in the
+ * fs/subvol tree.
+ *
+ * Here we do not need to care about holes between extents, that is already done
+ * by copy_items(). We also only need to do this in the full sync path, where we
+ * lookup for extents from the fs/subvol tree only. In the fast path case, we
+ * lookup the list of modified extent maps and if any represents a hole, we
+ * insert a corresponding extent representing a hole in the log tree.
+ */
+static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ u64 hole_start;
+ u64 hole_size;
+ struct extent_buffer *leaf;
+ struct btrfs_root *log = root->log_root;
+ const u64 ino = btrfs_ino(inode);
+ const u64 i_size = i_size_read(inode);
+
+ if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* inode does not have any extents */
+ hole_start = 0;
+ hole_size = i_size;
+ } else {
+ struct btrfs_file_extent_item *extent;
+ u64 len;
+
+ /*
+ * If there's an extent beyond i_size, an explicit hole was
+ * already inserted by copy_items().
+ */
+ if (key.offset >= i_size)
+ return 0;
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(leaf,
+ path->slots[0],
+ extent);
+ ASSERT(len == i_size);
+ return 0;
+ }
+
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ /* Last extent goes beyond i_size, no need to log a hole. */
+ if (key.offset + len > i_size)
+ return 0;
+ hole_start = key.offset + len;
+ hole_size = i_size - hole_start;
+ }
+ btrfs_release_path(path);
+
+ /* Last extent ends at i_size. */
+ if (hole_size == 0)
+ return 0;
+
+ hole_size = ALIGN(hole_size, root->sectorsize);
+ ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+ hole_size, 0, hole_size, 0, 0, 0);
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4295,6 +4470,25 @@ again:
if (min_key.type == BTRFS_INODE_ITEM_KEY)
need_log_inode_item = false;
+ /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+ if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ins_nr = 0;
+ if (ret) {
+ btrfs_release_path(path);
+ continue;
+ }
+ goto next_slot;
+ }
+
src = path->nodes[0];
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
@@ -4362,6 +4556,18 @@ next_slot:
ins_nr = 0;
}
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+ if (err)
+ goto out_unlock;
+ if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ err = btrfs_log_trailing_hole(trans, root, inode, path);
+ if (err)
+ goto out_unlock;
+ }
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 840a38b27..91feb2bde 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
return NULL;
}
+static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
+{
+ rb_erase(&node->rb_node, &ulist->root);
+ list_del(&node->list);
+ kfree(node);
+ BUG_ON(ulist->nnodes == 0);
+ ulist->nnodes--;
+}
+
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
struct rb_node **p = &ulist->root.rb_node;
@@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
node->val = val;
node->aux = aux;
-#ifdef CONFIG_BTRFS_DEBUG
- node->seqnum = ulist->nnodes;
-#endif
ret = ulist_rbtree_insert(ulist, node);
ASSERT(!ret);
@@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
return 1;
}
+/*
+ * ulist_del - delete one node from ulist
+ * @ulist: ulist to remove node from
+ * @val: value to delete
+ * @aux: aux to delete
+ *
+ * The deletion will only be done when *BOTH* val and aux matches.
+ * Return 0 for successful delete.
+ * Return > 0 for not found.
+ */
+int ulist_del(struct ulist *ulist, u64 val, u64 aux)
+{
+ struct ulist_node *node;
+
+ node = ulist_rbtree_search(ulist, val);
+ /* Not found */
+ if (!node)
+ return 1;
+
+ if (node->aux != aux)
+ return 1;
+
+ /* Found and delete */
+ ulist_rbtree_erase(ulist, node);
+ return 0;
+}
+
/**
* ulist_next - iterate ulist
* @ulist: ulist to iterate
@@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
uiter->cur_list = uiter->cur_list->next;
} else {
uiter->cur_list = ulist->nodes.next;
-#ifdef CONFIG_BTRFS_DEBUG
- uiter->i = 0;
-#endif
}
node = list_entry(uiter->cur_list, struct ulist_node, list);
-#ifdef CONFIG_BTRFS_DEBUG
- ASSERT(node->seqnum == uiter->i);
- ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
- uiter->i++;
-#endif
return node;
}
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 4c29db604..a01a2c458 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 *old_aux, gfp_t gfp_mask);
+int ulist_del(struct ulist *ulist, u64 val, u64 aux);
/* just like ulist_add_merge() but take a pointer for the aux data */
static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 174f5e1e0..fbe7c1045 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
+struct list_head *btrfs_get_fs_uuids(void)
+{
+ return &fs_uuids;
+}
static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
@@ -345,7 +349,7 @@ loop_lock:
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
- BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+ BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
/*
* if we're doing the sync list, record that our
@@ -441,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+
+void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+{
+ struct btrfs_fs_devices *fs_devs;
+ struct btrfs_device *dev;
+
+ if (!cur_dev->name)
+ return;
+
+ list_for_each_entry(fs_devs, &fs_uuids, list) {
+ int del = 1;
+
+ if (fs_devs->opened)
+ continue;
+ if (fs_devs->seeding)
+ continue;
+
+ list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+
+ if (dev == cur_dev)
+ continue;
+ if (!dev->name)
+ continue;
+
+ /*
+ * Todo: This won't be enough. What if the same device
+ * comes back (with new uuid and) with its mapper path?
+ * But for now, this does help as mostly an admin will
+ * either use mapper or non mapper path throughout.
+ */
+ rcu_read_lock();
+ del = strcmp(rcu_str_deref(dev->name),
+ rcu_str_deref(cur_dev->name));
+ rcu_read_unlock();
+ if (!del)
+ break;
+ }
+
+ if (!del) {
+ /* delete the stale device */
+ if (fs_devs->num_devices == 1) {
+ btrfs_sysfs_remove_fsid(fs_devs);
+ list_del(&fs_devs->list);
+ free_fs_devices(fs_devs);
+ } else {
+ fs_devs->num_devices--;
+ list_del(&dev->dev_list);
+ rcu_string_free(dev->name);
+ kfree(dev);
+ }
+ break;
+ }
+ }
+}
+
/*
* Add new device to list of registered devices
*
@@ -556,6 +615,12 @@ static noinline int device_list_add(const char *path,
if (!fs_devices->opened)
device->generation = found_transid;
+ /*
+ * if there is new btrfs on an already registered device,
+ * then remove the stale device entry.
+ */
+ btrfs_free_stale_device(device);
+
*fs_devices_ret = fs_devices;
return ret;
@@ -693,13 +758,13 @@ static void free_device(struct rcu_head *head)
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
+ struct btrfs_device *device, *tmp;
if (--fs_devices->opened > 0)
return 0;
mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
struct btrfs_device *new_device;
struct rcu_string *name;
@@ -1067,15 +1132,31 @@ again:
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
+ u64 end;
+
if (map->stripes[i].dev != device)
continue;
if (map->stripes[i].physical >= physical_start + len ||
map->stripes[i].physical + em->orig_block_len <=
physical_start)
continue;
- *start = map->stripes[i].physical +
- em->orig_block_len;
- ret = 1;
+ /*
+ * Make sure that while processing the pinned list we do
+ * not override our *start with a lower value, because
+ * we can have pinned chunks that fall within this
+ * device hole and that have lower physical addresses
+ * than the pending chunks we processed before. If we
+ * do not take this special care we can end up getting
+ * 2 pending chunks that start at the same physical
+ * device offsets because the end offset of a pinned
+ * chunk can be equal to the start offset of some
+ * pending chunk.
+ */
+ end = map->stripes[i].physical + em->orig_block_len;
+ if (end > *start) {
+ *start = end;
+ ret = 1;
+ }
}
}
if (search_list == &trans->transaction->pending_chunks) {
@@ -1706,7 +1787,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
- btrfs_kobj_rm_device(root->fs_info, device);
+ btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
}
call_rcu(&device->rcu, free_device);
@@ -1875,6 +1956,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
+
+ btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+
if (tgtdev->bdev) {
btrfs_scratch_superblock(tgtdev);
fs_info->fs_devices->open_devices--;
@@ -2211,7 +2295,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
tmp + 1);
/* add sysfs device entry */
- btrfs_kobj_add_device(root->fs_info, device);
+ btrfs_kobj_add_device(root->fs_info->fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
@@ -2252,8 +2336,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
- if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
- goto error_trans;
+ if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+ fsid_buf))
+ pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
}
root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2289,7 +2374,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
error_trans:
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
- btrfs_kobj_rm_device(root->fs_info, device);
+ btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2609,6 +2694,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
return -EINVAL;
}
map = (struct map_lookup *)em->bdev;
+ lock_chunks(root->fs_info->chunk_root);
+ check_system_chunk(trans, extent_root, map->type);
+ unlock_chunks(root->fs_info->chunk_root);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
@@ -2678,6 +2766,20 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
root = root->fs_info->chunk_root;
extent_root = root->fs_info->extent_root;
+ /*
+ * Prevent races with automatic removal of unused block groups.
+ * After we relocate and before we remove the chunk with offset
+ * chunk_offset, automatic removal of the block group can kick in,
+ * resulting in a failure when calling btrfs_remove_chunk() below.
+ *
+ * Make sure to acquire this mutex before doing a tree search (dev
+ * or chunk trees) to find chunks. Otherwise the cleaner kthread might
+ * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
+ * we release the path used to search the chunk/dev tree and before
+ * the current task acquires this mutex and calls us.
+ */
+ ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+
ret = btrfs_can_relocate(extent_root, chunk_offset);
if (ret)
return -ENOSPC;
@@ -2726,13 +2828,18 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
goto error;
+ }
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
+ if (ret)
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto error;
if (ret > 0)
@@ -2755,6 +2862,7 @@ again:
else
BUG_ON(ret);
}
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (found_key.offset == 0)
break;
@@ -3211,9 +3319,12 @@ again:
goto error;
}
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
+ }
/*
* this shouldn't happen, it means the last relocate
@@ -3225,6 +3336,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ret = 0;
break;
}
@@ -3233,8 +3345,10 @@ again:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
- if (found_key.objectid != key.objectid)
+ if (found_key.objectid != key.objectid) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
break;
+ }
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
@@ -3247,10 +3361,13 @@ again:
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
btrfs_release_path(path);
- if (!ret)
+ if (!ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto loop;
+ }
if (counting) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
@@ -3260,6 +3377,7 @@ again:
ret = btrfs_relocate_chunk(chunk_root,
found_key.objectid,
found_key.offset);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto error;
if (ret == -ENOSPC) {
@@ -3908,9 +4026,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
uuid_root = btrfs_create_tree(trans, fs_info,
BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
- btrfs_abort_transaction(trans, tree_root,
- PTR_ERR(uuid_root));
- return PTR_ERR(uuid_root);
+ ret = PTR_ERR(uuid_root);
+ btrfs_abort_transaction(trans, tree_root, ret);
+ return ret;
}
fs_info->uuid_root = uuid_root;
@@ -3965,6 +4083,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
int slot;
int failed = 0;
bool retried = false;
+ bool checked_pending_chunks = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
@@ -3998,11 +4117,16 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
goto done;
+ }
ret = btrfs_previous_item(root, path, 0, key.type);
+ if (ret)
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto done;
if (ret) {
@@ -4016,6 +4140,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
@@ -4024,6 +4149,7 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
@@ -4033,6 +4159,7 @@ again:
btrfs_release_path(path);
ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)
@@ -4045,15 +4172,6 @@ again:
goto again;
} else if (failed && retried) {
ret = -ENOSPC;
- lock_chunks(root);
-
- btrfs_device_set_total_bytes(device, old_size);
- if (device->writeable)
- device->fs_devices->total_rw_bytes += diff;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += diff;
- spin_unlock(&root->fs_info->free_chunk_lock);
- unlock_chunks(root);
goto done;
}
@@ -4065,6 +4183,35 @@ again:
}
lock_chunks(root);
+
+ /*
+ * We checked in the above loop all device extents that were already in
+ * the device tree. However before we have updated the device's
+ * total_bytes to the new size, we might have had chunk allocations that
+ * have not completed yet (new block groups attached to transaction
+ * handles), and therefore their device extents were not yet in the
+ * device tree and we missed them in the loop above. So if we have any
+ * pending chunk using a device extent that overlaps the device range
+ * that we can not use anymore, commit the current transaction and
+ * repeat the search on the device tree - this way we guarantee we will
+ * not have chunks using device extents that end beyond 'new_size'.
+ */
+ if (!checked_pending_chunks) {
+ u64 start = new_size;
+ u64 len = old_size - new_size;
+
+ if (contains_pending_extent(trans, device, &start, len)) {
+ unlock_chunks(root);
+ checked_pending_chunks = true;
+ failed = 0;
+ retried = false;
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ goto done;
+ goto again;
+ }
+ }
+
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->resized_list))
list_add_tail(&device->resized_list,
@@ -4079,6 +4226,16 @@ again:
btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
+ if (ret) {
+ lock_chunks(root);
+ btrfs_device_set_total_bytes(device, old_size);
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ unlock_chunks(root);
+ }
return ret;
}
@@ -5586,17 +5743,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
{
- if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
- bio_endio_nodec(bio, err);
- else
- bio_endio(bio, err);
+ bio->bi_private = bbio->private;
+ bio->bi_end_io = bbio->end_io;
+ bio_endio(bio, err);
+
btrfs_put_bbio(bbio);
}
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
- struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
if (err) {
@@ -5604,6 +5760,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
+ struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
@@ -5633,8 +5790,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
bio = bbio->orig_bio;
}
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
@@ -5816,8 +5971,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
/* Shoud be the original bio. */
WARN_ON(bio != bbio->orig_bio);
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
@@ -5898,10 +6051,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
if (dev_nr < total_devs - 1) {
bio = btrfs_bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio); /* -ENOMEM */
- } else {
+ } else
bio = first_bio;
- bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
- }
submit_stripe_bio(root, bbio, bio,
bbio->stripes[dev_nr].physical, dev_nr, rw,
@@ -6078,6 +6229,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
free_extent_map(em);
return -EIO;
}
+ btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
}
map->stripes[i].dev->in_fs_metadata = 1;
}
@@ -6197,10 +6350,11 @@ static int read_one_dev(struct btrfs_root *root,
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
- btrfs_warn(root->fs_info, "devid %llu missing", devid);
device = add_missing_dev(root, fs_devices, devid, dev_uuid);
if (!device)
return -ENOMEM;
+ btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
+ devid, dev_uuid);
} else {
if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
return -EIO;
@@ -6728,3 +6882,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
}
unlock_chunks(root);
}
+
+/*
+ * Store @fs_info as the fs_info back-pointer of fs_info->fs_devices and of
+ * every fs_devices reachable from it through the ->seed chain.
+ */
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ while (fs_devices) {
+ fs_devices->fs_info = fs_info;
+ fs_devices = fs_devices->seed;
+ }
+}
+
+/*
+ * Clear (set to NULL) the fs_info back-pointer of fs_info->fs_devices and of
+ * every fs_devices reachable from it through the ->seed chain.
+ */
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ while (fs_devices) {
+ fs_devices->fs_info = NULL;
+ fs_devices = fs_devices->seed;
+ }
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ebc31331a..95842a909 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -253,6 +253,12 @@ struct btrfs_fs_devices {
* nonrot flag set
*/
int rotating;
+
+ struct btrfs_fs_info *fs_info;
+ /* sysfs kobjects */
+ struct kobject super_kobj;
+ struct kobject *device_dir_kobj;
+ struct completion kobj_unregister;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -292,8 +298,6 @@ struct btrfs_bio_stripe {
struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
-#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
-
struct btrfs_bio {
atomic_t refs;
atomic_t stripes_pending;
@@ -537,5 +541,8 @@ static inline void unlock_chunks(struct btrfs_root *root)
mutex_unlock(&root->fs_info->chunk_mutex);
}
+/* Returns the list head (fs_uuids) that the btrfs_fs_devices entries are linked on. */
+struct list_head *btrfs_get_fs_uuids(void);
+/* Points fs_devices->fs_info at @fs_info for fs_info->fs_devices and its seed chain. */
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
+/* Sets fs_devices->fs_info to NULL for fs_info->fs_devices and its seed chain. */
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
#endif