summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-01-20 14:01:31 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-01-20 14:01:31 -0300
commitb4b7ff4b08e691656c9d77c758fc355833128ac0 (patch)
tree82fcb00e6b918026dc9f2d1f05ed8eee83874cc0 /fs/btrfs
parent35acfa0fc609f2a2cd95cef4a6a9c3a5c38f1778 (diff)
Linux-libre 4.4-gnupck-4.4-gnu
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c8
-rw-r--r--fs/btrfs/check-integrity.c8
-rw-r--r--fs/btrfs/compression.c104
-rw-r--r--fs/btrfs/ctree.c10
-rw-r--r--fs/btrfs/ctree.h182
-rw-r--r--fs/btrfs/delayed-inode.c4
-rw-r--r--fs/btrfs/delayed-ref.c51
-rw-r--r--fs/btrfs/delayed-ref.h16
-rw-r--r--fs/btrfs/dev-replace.c55
-rw-r--r--fs/btrfs/disk-io.c171
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c661
-rw-r--r--fs/btrfs/extent_io.c169
-rw-r--r--fs/btrfs/extent_io.h19
-rw-r--r--fs/btrfs/file.c236
-rw-r--r--fs/btrfs/free-space-cache.c93
-rw-r--r--fs/btrfs/free-space-cache.h1
-rw-r--r--fs/btrfs/inode-item.c2
-rw-r--r--fs/btrfs/inode-map.c6
-rw-r--r--fs/btrfs/inode.c187
-rw-r--r--fs/btrfs/ioctl.c23
-rw-r--r--fs/btrfs/locking.c12
-rw-r--r--fs/btrfs/ordered-data.c70
-rw-r--r--fs/btrfs/ordered-data.h2
-rw-r--r--fs/btrfs/props.c13
-rw-r--r--fs/btrfs/qgroup.c247
-rw-r--r--fs/btrfs/qgroup.h31
-rw-r--r--fs/btrfs/raid56.c6
-rw-r--r--fs/btrfs/reada.c8
-rw-r--r--fs/btrfs/relocation.c10
-rw-r--r--fs/btrfs/root-tree.c11
-rw-r--r--fs/btrfs/scrub.c279
-rw-r--r--fs/btrfs/send.c202
-rw-r--r--fs/btrfs/super.c57
-rw-r--r--fs/btrfs/sysfs.c52
-rw-r--r--fs/btrfs/sysfs.h4
-rw-r--r--fs/btrfs/tests/free-space-tests.c24
-rw-r--r--fs/btrfs/transaction.c162
-rw-r--r--fs/btrfs/transaction.h26
-rw-r--r--fs/btrfs/tree-log.c22
-rw-r--r--fs/btrfs/volumes.c454
-rw-r--r--fs/btrfs/volumes.h18
42 files changed, 2617 insertions, 1101 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9a2ec79e8..d453d62ab 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -355,13 +355,19 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
index = srcu_read_lock(&fs_info->subvol_srcu);
- root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ root = btrfs_get_fs_root(fs_info, &root_key, false);
if (IS_ERR(root)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(root);
goto out;
}
+ if (btrfs_test_is_dummy_root(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -ENOENT;
+ goto out;
+ }
+
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
else if (time_seq == (u64)-1)
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 541fbfaed..0340c57bf 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -667,7 +667,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
- return -1;
+ return -ENOMEM;
}
list_for_each_entry(device, dev_head, dev_list) {
@@ -845,8 +845,8 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->never_written = 0;
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
- " @%llu (%s/%llu/%d)\n",
+ btrfs_info_in_rcu(device->dev_root->fs_info,
+ "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
dev_state->name, dev_bytenr,
@@ -1660,7 +1660,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
sizeof(*block_ctx->pagev)) *
num_pages, GFP_NOFS);
if (!block_ctx->mem_to_free)
- return -1;
+ return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
for (i = 0; i < num_pages; i++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 57ee8ca29..c473c42d7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -482,13 +482,12 @@ static noinline int add_ra_bio_pages(struct inode *inode,
goto next;
}
- page = __page_cache_alloc(mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ page = __page_cache_alloc(mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
break;
- if (add_to_page_cache_lru(page, mapping, pg_index,
- GFP_NOFS)) {
+ if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
page_cache_release(page);
goto next;
}
@@ -745,11 +744,13 @@ out:
return ret;
}
-static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
-static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
-static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
-static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
-static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+static struct {
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ int num_ws;
+ atomic_t alloc_ws;
+ wait_queue_head_t ws_wait;
+} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zlib_compress,
@@ -761,10 +762,10 @@ void __init btrfs_init_compress(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- INIT_LIST_HEAD(&comp_idle_workspace[i]);
- spin_lock_init(&comp_workspace_lock[i]);
- atomic_set(&comp_alloc_workspace[i], 0);
- init_waitqueue_head(&comp_workspace_wait[i]);
+ INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
+ spin_lock_init(&btrfs_comp_ws[i].ws_lock);
+ atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+ init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
}
}
@@ -778,38 +779,38 @@ static struct list_head *find_workspace(int type)
int cpus = num_online_cpus();
int idx = type - 1;
- struct list_head *idle_workspace = &comp_idle_workspace[idx];
- spinlock_t *workspace_lock = &comp_workspace_lock[idx];
- atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
- wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
- int *num_workspace = &comp_num_workspace[idx];
+ struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ int *num_ws = &btrfs_comp_ws[idx].num_ws;
again:
- spin_lock(workspace_lock);
- if (!list_empty(idle_workspace)) {
- workspace = idle_workspace->next;
+ spin_lock(ws_lock);
+ if (!list_empty(idle_ws)) {
+ workspace = idle_ws->next;
list_del(workspace);
- (*num_workspace)--;
- spin_unlock(workspace_lock);
+ (*num_ws)--;
+ spin_unlock(ws_lock);
return workspace;
}
- if (atomic_read(alloc_workspace) > cpus) {
+ if (atomic_read(alloc_ws) > cpus) {
DEFINE_WAIT(wait);
- spin_unlock(workspace_lock);
- prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+ spin_unlock(ws_lock);
+ prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(alloc_ws) > cpus && !*num_ws)
schedule();
- finish_wait(workspace_wait, &wait);
+ finish_wait(ws_wait, &wait);
goto again;
}
- atomic_inc(alloc_workspace);
- spin_unlock(workspace_lock);
+ atomic_inc(alloc_ws);
+ spin_unlock(ws_lock);
workspace = btrfs_compress_op[idx]->alloc_workspace();
if (IS_ERR(workspace)) {
- atomic_dec(alloc_workspace);
- wake_up(workspace_wait);
+ atomic_dec(alloc_ws);
+ wake_up(ws_wait);
}
return workspace;
}
@@ -821,27 +822,30 @@ again:
static void free_workspace(int type, struct list_head *workspace)
{
int idx = type - 1;
- struct list_head *idle_workspace = &comp_idle_workspace[idx];
- spinlock_t *workspace_lock = &comp_workspace_lock[idx];
- atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
- wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
- int *num_workspace = &comp_num_workspace[idx];
-
- spin_lock(workspace_lock);
- if (*num_workspace < num_online_cpus()) {
- list_add(workspace, idle_workspace);
- (*num_workspace)++;
- spin_unlock(workspace_lock);
+ struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ int *num_ws = &btrfs_comp_ws[idx].num_ws;
+
+ spin_lock(ws_lock);
+ if (*num_ws < num_online_cpus()) {
+ list_add(workspace, idle_ws);
+ (*num_ws)++;
+ spin_unlock(ws_lock);
goto wake;
}
- spin_unlock(workspace_lock);
+ spin_unlock(ws_lock);
btrfs_compress_op[idx]->free_workspace(workspace);
- atomic_dec(alloc_workspace);
+ atomic_dec(alloc_ws);
wake:
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
- if (waitqueue_active(workspace_wait))
- wake_up(workspace_wait);
+ if (waitqueue_active(ws_wait))
+ wake_up(ws_wait);
}
/*
@@ -853,11 +857,11 @@ static void free_workspaces(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- while (!list_empty(&comp_idle_workspace[i])) {
- workspace = comp_idle_workspace[i].next;
+ while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
+ workspace = btrfs_comp_ws[i].idle_ws.next;
list_del(workspace);
btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&comp_alloc_workspace[i]);
+ atomic_dec(&btrfs_comp_ws[i].alloc_ws);
}
}
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5f745eadf..5b8e235c4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return ret;
if (refs == 0) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
return ret;
}
} else {
@@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
child = read_node_slot(root, mid, 0);
if (!child) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
goto enospc;
}
@@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
*/
if (!left) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
goto enospc;
}
wret = balance_node_right(trans, root, mid, left);
@@ -4940,8 +4940,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct extent_buffer *leaf;
struct btrfs_item *item;
- int last_off;
- int dsize = 0;
+ u32 last_off;
+ u32 dsize = 0;
int ret = 0;
int wret;
int i;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 94eea1f43..35489e712 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -823,8 +823,18 @@ struct btrfs_disk_balance_args {
*/
__le64 profiles;
- /* usage filter */
- __le64 usage;
+ /*
+ * usage filter
+ * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
+ * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
+ */
+ union {
+ __le64 usage;
+ struct {
+ __le32 usage_min;
+ __le32 usage_max;
+ };
+ };
/* devid filter */
__le64 devid;
@@ -846,10 +856,27 @@ struct btrfs_disk_balance_args {
/* BTRFS_BALANCE_ARGS_* */
__le64 flags;
- /* BTRFS_BALANCE_ARGS_LIMIT value */
- __le64 limit;
+ /*
+ * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
+ * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
+ * and maximum
+ */
+ union {
+ __le64 limit;
+ struct {
+ __le32 limit_min;
+ __le32 limit_max;
+ };
+ };
- __le64 unused[7];
+ /*
+ * Process chunks that cross stripes_min..stripes_max devices,
+ * BTRFS_BALANCE_ARGS_STRIPES_RANGE
+ */
+ __le32 stripes_min;
+ __le32 stripes_max;
+
+ __le64 unused[6];
} __attribute__ ((__packed__));
/*
@@ -1154,6 +1181,10 @@ struct btrfs_space_info {
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
+ u64 max_extent_size; /* This will hold the maximum extent size of
+ the space info if we had an ENOSPC in the
+ allocator. */
+
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
@@ -1228,6 +1259,9 @@ struct btrfs_free_cluster {
/* first extent starting offset */
u64 window_start;
+ /* We did a full search and couldn't create a cluster */
+ bool fragmented;
+
struct btrfs_block_group_cache *block_group;
/*
* when a cluster is allocated from a block group, we put the
@@ -1943,6 +1977,9 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshoted;
+
+ /* For qgroup metadata space reserve */
+ atomic_t qgroup_meta_rsv;
};
struct btrfs_ioctl_defrag_range_args {
@@ -2145,6 +2182,8 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
+#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
+#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (8192)
@@ -2169,6 +2208,18 @@ struct btrfs_ioctl_defrag_range_args {
btrfs_clear_opt(root->fs_info->mount_opt, opt); \
}
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int
+btrfs_should_fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ (btrfs_test_opt(root, FRAGMENT_DATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
/*
* Requests for changes that need to be done during transaction commit.
*
@@ -3316,7 +3367,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
- return mapping_gfp_mask(mapping) & ~__GFP_FS;
+ return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
/* extent-tree.c */
@@ -3365,6 +3416,7 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
u64 bytenr);
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int get_block_group_index(struct btrfs_block_group_cache *cache);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -3379,7 +3431,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
- u64 offset, struct btrfs_key *ins);
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins);
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner, u64 offset,
@@ -3427,6 +3480,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start,
struct extent_map *em);
@@ -3449,8 +3505,11 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
};
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -3466,8 +3525,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
u64 qgroup_reserved);
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
unsigned short type);
@@ -4004,8 +4063,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/* sysfs.c */
int btrfs_init_sysfs(void);
void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
/* xattr.c */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -4039,14 +4098,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
#ifdef DEBUG
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
#else
#define btrfs_debug(fs_info, fmt, args...) \
no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
#endif
+#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk_ratelimited(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
#ifdef CONFIG_BTRFS_ASSERT
__cold
@@ -4127,14 +4274,7 @@ do { \
__LINE__, (errno)); \
} while (0)
-#define btrfs_std_error(fs_info, errno) \
-do { \
- if ((errno)) \
- __btrfs_std_error((fs_info), __func__, \
- __LINE__, (errno), NULL); \
-} while (0)
-
-#define btrfs_error(fs_info, errno, fmt, args...) \
+#define btrfs_std_error(fs_info, errno, fmt, args...) \
do { \
__btrfs_std_error((fs_info), __func__, __LINE__, \
(errno), fmt, ##args); \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2ae42720..e0941fbb9 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -463,6 +463,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
+
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if ((atomic_dec_return(&delayed_root->items) <
BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
waitqueue_active(&delayed_root->wait))
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 7832031fe..e06dd75ad 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -535,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
struct btrfs_qgroup_extent_record *qrecord,
- u64 bytenr, u64 num_bytes, int action, int is_data)
+ u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
+ int action, int is_data)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -544,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
int count_mod = 1;
int must_insert_reserved = 0;
+ /* If reserved is provided, it must be a data extent. */
+ BUG_ON(!is_data && reserved);
+
/*
* the head node stores the sum of all the mods, so dropping a ref
* should drop the sum in the head node by one.
@@ -588,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&head_ref->ref_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
+ head_ref->qgroup_reserved = 0;
+ head_ref->qgroup_ref_root = 0;
/* Record qgroup extent info if provided */
if (qrecord) {
+ if (ref_root && reserved) {
+ head_ref->qgroup_ref_root = ref_root;
+ head_ref->qgroup_reserved = reserved;
+ }
+
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
@@ -609,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
+ WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+ && existing->qgroup_reserved);
update_existing_head_ref(delayed_refs, &existing->node, ref);
/*
* we've updated the existing ref, free the newly
@@ -775,7 +788,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* the spin lock
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
- bytenr, num_bytes, action, 0);
+ bytenr, num_bytes, 0, 0, action, 0);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action);
@@ -798,7 +811,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, int action,
+ u64 owner, u64 offset, u64 reserved, int action,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_data_ref *ref;
@@ -837,7 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* the spin lock
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
- bytenr, num_bytes, action, 1);
+ bytenr, num_bytes, ref_root, reserved,
+ action, 1);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
@@ -847,6 +861,33 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
return 0;
}
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *ref_head;
+ int ret = 0;
+
+ if (!fs_info->quota_enabled || !is_fstree(ref_root))
+ return 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
+ if (!ref_head) {
+ ret = -ENOENT;
+ goto out;
+ }
+ WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+ ref_head->qgroup_ref_root = ref_root;
+ ref_head->qgroup_reserved = num_bytes;
+out:
+ spin_unlock(&delayed_refs->lock);
+ return ret;
+}
+
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
@@ -865,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_refs->lock);
add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
- num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 930887a42..00ed02cbf 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -112,6 +112,17 @@ struct btrfs_delayed_ref_head {
int total_ref_mod;
/*
+ * For qgroup reserved space freeing.
+ *
+ * ref_root and reserved will be recorded after
+ * BTRFS_ADD_DELAYED_EXTENT is called.
+ * And will be used to free reserved qgroup space at
+ * run_delayed_refs() time.
+ */
+ u64 qgroup_ref_root;
+ u64 qgroup_reserved;
+
+ /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
@@ -237,8 +248,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, int action,
+ u64 owner, u64 offset, u64 reserved, int action,
struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 ref_root, u64 bytenr, u64 num_bytes);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e54dd5905..1e668fb7d 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
- /*
- * Here we commit the transaction to make sure commit_total_bytes
- * of all the devices are updated.
- */
- trans = btrfs_attach_transaction(root);
- if (!IS_ERR(trans)) {
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
- } else if (PTR_ERR(trans) != -ENOENT) {
- return PTR_ERR(trans);
- }
-
/* the disk copy procedure reuses the scrub code */
mutex_lock(&fs_info->volume_mutex);
ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
@@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
if (ret)
return ret;
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
+ }
+
btrfs_dev_replace_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -375,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
- ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
- if (ret)
- btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
-
- printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+ btrfs_info_in_rcu(root->fs_info,
+ "dev_replace from %s (devid %llu) to %s started",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace);
+ ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+ if (ret)
+ btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
btrfs_wait_ordered_roots(root->fs_info, -1);
/* force writing the updated state information to disk */
@@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- if (waitqueue_active(&fs_info->replace_wait))
- wake_up(&fs_info->replace_wait);
+ wake_up(&fs_info->replace_wait);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -523,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device,
tgt_device);
} else {
- printk_in_rcu(KERN_ERR
- "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ btrfs_err_in_rcu(root->fs_info,
+ "btrfs_scrub_dev(%s, %llu, %s) failed %d",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -540,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
return scrub_ret;
}
- printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
+ btrfs_info_in_rcu(root->fs_info,
+ "dev_replace from %s (devid %llu) to %s finished",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -586,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
/* write back the superblocks */
@@ -809,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data)
progress = status_args->status.progress_1000;
kfree(status_args);
progress = div_u64(progress, 10);
- printk_in_rcu(KERN_INFO
- "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+ btrfs_info_in_rcu(fs_info,
+ "continuing dev_replace from %s (devid %llu) to %s @%u%%",
dev_replace->srcdev->missing ? "<missing disk>" :
rcu_str_deref(dev_replace->srcdev->name),
dev_replace->srcdev->devid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1e60d00d4..974be09e7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -319,9 +319,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- printk_ratelimited(KERN_WARNING
- "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
- "level %d\n",
+ btrfs_warn_rl(fs_info,
+ "%s checksum verify failed on %llu wanted %X found %X "
+ "level %d",
fs_info->sb->s_id, buf->start,
val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
@@ -368,9 +368,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- printk_ratelimited(KERN_ERR
- "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
- eb->fs_info->sb->s_id, eb->start,
+ btrfs_err_rl(eb->fs_info,
+ "parent transid verify failed on %llu wanted %llu found %llu",
+ eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
@@ -629,15 +629,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
- "%llu %llu\n",
- eb->fs_info->sb->s_id, found_start, eb->start);
+ btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
+ found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root->fs_info, eb)) {
- printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
- eb->fs_info->sb->s_id, eb->start);
+ btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
+ eb->start);
ret = -EIO;
goto err;
}
@@ -802,6 +801,9 @@ static void run_one_async_done(struct btrfs_work *work)
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
@@ -1265,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
atomic_set(&root->orphan_inodes, 0);
atomic_set(&root->refs, 1);
atomic_set(&root->will_be_snapshoted, 0);
+ atomic_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -1759,6 +1762,7 @@ static int cleaner_kthread(void *arg)
int again;
struct btrfs_trans_handle *trans;
+ set_freezable();
do {
again = 0;
@@ -2348,8 +2352,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
- printk(KERN_WARNING "BTRFS: log replay required "
- "on RO media\n");
+ btrfs_warn(fs_info, "log replay required on RO media");
return -EIO;
}
@@ -2364,12 +2367,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = read_tree_block(tree_root, bytenr,
fs_info->generation + 1);
if (IS_ERR(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
kfree(log_tree_root);
return ret;
} else if (!extent_buffer_uptodate(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ btrfs_err(fs_info, "failed to read log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
return -EIO;
@@ -2377,7 +2380,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
- btrfs_error(tree_root->fs_info, ret,
+ btrfs_std_error(tree_root->fs_info, ret,
"Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
@@ -2572,7 +2575,7 @@ int open_ctree(struct super_block *sb,
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
/* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -2653,8 +2656,8 @@ int open_ctree(struct super_block *sb,
* Read super block and check the signature bytes only
*/
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (!bh) {
- err = -EINVAL;
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
goto fail_alloc;
}
@@ -2937,7 +2940,7 @@ retry_root_backup:
goto fail_fsdev_sysfs;
}
- ret = btrfs_sysfs_add_one(fs_info);
+ ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
goto fail_fsdev_sysfs;
@@ -3117,7 +3120,7 @@ fail_cleaner:
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
fail_sysfs:
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
fail_fsdev_sysfs:
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -3179,8 +3182,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
struct btrfs_device *device = (struct btrfs_device *)
bh->b_private;
- printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
- "I/O error on %s\n",
+ btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
+ "lost page write due to IO error on %s",
rcu_str_deref(device->name));
/* note, we dont' set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
@@ -3192,6 +3195,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
put_bh(bh);
}
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+ struct buffer_head **bh_ret)
+{
+ struct buffer_head *bh;
+ struct btrfs_super_block *super;
+ u64 bytenr;
+
+ bytenr = btrfs_sb_offset(copy_num);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ return -EINVAL;
+
+ bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
+ /*
+ * If we fail to read from the underlying devices, as of now
+ * the best option we have is to mark it EIO.
+ */
+ if (!bh)
+ return -EIO;
+
+ super = (struct btrfs_super_block *)bh->b_data;
+ if (btrfs_super_bytenr(super) != bytenr ||
+ btrfs_super_magic(super) != BTRFS_MAGIC) {
+ brelse(bh);
+ return -EINVAL;
+ }
+
+ *bh_ret = bh;
+ return 0;
+}
+
+
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
{
struct buffer_head *bh;
@@ -3199,7 +3233,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
struct btrfs_super_block *super;
int i;
u64 transid = 0;
- u64 bytenr;
+ int ret = -EINVAL;
/* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
@@ -3207,21 +3241,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- i_size_read(bdev->bd_inode))
- break;
- bh = __bread(bdev, bytenr / 4096,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh)
+ ret = btrfs_read_dev_one_super(bdev, i, &bh);
+ if (ret)
continue;
super = (struct btrfs_super_block *)bh->b_data;
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
- brelse(bh);
- continue;
- }
if (!latest || btrfs_super_generation(super) > transid) {
brelse(latest);
@@ -3231,6 +3255,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
brelse(bh);
}
}
+
+ if (!latest)
+ return ERR_PTR(ret);
+
return latest;
}
@@ -3299,8 +3327,9 @@ static int write_dev_supers(struct btrfs_device *device,
bh = __getblk(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
if (!bh) {
- printk(KERN_ERR "BTRFS: couldn't get super "
- "buffer head for bytenr %Lu\n", bytenr);
+ btrfs_err(device->dev_root->fs_info,
+ "couldn't get super buffer head for bytenr %llu",
+ bytenr);
errors++;
continue;
}
@@ -3449,22 +3478,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
{
- if ((flags & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
- ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
- return 0;
+ int raid_type;
+ int min_tolerated = INT_MAX;
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID10))
- return 1;
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
+ (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
+ min_tolerated = min(min_tolerated,
+ btrfs_raid_array[BTRFS_RAID_SINGLE].
+ tolerated_failures);
- if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return 2;
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (raid_type == BTRFS_RAID_SINGLE)
+ continue;
+ if (!(flags & btrfs_raid_group[raid_type]))
+ continue;
+ min_tolerated = min(min_tolerated,
+ btrfs_raid_array[raid_type].
+ tolerated_failures);
+ }
- pr_warn("BTRFS: unknown raid type: %llu\n", flags);
- return 0;
+ if (min_tolerated == INT_MAX) {
+ pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
+ min_tolerated = 0;
+ }
+
+ return min_tolerated;
}
int btrfs_calc_num_tolerated_disk_barrier_failures(
@@ -3548,7 +3586,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (ret) {
mutex_unlock(
&root->fs_info->fs_devices->device_list_mutex);
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"errors while submitting device barriers.");
return ret;
}
@@ -3588,7 +3626,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
/* FUA is masked off if unsupported and can't be the reason */
- btrfs_error(root->fs_info, -EIO,
+ btrfs_std_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3606,7 +3644,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- btrfs_error(root->fs_info, -EIO,
+ btrfs_std_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3742,6 +3780,9 @@ void close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* wait for the qgroup rescan worker to stop */
+ btrfs_qgroup_wait_for_completion(fs_info);
+
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complains from lockdep et al., set sem back to initial state */
@@ -3792,7 +3833,7 @@ void close_ctree(struct btrfs_root *root)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
btrfs_free_fs_roots(fs_info);
@@ -4290,25 +4331,6 @@ again:
return 0;
}
-static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_ordered_extent *ordered;
-
- spin_lock(&fs_info->trans_lock);
- while (!list_empty(&cur_trans->pending_ordered)) {
- ordered = list_first_entry(&cur_trans->pending_ordered,
- struct btrfs_ordered_extent,
- trans_list);
- list_del_init(&ordered->trans_list);
- spin_unlock(&fs_info->trans_lock);
-
- btrfs_put_ordered_extent(ordered);
- spin_lock(&fs_info->trans_lock);
- }
- spin_unlock(&fs_info->trans_lock);
-}
-
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
@@ -4320,7 +4342,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
- btrfs_free_pending_ordered(cur_trans, root->fs_info);
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bdfb479ea..adeb31830 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+ struct buffer_head **bh_ret);
int btrfs_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cadacf643..c4661db2b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -124,7 +124,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
return (cache->flags & bits) == bits;
}
-static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
atomic_inc(&cache->count);
}
@@ -331,6 +331,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
kfree(ctl);
}
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ u64 start = block_group->key.objectid;
+ u64 len = block_group->key.offset;
+ u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+ root->nodesize : root->sectorsize;
+ u64 step = chunk << 1;
+
+ while (len > chunk) {
+ btrfs_remove_free_space(block_group, start, chunk);
+ start += step;
+ if (len < step)
+ len = 0;
+ else
+ len -= step;
+ }
+}
+#endif
+
/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
@@ -387,6 +408,7 @@ static noinline void caching_thread(struct btrfs_work *work)
u64 last = 0;
u32 nritems;
int ret = -ENOMEM;
+ bool wakeup = true;
caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
@@ -399,6 +421,15 @@ static noinline void caching_thread(struct btrfs_work *work)
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+#ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * If we're fragmenting we don't want to make anybody think we can
+ * allocate from this block group until we've had a chance to fragment
+ * the free space.
+ */
+ if (btrfs_should_fragment_free_space(extent_root, block_group))
+ wakeup = false;
+#endif
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
@@ -440,7 +471,8 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
@@ -463,7 +495,8 @@ next:
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -490,7 +523,8 @@ next:
if (total_found > (1024 * 1024 * 2)) {
total_found = 0;
- wake_up(&caching_ctl->wait);
+ if (wakeup)
+ wake_up(&caching_ctl->wait);
}
}
path->slots[0]++;
@@ -500,13 +534,27 @@ next:
total_found += add_new_free_space(block_group, fs_info, last,
block_group->key.objectid +
block_group->key.offset);
- caching_ctl->progress = (u64)-1;
-
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+ u64 bytes_used;
+
+ spin_lock(&block_group->space_info->lock);
+ spin_lock(&block_group->lock);
+ bytes_used = block_group->key.offset -
+ btrfs_block_group_used(&block_group->item);
+ block_group->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&block_group->space_info->lock);
+ fragment_free_space(extent_root, block_group);
+ }
+#endif
+
+ caching_ctl->progress = (u64)-1;
err:
btrfs_free_path(path);
up_read(&fs_info->commit_root_sem);
@@ -606,6 +654,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
}
spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (ret == 1 &&
+ btrfs_should_fragment_free_space(fs_info->extent_root,
+ cache)) {
+ u64 bytes_used;
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ bytes_used = cache->key.offset -
+ btrfs_block_group_used(&cache->item);
+ cache->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ fragment_free_space(fs_info->extent_root, cache);
+ }
+#endif
mutex_unlock(&caching_ctl->mutex);
wake_up(&caching_ctl->wait);
@@ -2023,8 +2087,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
BTRFS_ADD_DELAYED_REF, NULL);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, owner, offset,
+ num_bytes, parent, root_objectid,
+ owner, offset, 0,
BTRFS_ADD_DELAYED_REF, NULL);
}
return ret;
@@ -2338,6 +2402,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
node->num_bytes);
}
}
+
+ /* Also free its reserved qgroup space */
+ btrfs_qgroup_free_delayed_ref(root->fs_info,
+ head->qgroup_ref_root,
+ head->qgroup_reserved);
return ret;
}
@@ -3345,6 +3414,15 @@ again:
spin_unlock(&block_group->lock);
/*
+ * We hit an ENOSPC when setting up the cache in this transaction, just
+ * skip doing the setup, we've already cleared the cache so we're safe.
+ */
+ if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+ ret = -ENOSPC;
+ goto out_put;
+ }
+
+ /*
* Try to preallocate enough space based on how big the block group is.
* Keep in mind this has to include any pinned space which could end up
* taking up quite a bit since it's not folded into the other space
@@ -3357,16 +3435,26 @@ again:
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
- ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+ ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
num_pages, num_pages,
&alloc_hint);
+ /*
+ * Our cache requires contiguous chunks so that we don't modify a bunch
+ * of metadata or split extents when writing the cache out, which means
+ * we can enospc if we are heavily fragmented in addition to just normal
+ * out of space conditions. So if we hit this just skip setting up any
+ * other block groups for this transaction, maybe we'll unpin enough
+ * space the next time around.
+ */
if (!ret)
dcs = BTRFS_DC_SETUP;
- btrfs_free_reserved_data_space(inode, num_pages);
+ else if (ret == -ENOSPC)
+ set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+ btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
@@ -3752,6 +3840,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_readonly = 0;
found->bytes_may_use = 0;
found->full = 0;
+ found->max_extent_size = 0;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
@@ -3828,7 +3917,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
u64 num_devices = root->fs_info->fs_devices->rw_devices;
u64 target;
- u64 tmp;
+ u64 raid_type;
+ u64 allowed = 0;
/*
* see if restripe for this chunk_type is in progress, if so
@@ -3846,31 +3936,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
spin_unlock(&root->fs_info->balance_lock);
/* First, mask out the RAID levels which aren't possible */
- if (num_devices == 1)
- flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID5);
- if (num_devices < 3)
- flags &= ~BTRFS_BLOCK_GROUP_RAID6;
- if (num_devices < 4)
- flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
- tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
- flags &= ~tmp;
-
- if (tmp & BTRFS_BLOCK_GROUP_RAID6)
- tmp = BTRFS_BLOCK_GROUP_RAID6;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
- tmp = BTRFS_BLOCK_GROUP_RAID5;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
- tmp = BTRFS_BLOCK_GROUP_RAID10;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
- tmp = BTRFS_BLOCK_GROUP_RAID1;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
- tmp = BTRFS_BLOCK_GROUP_RAID0;
-
- return extended_to_chunk(flags | tmp);
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+ allowed |= btrfs_raid_group[raid_type];
+ }
+ allowed &= flags;
+
+ if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+ allowed = BTRFS_BLOCK_GROUP_RAID6;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+ allowed = BTRFS_BLOCK_GROUP_RAID5;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+ allowed = BTRFS_BLOCK_GROUP_RAID10;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+ allowed = BTRFS_BLOCK_GROUP_RAID1;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+ allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+ flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ return extended_to_chunk(flags | allowed);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@ -3909,11 +3994,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
return ret;
}
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4012,7 +4093,8 @@ commit_trans:
if (IS_ERR(trans))
return PTR_ERR(trans);
if (have_pinned_space >= 0 ||
- trans->transaction->have_free_bgs ||
+ test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+ &trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans, root);
if (ret)
@@ -4034,38 +4116,86 @@ commit_trans:
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
- ret = btrfs_qgroup_reserve(root, write_bytes);
- if (ret)
- goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 1);
-out:
spin_unlock(&data_sinfo->lock);
return ret;
}
/*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with ability for precious data reservation
+ * Will replace old btrfs_check_data_free_space(), but for patch split,
+ * add a new function first and then replace it.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ /* align the range */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
+
+ ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Use new btrfs_qgroup_reserve_data to reserve precious data space
+ *
+ * TODO: Find a good method to avoid reserve data space for NOCOW
+ * range, but don't impact performance on quota disable case.
+ */
+ ret = btrfs_qgroup_reserve_data(inode, start, len);
+ return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will *NOT* use accurate qgroup reserved space API, just for case
+ * which we can't sleep and is sure it won't affect qgroup reserved space.
+ * Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_space_info *data_sinfo;
- /* make sure bytes are sectorsize aligned */
- bytes = ALIGN(bytes, root->sectorsize);
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
- WARN_ON(data_sinfo->bytes_may_use < bytes);
- data_sinfo->bytes_may_use -= bytes;
+ if (WARN_ON(data_sinfo->bytes_may_use < len))
+ data_sinfo->bytes_may_use = 0;
+ else
+ data_sinfo->bytes_may_use -= len;
trace_btrfs_space_reservation(root->fs_info, "space_info",
- data_sinfo->flags, bytes, 0);
+ data_sinfo->flags, len, 0);
spin_unlock(&data_sinfo->lock);
}
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will handle the per-indoe data rsv map for accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_qgroup_free_data(inode, start, len);
+}
+
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
@@ -4897,13 +5027,9 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->csum_root && trans->adding_csums)
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->uuid_root)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ (root == root->fs_info->csum_root && trans->adding_csums) ||
+ (root == root->fs_info->uuid_root))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -5346,7 +5472,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * root->nodesize;
- ret = btrfs_qgroup_reserve(root, num_bytes);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes);
if (ret)
return ret;
} else {
@@ -5364,10 +5490,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
- if (ret) {
- if (*qgroup_reserved)
- btrfs_qgroup_free(root, *qgroup_reserved);
- }
+ if (ret && *qgroup_reserved)
+ btrfs_qgroup_free_meta(root, *qgroup_reserved);
return ret;
}
@@ -5528,15 +5652,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_unlock(&BTRFS_I(inode)->lock);
if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+ ret = btrfs_qgroup_reserve_meta(root,
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
- if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, nr_extents * root->nodesize);
+ btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
@@ -5659,41 +5783,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
}
/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
* @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
*
* This will do the following things
*
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in data space info for num bytes
+ * and reserve precious corresponding qgroup space
+ * (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
* extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ * also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
* o add it to the fs_info's delalloc inodes list.
+ * (Above 3 all done in delalloc_reserve_metadata)
*
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
- if (ret) {
- btrfs_free_reserved_data_space(inode, num_bytes);
+ ret = btrfs_check_data_free_space(inode, start, len);
+ if (ret < 0)
return ret;
- }
-
- return 0;
+ ret = btrfs_delalloc_reserve_metadata(inode, len);
+ if (ret < 0)
+ btrfs_free_reserved_data_space(inode, start, len);
+ return ret;
}
/**
* btrfs_delalloc_release_space - release data and metadata space for delalloc
* @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
*
* This must be matched with a call to btrfs_delalloc_reserve_space. This is
* called in the case that we don't need the metadata AND data reservations
@@ -5702,11 +5833,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
* list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
{
- btrfs_delalloc_release_metadata(inode, num_bytes);
- btrfs_free_reserved_data_space(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, len);
+ btrfs_free_reserved_data_space(inode, start, len);
}
static int update_block_group(struct btrfs_trans_handle *trans,
@@ -5783,19 +5915,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
- /*
- * No longer have used bytes in this block group, queue
- * it for deletion.
- */
- if (old_val == 0) {
- spin_lock(&info->unused_bgs_lock);
- if (list_empty(&cache->bg_list)) {
- btrfs_get_block_group(cache);
- list_add_tail(&cache->bg_list,
- &info->unused_bgs);
- }
- spin_unlock(&info->unused_bgs_lock);
- }
}
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -5807,6 +5926,22 @@ static int update_block_group(struct btrfs_trans_handle *trans,
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
+ /*
+ * No longer have used bytes in this block group, queue it for
+ * deletion. We do this after adding the block group to the
+ * dirty list to avoid races between cleaner kthread and space
+ * cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
+
btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
@@ -6071,6 +6206,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
update_global_block_rsv(fs_info);
}
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+ u64 *empty_cluster)
+{
+ struct btrfs_free_cluster *ret = NULL;
+ bool ssd = btrfs_test_opt(root, SSD);
+
+ *empty_cluster = 0;
+ if (btrfs_mixed_space_info(space_info))
+ return ret;
+
+ if (ssd)
+ *empty_cluster = 2 * 1024 * 1024;
+ if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ ret = &root->fs_info->meta_alloc_cluster;
+ if (!ssd)
+ *empty_cluster = 64 * 1024;
+ } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+ ret = &root->fs_info->data_alloc_cluster;
+ }
+
+ return ret;
+}
+
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
const bool return_free_space)
{
@@ -6078,7 +6241,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_free_cluster *cluster = NULL;
u64 len;
+ u64 total_unpinned = 0;
+ u64 empty_cluster = 0;
bool readonly;
while (start <= end) {
@@ -6087,8 +6253,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
start >= cache->key.objectid + cache->key.offset) {
if (cache)
btrfs_put_block_group(cache);
+ total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
BUG_ON(!cache); /* Logic error */
+
+ cluster = fetch_cluster_info(root,
+ cache->space_info,
+ &empty_cluster);
+ empty_cluster <<= 1;
}
len = cache->key.objectid + cache->key.offset - start;
@@ -6101,12 +6273,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
}
start += len;
+ total_unpinned += len;
space_info = cache->space_info;
+ /*
+ * If this space cluster has been marked as fragmented and we've
+ * unpinned enough in this block group to potentially allow a
+ * cluster to be created inside of it go ahead and clear the
+ * fragmented check.
+ */
+ if (cluster && cluster->fragmented &&
+ total_unpinned > empty_cluster) {
+ spin_lock(&cluster->lock);
+ cluster->fragmented = 0;
+ spin_unlock(&cluster->lock);
+ }
+
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ space_info->max_extent_size = 0;
percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
@@ -6648,8 +6835,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner,
- offset, BTRFS_DROP_DELAYED_REF,
- NULL);
+ offset, 0,
+ BTRFS_DROP_DELAYED_REF, NULL);
}
return ret;
}
@@ -6835,7 +7022,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
- int empty_cluster = 2 * 1024 * 1024;
+ u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
int index = __get_raid_index(flags);
@@ -6845,6 +7032,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool failed_alloc = false;
bool use_cluster = true;
bool have_caching_bg = false;
+ bool orig_have_caching_bg = false;
+ bool full_search = false;
WARN_ON(num_bytes < root->sectorsize);
ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -6860,36 +7049,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
/*
- * If the space info is for both data and metadata it means we have a
- * small filesystem and we can't use the clustering stuff.
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
*/
- if (btrfs_mixed_space_info(space_info))
- use_cluster = false;
-
- if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
- last_ptr = &root->fs_info->meta_alloc_cluster;
- if (!btrfs_test_opt(root, SSD))
- empty_cluster = 64 * 1024;
- }
-
- if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
- btrfs_test_opt(root, SSD)) {
- last_ptr = &root->fs_info->data_alloc_cluster;
+ if (unlikely(space_info->max_extent_size)) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
}
+ last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ hint_byte = last_ptr->window_start;
+ use_cluster = false;
+ }
spin_unlock(&last_ptr->lock);
}
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
-
- if (!last_ptr)
- empty_cluster = 0;
-
if (search_start == hint_byte) {
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
@@ -6924,6 +7124,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
search:
have_caching_bg = false;
+ if (index == 0 || index == __get_raid_index(flags))
+ full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
list) {
@@ -6957,6 +7159,7 @@ search:
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
+ have_caching_bg = true;
ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
@@ -6971,7 +7174,7 @@ have_block_group:
* Ok we want to try and use the cluster allocator, so
* lets look there
*/
- if (last_ptr) {
+ if (last_ptr && use_cluster) {
struct btrfs_block_group_cache *used_block_group;
unsigned long aligned_cluster;
/*
@@ -7097,6 +7300,16 @@ refill_cluster:
}
unclustered_alloc:
+ /*
+ * We are doing an unclustered alloc, set the fragmented flag so
+ * we don't bother trying to setup a cluster again until we get
+ * more space.
+ */
+ if (unlikely(last_ptr)) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->fragmented = 1;
+ spin_unlock(&last_ptr->lock);
+ }
spin_lock(&block_group->free_space_ctl->tree_lock);
if (cached &&
block_group->free_space_ctl->free_space <
@@ -7129,8 +7342,6 @@ unclustered_alloc:
failed_alloc = true;
goto have_block_group;
} else if (!offset) {
- if (!cached)
- have_caching_bg = true;
goto loop;
}
checks:
@@ -7171,6 +7382,10 @@ loop:
}
up_read(&space_info->groups_sem);
+ if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
+ && !orig_have_caching_bg)
+ orig_have_caching_bg = true;
+
if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
goto search;
@@ -7187,7 +7402,20 @@ loop:
*/
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- loop++;
+ if (loop == LOOP_CACHING_NOWAIT) {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we
+ * don't have any unached bgs and we've alrelady done a
+ * full search through.
+ */
+ if (orig_have_caching_bg || !full_search)
+ loop = LOOP_CACHING_WAIT;
+ else
+ loop = LOOP_ALLOC_CHUNK;
+ } else {
+ loop++;
+ }
+
if (loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
int exist = 0;
@@ -7205,6 +7433,15 @@ loop:
ret = do_chunk_alloc(trans, root, flags,
CHUNK_ALLOC_FORCE);
+
+ /*
+ * If we can't allocate a new chunk we've already looped
+ * through at least once, move on to the NO_EMPTY_SIZE
+ * case.
+ */
+ if (ret == -ENOSPC)
+ loop = LOOP_NO_EMPTY_SIZE;
+
/*
* Do not bail out on ENOSPC since we
* can do more things.
@@ -7221,6 +7458,15 @@ loop:
}
if (loop == LOOP_NO_EMPTY_SIZE) {
+ /*
+ * Don't loop again if we already have no empty_size and
+ * no empty_cluster.
+ */
+ if (empty_size == 0 &&
+ empty_cluster == 0) {
+ ret = -ENOSPC;
+ goto out;
+ }
empty_size = 0;
empty_cluster = 0;
}
@@ -7229,11 +7475,20 @@ loop:
} else if (!ins->objectid) {
ret = -ENOSPC;
} else if (ins->objectid) {
+ if (!use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
ret = 0;
}
out:
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
+ spin_lock(&space_info->lock);
+ space_info->max_extent_size = max_extent_size;
+ spin_unlock(&space_info->lock);
ins->offset = max_extent_size;
+ }
return ret;
}
@@ -7282,7 +7537,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
{
- bool final_tried = false;
+ bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
@@ -7512,7 +7767,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
- u64 offset, struct btrfs_key *ins)
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins)
{
int ret;
@@ -7521,7 +7777,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
ins->offset, 0,
root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT, NULL);
+ ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
+ NULL);
return ret;
}
@@ -7851,21 +8108,47 @@ reada:
}
/*
- * TODO: Modify related function to add related node/leaf to dirty_extent_root,
- * for later qgroup accounting.
- *
- * Current, this function does nothing.
+ * These may not be seen by the usual inc/dec ref code so we have to
+ * add them here.
*/
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_qgroup_extent_record *qrecord;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+ if (!qrecord)
+ return -ENOMEM;
+
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+ kfree(qrecord);
+ spin_unlock(&delayed_refs->lock);
+
+ return 0;
+}
+
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
- int i, extent_type;
+ int i, extent_type, ret;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
+ /* We can be called directly from walk_up_proc() */
+ if (!root->fs_info->quota_enabled)
+ return 0;
+
for (i = 0; i < nr; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
@@ -7884,6 +8167,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -7952,8 +8239,6 @@ static int adjust_slots_upwards(struct btrfs_root *root,
/*
* root_eb is the subtree root and is locked before this function is called.
- * TODO: Modify this function to mark all (including complete shared node)
- * to dirty_extent_root to allow it get accounted in qgroup.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -8031,6 +8316,11 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = record_one_subtree_extent(trans, root, child_bytenr,
+ root->nodesize);
+ if (ret)
+ goto out;
}
if (level == 0) {
@@ -8276,10 +8566,11 @@ skip:
ret = account_shared_subtree(trans, root, next,
generation, level - 1);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "Error "
"%d accounting shared subtree. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
@@ -8368,10 +8659,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = account_leaf_items(trans, root, eb);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "error "
"%d accounting leaf items. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
/* make block locked assertion in clean_tree_block happy */
@@ -8693,7 +8985,7 @@ out:
if (!for_reloc && root_dropped == false)
btrfs_add_dead_root(root);
if (err && err != -EAGAIN)
- btrfs_std_error(root->fs_info, err);
+ btrfs_std_error(root->fs_info, err, NULL);
return err;
}
@@ -8881,7 +9173,7 @@ again:
* back off and let this transaction commit
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (trans->transaction->dirty_bg_run) {
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
u64 transid = trans->transid;
mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -9631,6 +9923,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(root, cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ bytes_used += new_bytes_used >> 1;
+ fragment_free_space(root, cache);
+ }
+#endif
/*
* Call to ensure the corresponding space_info object is created and
* assigned to our block group, but don't update its counters just yet.
@@ -9992,6 +10292,47 @@ out:
return ret;
}
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = (struct map_lookup *)em->bdev;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+ num_items, 1);
+}
+
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
@@ -10015,22 +10356,25 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group_cache,
bg_list);
- space_info = block_group->space_info;
list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
if (ret || btrfs_mixed_space_info(space_info)) {
btrfs_put_block_group(block_group);
continue;
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
if (block_group->reserved ||
btrfs_block_group_used(&block_group->item) ||
- block_group->ro) {
+ block_group->ro ||
+ list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
@@ -10055,8 +10399,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
- /* 1 for btrfs_orphan_reserve_metadata() */
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_trans_remove_block_group(fs_info,
+ block_group->key.objectid);
if (IS_ERR(trans)) {
btrfs_dec_block_group_ro(root, block_group);
ret = PTR_ERR(trans);
@@ -10136,17 +10480,21 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* until transaction commit to do the actual discard.
*/
if (trimming) {
- WARN_ON(!list_empty(&block_group->bg_list));
- spin_lock(&trans->transaction->deleted_bgs_lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
list_move(&block_group->bg_list,
&trans->transaction->deleted_bgs);
- spin_unlock(&trans->transaction->deleted_bgs_lock);
+ spin_unlock(&fs_info->unused_bgs_lock);
btrfs_get_block_group(block_group);
}
end_trans:
btrfs_end_transaction(trans, root);
next:
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
@@ -10371,8 +10719,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
/*
- * Make sure counter is updated before we wake up
- * waiters.
+ * Make sure counter is updated before we wake up waiters.
*/
smp_mb();
if (waitqueue_active(&root->subv_writers->wait))
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3915c9473..9abe18763 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
inode = tree->mapping->host;
isize = i_size_read(inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- printk_ratelimited(KERN_DEBUG
- "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
caller, btrfs_ino(inode), isize, start, end);
}
}
@@ -131,6 +131,25 @@ struct extent_page_data {
unsigned int sync_io:1;
};
+static void add_extent_changeset(struct extent_state *state, unsigned bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return;
+ if (set && (state->state & bits) == bits)
+ return;
+ if (!set && (state->state & bits) == 0)
+ return;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ /* ENOMEM */
+ BUG_ON(ret < 0);
+}
+
static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
@@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits);
+ struct extent_state *state, unsigned *bits,
+ struct extent_changeset *changeset);
/*
* insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned *bits)
+ unsigned *bits, struct extent_changeset *changeset)
{
struct rb_node *node;
@@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree,
state->start = start;
state->end = end;
- set_state_bits(tree, state, bits);
+ set_state_bits(tree, state, bits, changeset);
node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
@@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, int wake)
+ unsigned *bits, int wake,
+ struct extent_changeset *changeset)
{
struct extent_state *next;
unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
@@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
+ add_extent_changeset(state, bits_to_clear, changeset, 0);
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
@@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask)
+static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
@@ -594,7 +616,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
@@ -671,7 +693,8 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- state = clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake,
+ changeset);
goto next;
}
goto search_again;
@@ -692,13 +715,13 @@ hit_next:
if (wake)
wake_up(&state->wq);
- clear_state_bit(tree, prealloc, &bits, wake);
+ clear_state_bit(tree, prealloc, &bits, wake, changeset);
prealloc = NULL;
goto out;
}
- state = clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
if (last_end == (u64)-1)
goto out;
@@ -718,7 +741,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -789,7 +812,7 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits)
+ unsigned *bits, struct extent_changeset *changeset)
{
unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
@@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree,
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
+ add_extent_changeset(state, bits_to_set, changeset, 1);
state->state |= bits_to_set;
}
@@ -835,7 +859,7 @@ static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned exclusive_bits,
u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask)
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -850,7 +874,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
bits |= EXTENT_FIRST_DELALLOC;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
prealloc = alloc_extent_state(mask);
BUG_ON(!prealloc);
}
@@ -873,7 +897,7 @@ again:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits);
+ &p, &parent, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -899,7 +923,7 @@ hit_next:
goto out;
}
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -945,7 +969,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -980,7 +1004,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits);
+ NULL, NULL, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -1008,7 +1032,7 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits, changeset);
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
@@ -1028,7 +1052,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return __set_extent_bit(tree, start, end, bits, 0, failed_start,
- cached_state, mask);
+ cached_state, mask, NULL);
}
@@ -1076,7 +1100,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
@@ -1111,7 +1135,7 @@ again:
goto out;
}
err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits);
+ &p, &parent, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1130,9 +1154,9 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1171,9 +1195,10 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0,
+ NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1208,7 +1233,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits);
+ NULL, NULL, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1233,9 +1258,9 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits, NULL);
cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, &clear_bits, 0);
+ clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
prealloc = NULL;
goto out;
}
@@ -1253,7 +1278,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
first_iteration = false;
goto again;
@@ -1274,6 +1299,30 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
NULL, mask);
}
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset)
+{
+ /*
+ * We don't support EXTENT_LOCKED yet, as current changeset will
+ * record any bits changed, so for EXTENT_LOCKED case, it will
+ * either fail with -EEXIST or changeset will record the whole
+ * range.
+ */
+ BUG_ON(bits & EXTENT_LOCKED);
+
+ return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+ changeset);
+}
+
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask)
+{
+ return __clear_extent_bit(tree, start, end, bits, wake, delete,
+ cached, mask, NULL);
+}
+
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask)
{
@@ -1285,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
}
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset)
+{
+ /*
+ * Don't support EXTENT_LOCKED case, same reason as
+ * set_record_extent_bits().
+ */
+ BUG_ON(bits & EXTENT_LOCKED);
+
+ return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+ changeset);
+}
+
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
@@ -1343,7 +1406,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
while (1) {
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS);
+ cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -1365,7 +1428,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS);
+ &failed_start, NULL, GFP_NOFS, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -2078,8 +2141,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
return -EIO;
}
- printk_ratelimited_in_rcu(KERN_INFO
- "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+ btrfs_info_rl_in_rcu(fs_info,
+ "read error corrected: ino %llu off %llu (dev %s sector %llu)",
btrfs_ino(inode), start,
rcu_str_deref(dev->name), sector);
bio_put(bio);
@@ -3070,8 +3133,12 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
+ if (parent_locked)
+ free_extent_state(cached);
+ else
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -4319,7 +4386,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
- if ((mask & __GFP_WAIT) &&
+ if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > 16 * 1024 * 1024) {
u64 len;
while (start <= end) {
@@ -5566,13 +5633,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
- "len %lu dst len %lu\n", src_offset, len, dst->len);
+ btrfs_err(dst->fs_info,
+ "memmove bogus src_offset %lu move "
+ "len %lu dst len %lu", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
- "len %lu dst len %lu\n", dst_offset, len, dst->len);
+ btrfs_err(dst->fs_info,
+ "memmove bogus dst_offset %lu move "
+ "len %lu dst len %lu", dst_offset, len, dst->len);
BUG_ON(1);
}
@@ -5612,13 +5681,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
- "len %lu len %lu\n", src_offset, len, dst->len);
+ btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
+ "len %lu len %lu", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
- "len %lu len %lu\n", dst_offset, len, dst->len);
+ btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
+ "len %lu len %lu", dst_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset < src_offset) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c668f3689..f4c1ae118 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
#define __EXTENTIO__
#include <linux/rbtree.h>
+#include "ulist.h"
/* bits for the extent state */
#define EXTENT_DIRTY (1U << 0)
@@ -18,6 +19,7 @@
#define EXTENT_NEED_WAIT (1U << 13)
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
+#define EXTENT_QGROUP_RESERVED (1U << 16)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -161,6 +163,17 @@ struct extent_buffer {
#endif
};
+/*
+ * Structure to record how many bytes and which ranges are set/cleared
+ */
+struct extent_changeset {
+ /* How many bytes are set/cleared in this operation */
+ u64 bytes_changed;
+
+ /* Changed ranges */
+ struct ulist *range_changed;
+};
+
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
@@ -210,11 +223,17 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask);
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e27ea7ae7..0f09526aa 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1291,7 +1291,8 @@ out:
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
-static int prepare_uptodate_page(struct page *page, u64 pos,
+static int prepare_uptodate_page(struct inode *inode,
+ struct page *page, u64 pos,
bool force_uptodate)
{
int ret = 0;
@@ -1306,6 +1307,10 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
unlock_page(page);
return -EIO;
}
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
}
return 0;
}
@@ -1324,6 +1329,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
int faili;
for (i = 0; i < num_pages; i++) {
+again:
pages[i] = find_or_create_page(inode->i_mapping, index + i,
mask | __GFP_WRITE);
if (!pages[i]) {
@@ -1333,13 +1339,17 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
}
if (i == 0)
- err = prepare_uptodate_page(pages[i], pos,
+ err = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
- if (i == num_pages - 1)
- err = prepare_uptodate_page(pages[i],
+ if (!err && i == num_pages - 1)
+ err = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
if (err) {
page_cache_release(pages[i]);
+ if (err == -EAGAIN) {
+ err = 0;
+ goto again;
+ }
faili = i - 1;
goto fail;
}
@@ -1477,7 +1487,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
- unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
@@ -1493,8 +1502,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!pages)
return -ENOMEM;
- first_index = pos >> PAGE_CACHE_SHIFT;
-
while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
size_t write_bytes = min(iov_iter_count(i),
@@ -1518,12 +1525,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
- if (ret == -ENOSPC &&
- (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC))) {
+
+ if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) {
ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret < 0)
+ break;
if (ret > 0) {
+ /*
+ * For nodata cow case, no need to reserve
+ * data space.
+ */
only_release_metadata = true;
/*
* our prealloc extent may be smaller than
@@ -1532,20 +1544,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
num_pages = DIV_ROUND_UP(write_bytes + offset,
PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = 0;
- } else {
- ret = -ENOSPC;
+ goto reserve_metadata;
}
}
-
- if (ret)
+ ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+ if (ret < 0)
break;
+reserve_metadata:
ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
- reserve_bytes);
+ btrfs_free_reserved_data_space(inode, pos,
+ write_bytes);
else
btrfs_end_write_no_snapshoting(root);
break;
@@ -1611,12 +1622,17 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
- if (only_release_metadata)
+ if (only_release_metadata) {
btrfs_delalloc_release_metadata(inode,
release_bytes);
- else
- btrfs_delalloc_release_space(inode,
+ } else {
+ u64 __pos;
+
+ __pos = round_down(pos, root->sectorsize) +
+ (dirty_pages << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode, __pos,
release_bytes);
+ }
}
release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1668,7 +1684,7 @@ again:
btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
- btrfs_delalloc_release_space(inode, release_bytes);
+ btrfs_delalloc_release_space(inode, pos, release_bytes);
}
}
@@ -2278,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
u64 drop_end;
int ret = 0;
int err = 0;
- int rsv_count;
+ unsigned int rsv_count;
bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
@@ -2500,6 +2516,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
trans->block_rsv = &root->fs_info->trans_block_rsv;
/*
+ * If we are using the NO_HOLES feature we might have had already an
+ * hole that overlaps a part of the region [lockstart, lockend] and
+ * ends at (or beyond) lockend. Since we have no file extent items to
+ * represent holes, drop_end can be less than lockend and so we must
+ * make sure we have an extent map representing the existing hole (the
+ * call to __btrfs_drop_extents() might have dropped the existing extent
+ * map representing the existing hole), otherwise the fast fsync path
+ * will not record the existence of the hole region
+ * [existing_hole_start, lockend].
+ */
+ if (drop_end <= lockend)
+ drop_end = lockend + 1;
+ /*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
@@ -2553,17 +2582,61 @@ out_only_mutex:
return err;
}
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+ struct falloc_range *prev = NULL;
+ struct falloc_range *range = NULL;
+
+ if (list_empty(head))
+ goto insert;
+
+ /*
+ * As fallocate iterate by bytenr order, we only need to check
+ * the last range.
+ */
+ prev = list_entry(head->prev, struct falloc_range, list);
+ if (prev->start + prev->len == start) {
+ prev->len += len;
+ return 0;
+ }
+insert:
+ range = kmalloc(sizeof(*range), GFP_NOFS);
+ if (!range)
+ return -ENOMEM;
+ range->start = start;
+ range->len = len;
+ list_add_tail(&range->list, head);
+ return 0;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+ struct falloc_range *range;
+ struct falloc_range *tmp;
+ struct list_head reserve_list;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
+ u64 actual_end = 0;
struct extent_map *em;
int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
@@ -2579,11 +2652,12 @@ static long btrfs_fallocate(struct file *file, int mode,
return btrfs_punch_hole(inode, offset, len);
/*
- * Make sure we have enough space before we do the
- * allocation.
+ * Only trigger disk allocation, don't trigger qgroup reserve
+ *
+ * For qgroup space, it will be checked later.
*/
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
- if (ret)
+ ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+ if (ret < 0)
return ret;
mutex_lock(&inode->i_mutex);
@@ -2591,6 +2665,13 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
+ /*
+ * TODO: Move these two operations after we have checked
+ * accurate reserved space, or fallocate can still fail but
+ * with page truncated or size expanded.
+ *
+ * But that's a minor problem and won't do much harm BTW.
+ */
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, i_size_read(inode),
alloc_start);
@@ -2649,10 +2730,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
}
+ /* First, check if we exceed the qgroup limit */
+ INIT_LIST_HEAD(&reserve_list);
cur_offset = alloc_start;
while (1) {
- u64 actual_end;
-
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
if (IS_ERR_OR_NULL(em)) {
@@ -2665,57 +2746,82 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
-
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
- last_byte - cur_offset,
- 1 << inode->i_blkbits,
- offset + len,
- &alloc_hint);
- } else if (actual_end > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /*
- * We didn't need to allocate any more space, but we
- * still extended the size of the file so we need to
- * update i_size and the inode item.
- */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- } else {
- inode->i_ctime = CURRENT_TIME;
- i_size_write(inode, actual_end);
- btrfs_ordered_update_i_size(inode, actual_end,
- NULL);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- btrfs_end_transaction(trans, root);
- else
- ret = btrfs_end_transaction(trans,
- root);
+ ret = add_falloc_range(&reserve_list, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
}
+ ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0)
+ break;
}
free_extent_map(em);
- if (ret < 0)
- break;
-
cur_offset = last_byte;
- if (cur_offset >= alloc_end) {
- ret = 0;
+ if (cur_offset >= alloc_end)
break;
+ }
+
+ /*
+ * If ret is still 0, means we're OK to fallocate.
+ * Or just cleanup the list and exit.
+ */
+ list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+ if (!ret)
+ ret = btrfs_prealloc_file_range(inode, mode,
+ range->start,
+ range->len, 1 << inode->i_blkbits,
+ offset + len, &alloc_hint);
+ list_del(&range->list);
+ kfree(range);
+ }
+ if (ret < 0)
+ goto out_unlock;
+
+ if (actual_end > inode->i_size &&
+ !(mode & FALLOC_FL_KEEP_SIZE)) {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We didn't need to allocate any more space, but we
+ * still extended the size of the file so we need to
+ * update i_size and the inode item.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end, NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_end_transaction(trans, root);
}
}
+out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
out:
+ /*
+ * As we waited the extent range, the data_rsv_map must be empty
+ * in the range, as written data range will be released from it.
+ * And for prealloacted extent, it will also be released when
+ * its metadata is written.
+ * So this is completely used as cleanup.
+ */
+ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
mutex_unlock(&inode->i_mutex);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+ btrfs_free_reserved_data_space(inode, alloc_start,
+ alloc_end - alloc_start);
return ret;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index abe3a66bd..cfe99bec4 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
}
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) &
- ~(__GFP_FS | __GFP_HIGHMEM));
+ mapping_gfp_constraint(inode->i_mapping,
+ ~(__GFP_FS | __GFP_HIGHMEM)));
return inode;
}
@@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
gen = io_ctl->cur;
if (le64_to_cpu(*gen) != generation) {
- printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
- "(%Lu) does not match inode (%Lu)\n", *gen,
- generation);
+ btrfs_err_rl(io_ctl->root->fs_info,
+ "space cache generation (%llu) does not match inode (%llu)",
+ *gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
PAGE_CACHE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
if (val != crc) {
- printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
- "space cache\n");
+ btrfs_err_rl(io_ctl->root->fs_info,
+ "csum mismatch on free space cache");
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -891,7 +891,7 @@ out:
spin_unlock(&block_group->lock);
ret = 0;
- btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now",
block_group->key.objectid);
}
@@ -1215,7 +1215,7 @@ out:
* @offset - the offset for the key we'll insert
*
* This function writes out a free space cache struct to disk for quick recovery
- * on mount. This will return 0 if it was successfull in writing the cache out,
+ * on mount. This will return 0 if it was successful in writing the cache out,
* or an errno if it was not.
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
@@ -1730,7 +1730,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
*/
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
- u64 *bytes)
+ u64 *bytes, bool for_alloc)
{
unsigned long found_bits = 0;
unsigned long max_bits = 0;
@@ -1738,11 +1738,26 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
unsigned long next_zero;
unsigned long extent_bits;
+ /*
+ * Skip searching the bitmap if we don't have a contiguous section that
+ * is large enough for this allocation.
+ */
+ if (for_alloc &&
+ bitmap_info->max_extent_size &&
+ bitmap_info->max_extent_size < *bytes) {
+ *bytes = bitmap_info->max_extent_size;
+ return -1;
+ }
+
i = offset_to_bit(bitmap_info->offset, ctl->unit,
max_t(u64, *offset, bitmap_info->offset));
bits = bytes_to_bits(*bytes, ctl->unit);
for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
+ if (for_alloc && bits == 1) {
+ found_bits = 1;
+ break;
+ }
next_zero = find_next_zero_bit(bitmap_info->bitmap,
BITS_PER_BITMAP, i);
extent_bits = next_zero - i;
@@ -1762,6 +1777,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
}
*bytes = (u64)(max_bits) * ctl->unit;
+ bitmap_info->max_extent_size = *bytes;
return -1;
}
@@ -1813,7 +1829,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
if (entry->bitmap) {
u64 size = *bytes;
- ret = search_bitmap(ctl, entry, &tmp, &size);
+ ret = search_bitmap(ctl, entry, &tmp, &size, true);
if (!ret) {
*offset = tmp;
*bytes = size;
@@ -1874,7 +1890,8 @@ again:
search_start = *offset;
search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
- ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
+ ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
+ false);
if (ret < 0 || search_start != *offset)
return -EINVAL;
@@ -1919,7 +1936,7 @@ again:
search_start = *offset;
search_bytes = ctl->unit;
ret = search_bitmap(ctl, bitmap_info, &search_start,
- &search_bytes);
+ &search_bytes, false);
if (ret < 0 || search_start != *offset)
return -EAGAIN;
@@ -1943,6 +1960,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
bitmap_set_bits(ctl, info, offset, bytes_to_set);
+ /*
+ * We set some bytes, we have no idea what the max extent size is
+ * anymore.
+ */
+ info->max_extent_size = 0;
+
return bytes_to_set;
}
@@ -1951,12 +1974,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_block_group_cache *block_group = ctl->private;
+ bool forced = false;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root,
+ block_group))
+ forced = true;
+#endif
/*
* If we are below the extents threshold then we can add this as an
* extent, and don't have to deal with the bitmap
*/
- if (ctl->free_extents < ctl->extents_thresh) {
+ if (!forced && ctl->free_extents < ctl->extents_thresh) {
/*
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
@@ -2661,7 +2691,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(ctl, entry, &search_start, &search_bytes);
+ err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
if (search_bytes > *max_extent_size)
*max_extent_size = search_bytes;
@@ -2775,6 +2805,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
unsigned long want_bits;
unsigned long min_bits;
unsigned long found_bits;
+ unsigned long max_bits = 0;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
@@ -2784,6 +2815,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
want_bits = bytes_to_bits(bytes, ctl->unit);
min_bits = bytes_to_bits(min_bytes, ctl->unit);
+ /*
+ * Don't bother looking for a cluster in this bitmap if it's heavily
+ * fragmented.
+ */
+ if (entry->max_extent_size &&
+ entry->max_extent_size < cont1_bytes)
+ return -ENOSPC;
again:
found_bits = 0;
for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
@@ -2791,13 +2829,19 @@ again:
BITS_PER_BITMAP, i);
if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
+ if (found_bits > max_bits)
+ max_bits = found_bits;
break;
}
+ if (next_zero - i > max_bits)
+ max_bits = next_zero - i;
i = next_zero;
}
- if (!found_bits)
+ if (!found_bits) {
+ entry->max_extent_size = (u64)max_bits * ctl->unit;
return -ENOSPC;
+ }
if (!total_found) {
start = i;
@@ -2928,7 +2972,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct btrfs_free_space *entry;
+ struct btrfs_free_space *entry = NULL;
int ret = -ENOSPC;
u64 bitmap_offset = offset_to_bitmap(ctl, offset);
@@ -2939,8 +2983,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
* The bitmap that covers offset won't be in the list unless offset
* is just its start offset.
*/
- entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
- if (entry->offset != bitmap_offset) {
+ if (!list_empty(bitmaps))
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+
+ if (!entry || entry->offset != bitmap_offset) {
entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
if (entry && list_empty(&entry->list))
list_add(&entry->list, bitmaps);
@@ -3056,6 +3102,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
spin_lock_init(&cluster->refill_lock);
cluster->root = RB_ROOT;
cluster->max_size = 0;
+ cluster->fragmented = false;
INIT_LIST_HEAD(&cluster->block_group_list);
cluster->block_group = NULL;
}
@@ -3223,7 +3270,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
}
bytes = minlen;
- ret2 = search_bitmap(ctl, entry, &start, &bytes);
+ ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
if (ret2 || start >= end) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3376,7 +3423,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
u64 count = 1;
int ret;
- ret = search_bitmap(ctl, entry, &offset, &count);
+ ret = search_bitmap(ctl, entry, &offset, &count, true);
/* Logic error; Should be empty if it can't find anything */
ASSERT(!ret);
@@ -3532,6 +3579,7 @@ again:
spin_lock(&ctl->tree_lock);
info->offset = offset;
info->bytes = bytes;
+ info->max_extent_size = 0;
ret = link_free_space(ctl, info);
spin_unlock(&ctl->tree_lock);
if (ret)
@@ -3559,6 +3607,7 @@ again:
}
bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+
bytes -= bytes_added;
offset += bytes_added;
spin_unlock(&ctl->tree_lock);
@@ -3602,7 +3651,7 @@ have_info:
bit_off = offset;
bit_bytes = ctl->unit;
- ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+ ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
if (!ret) {
if (bit_off == offset) {
ret = 1;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index a16a029ad..f251865eb 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -23,6 +23,7 @@ struct btrfs_free_space {
struct rb_node offset_index;
u64 offset;
u64 bytes;
+ u64 max_extent_size;
unsigned long *bitmap;
struct list_head list;
};
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 265e03c73..be4d22a50 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
name, name_len, &extref)) {
- btrfs_std_error(root->fs_info, -ENOENT);
+ btrfs_std_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
goto out;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d4a582ac3..767a6056a 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -488,17 +488,17 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
- ret = btrfs_delalloc_reserve_space(inode, prealloc);
+ ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_space(inode, prealloc);
+ btrfs_delalloc_release_space(inode, 0, prealloc);
goto out_put;
}
- btrfs_free_reserved_data_space(inode, prealloc);
+ btrfs_free_reserved_data_space(inode, 0, prealloc);
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 396e3d5c4..a70c5790f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
+ /*
+ * Don't forget to free the reserved space, as for inlined extent
+ * it won't count as data extent, free them directly here.
+ * And at reserve time, it's always aligned to page size, so
+ * just free one page here.
+ */
+ btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
@@ -1096,6 +1103,9 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
+ /*
+ * atomic_sub_return implies a barrier for waitqueue_active
+ */
if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
5 * 1024 * 1024 &&
waitqueue_active(&root->fs_info->async_submit_wait))
@@ -1772,7 +1782,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
&& do_list && !(state->state & EXTENT_NORESERVE))
- btrfs_free_reserved_data_space(inode, len);
+ btrfs_free_reserved_data_space_noquota(inode,
+ state->start, len);
__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
root->fs_info->delalloc_batch);
@@ -1867,15 +1878,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
int ret = 0;
int skip_sum;
- int metadata = 0;
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
if (btrfs_is_free_space_inode(inode))
- metadata = 2;
+ metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
if (!(rw & REQ_WRITE)) {
ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
@@ -1995,7 +2006,8 @@ again:
goto again;
}
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
@@ -2121,7 +2133,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
- btrfs_ino(inode), file_pos, &ins);
+ btrfs_ino(inode), file_pos,
+ ram_bytes, &ins);
+ /*
+ * Release the reserved range from inode dirty range map, as it is
+ * already moved into delayed_ref_head
+ */
+ btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
out:
btrfs_free_path(path);
@@ -2605,7 +2623,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
return;
list_for_each_entry_safe(old, tmp, &new->head, list) {
- list_del(&old->list);
kfree(old);
}
kfree(new);
@@ -2830,6 +2847,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+ /*
+ * For mwrite(mmap + memset to write) case, we still reserve
+ * space for NOCOW range.
+ * As NOCOW won't cause a new delayed ref, just free the space
+ */
+ btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+ ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
@@ -3024,8 +3049,6 @@ static int __readpage_endio_check(struct inode *inode,
char *kaddr;
u32 csum_expected;
u32 csum = ~(u32)0;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
csum_expected = *(((u32 *)io_bio->csum) + icsum);
@@ -3038,9 +3061,8 @@ static int __readpage_endio_check(struct inode *inode,
kunmap_atomic(kaddr);
return 0;
zeroit:
- if (__ratelimit(&_rs))
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
- "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu off %llu csum %u expected csum %u",
btrfs_ino(inode), start, csum, csum_expected);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
@@ -4024,9 +4046,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
*/
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- int ret;
/*
* 1 for the possible orphan item
@@ -4035,27 +4055,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the inode ref
* 1 for the inode
*/
- trans = btrfs_start_transaction(root, 5);
- if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
- return trans;
-
- if (PTR_ERR(trans) == -ENOSPC) {
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return trans;
- ret = btrfs_cond_migrate_bytes(root->fs_info,
- &root->fs_info->trans_block_rsv,
- num_bytes, 5);
- if (ret) {
- btrfs_end_transaction(trans, root);
- return ERR_PTR(ret);
- }
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- trans->bytes_reserved = num_bytes;
- }
- return trans;
+ return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4635,14 +4635,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
if ((offset & (blocksize - 1)) == 0 &&
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode,
+ round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
if (ret)
goto out;
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode,
+ round_down(from, PAGE_CACHE_SIZE),
+ PAGE_CACHE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -4710,7 +4713,8 @@ again:
out_unlock:
if (ret)
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start,
+ PAGE_CACHE_SIZE);
unlock_page(page);
page_cache_release(page);
out:
@@ -5108,6 +5112,18 @@ static void evict_inode_truncate_pages(struct inode *inode)
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, 0, &cached_state);
+
+ /*
+ * If still has DELALLOC flag, the extent didn't reach disk,
+ * and its reserved space won't be freed by delayed_ref.
+ * So we need to free its reserved space here.
+ * (Refer to comment in btrfs_invalidatepage, case 2)
+ *
+ * Note, end is the bytenr of last byte, so we need + 1 here.
+ */
+ if (state->state & EXTENT_DELALLOC)
+ btrfs_qgroup_free_data(inode, start, end - start + 1);
+
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@ -6328,9 +6344,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
/*
* 2 for inode item and ref
* 2 for dir items
@@ -7474,6 +7487,28 @@ struct btrfs_dio_data {
u64 reserve;
};
+static void adjust_dio_outstanding_extents(struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ const u64 len)
+{
+ unsigned num_extents;
+
+ num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ /*
+ * If we have an outstanding_extents count still set then we're
+ * within our reservation, otherwise we need to adjust our inode
+ * counter appropriately.
+ */
+ if (dio_data->outstanding_extents) {
+ dio_data->outstanding_extents -= num_extents;
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents += num_extents;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+}
+
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -7509,8 +7544,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
- return -ENOTBLK;
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+ create)) {
+ ret = -ENOTBLK;
+ goto err;
+ }
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
@@ -7628,20 +7666,8 @@ unlock:
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- /*
- * If we have an outstanding_extents count still set then we're
- * within our reservation, otherwise we need to adjust our inode
- * counter appropriately.
- */
- if (dio_data->outstanding_extents) {
- (dio_data->outstanding_extents)--;
- } else {
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
- }
-
- btrfs_free_reserved_data_space(inode, len);
+ adjust_dio_outstanding_extents(inode, dio_data, len);
+ btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
current->journal_info = dio_data;
@@ -7667,8 +7693,17 @@ unlock:
unlock_err:
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+err:
if (dio_data)
current->journal_info = dio_data;
+ /*
+ * Compensate the delalloc release we do in btrfs_direct_IO() when we
+ * write less data then expected, so that we don't underflow our inode's
+ * outstanding extents counter.
+ */
+ if (create && dio_data)
+ adjust_dio_outstanding_extents(inode, dio_data, len);
+
return ret;
}
@@ -8431,7 +8466,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
mutex_unlock(&inode->i_mutex);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(inode, count);
+ ret = btrfs_delalloc_reserve_space(inode, offset, count);
if (ret)
goto out;
dio_data.outstanding_extents = div64_u64(count +
@@ -8460,10 +8495,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
- btrfs_delalloc_release_space(inode,
- dio_data.reserve);
+ btrfs_delalloc_release_space(inode, offset,
+ dio_data.reserve);
} else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
}
out:
@@ -8622,6 +8657,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
}
}
+ /*
+ * Qgroup reserved space handler
+ * Page here will be either
+ * 1) Already written to disk
+ * In this case, its reserved space is released from data rsv map
+ * and will be freed by delayed_ref handler finally.
+ * So even we call qgroup_free_data(), it won't decrease reserved
+ * space.
+ * 2) Not written to disk
+ * This means the reserved space should be freed here.
+ */
+ btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8672,7 +8719,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_end;
sb_start_pagefault(inode->i_sb);
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (!ret) {
ret = file_update_time(vma->vm_file);
reserved = 1;
@@ -8691,8 +8742,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
again:
lock_page(page);
size = i_size_read(inode);
- page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
@@ -8769,7 +8818,7 @@ out_unlock:
}
unlock_page(page);
out:
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
out_noreserve:
sb_end_pagefault(inode->i_sb);
return ret;
@@ -9058,6 +9107,7 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
@@ -9694,6 +9744,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 cur_offset = start;
u64 i_size;
u64 cur_bytes;
+ u64 last_alloc = (u64)-1;
int ret = 0;
bool own_trans = true;
@@ -9710,6 +9761,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
cur_bytes = max(cur_bytes, min_size);
+ /*
+ * If we are severely fragmented we could end up with really
+ * small allocations, so if the allocator is returning small
+ * chunks lets make its job easier by only searching for those
+ * sized chunks.
+ */
+ cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
*alloc_hint, &ins, 1, 0);
if (ret) {
@@ -9718,6 +9776,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
break;
}
+ last_alloc = ins.offset;
ret = insert_reserved_file_extent(trans, inode,
cur_offset, ins.objectid,
ins.offset, ins.offset,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6548a3682..da94138eb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1120,7 +1120,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(inode,
- page_cnt << PAGE_CACHE_SHIFT);
+ start_index << PAGE_CACHE_SHIFT,
+ page_cnt << PAGE_CACHE_SHIFT);
if (ret)
return ret;
i_done = 0;
@@ -1210,7 +1211,8 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode,
- (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+ start_index << PAGE_CACHE_SHIFT,
+ (page_cnt - i_done) << PAGE_CACHE_SHIFT);
}
@@ -1235,7 +1237,9 @@ out:
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode,
+ start_index << PAGE_CACHE_SHIFT,
+ page_cnt << PAGE_CACHE_SHIFT);
return ret;
}
@@ -1342,7 +1346,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
break;
if (btrfs_defrag_cancelled(root->fs_info)) {
- printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+ btrfs_debug(root->fs_info, "defrag_file cancelled");
ret = -EAGAIN;
break;
}
@@ -1579,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = div_u64(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+ btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
rcu_str_deref(device->name), new_size);
if (new_size > old_size) {
@@ -2081,7 +2085,7 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "BTRFS: could not find root %llu\n",
+ btrfs_err(info, "could not find root %llu",
sk->tree_id);
btrfs_free_path(path);
return -ENOENT;
@@ -2221,7 +2225,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
+ btrfs_err(info, "could not find root %llu", tree_id);
ret = -ENOENT;
goto out;
}
@@ -2699,7 +2703,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
- struct btrfs_device *next;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
int ret = 0;
@@ -2711,7 +2714,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
fi_args->num_devices = fs_devices->num_devices;
memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
- list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->devid > fi_args->max_id)
fi_args->max_id = device->devid;
}
@@ -4863,7 +4866,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
/* update qgroup status and info */
err = btrfs_run_qgroups(trans, root->fs_info);
if (err < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"failed to update qgroup status and info\n");
err = btrfs_end_transaction(trans, root);
if (err && !ret)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d7e6baf1b..8077461fc 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -79,6 +79,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_writers) &&
waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
@@ -86,6 +89,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
@@ -229,6 +235,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
@@ -280,6 +289,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_dec(&eb->blocking_writers);
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
if (waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 52170cf17..8c27292ea 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Implicit memory barrier after test_and_set_bit
+ */
if (waitqueue_active(&entry->wait))
wake_up(&entry->wait);
} else {
@@ -409,6 +412,9 @@ have_entry:
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Implicit memory barrier after test_and_set_bit
+ */
if (waitqueue_active(&entry->wait))
wake_up(&entry->wait);
} else {
@@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
spin_lock_irq(&log->log_extents_lock[index]);
while (!list_empty(&log->logged_list[index])) {
+ struct inode *inode;
ordered = list_first_entry(&log->logged_list[index],
struct btrfs_ordered_extent,
log_list);
list_del_init(&ordered->log_list);
+ inode = ordered->inode;
spin_unlock_irq(&log->log_extents_lock[index]);
if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
- struct inode *inode = ordered->inode;
u64 start = ordered->file_offset;
u64 end = ordered->file_offset + ordered->len - 1;
@@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
&ordered->flags));
/*
- * If our ordered extent completed it means it updated the
- * fs/subvol and csum trees already, so no need to make the
- * current transaction's commit wait for it, as we end up
- * holding memory unnecessarily and delaying the inode's iput
- * until the transaction commit (we schedule an iput for the
- * inode when the ordered extent's refcount drops to 0), which
- * prevents it from being evictable until the transaction
- * commits.
+ * In order to keep us from losing our ordered extent
+ * information when committing the transaction we have to make
+ * sure that any logged extents are completed when we go to
+ * commit the transaction. To do this we simply increase the
+ * current transactions pending_ordered counter and decrement it
+ * when the ordered extent completes.
*/
- if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
- btrfs_put_ordered_extent(ordered);
- else
- list_add_tail(&ordered->trans_list, &trans->ordered);
-
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ struct btrfs_ordered_inode_tree *tree;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&tree->lock);
+ }
+ btrfs_put_ordered_extent(ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
+ bool dec_pending_ordered = false;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
@@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
+ dec_pending_ordered = true;
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (dec_pending_ordered) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality, it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check if trans is set, but ASSERT() so
+ * if it isn't set a developer will notice.
+ */
+ spin_lock(&root->fs_info->trans_lock);
+ trans = root->fs_info->running_transaction;
+ if (trans)
+ atomic_inc(&trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 7176cc0fe..23c96059c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -73,6 +73,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
* in the logging code. */
+#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to
+ * complete in the current transaction. */
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dca137b04..f9e60231f 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = {
.extract = prop_compression_extract,
.inheritable = 1
},
- {
- .xattr_name = NULL
- }
};
void __init btrfs_props_init(void)
{
- struct prop_handler *p;
+ int i;
hash_init(prop_handlers_ht);
- for (p = &prop_handlers[0]; p->xattr_name; p++) {
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ struct prop_handler *p = &prop_handlers[i];
u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
hash_add(prop_handlers_ht, &p->node, h);
@@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *parent)
{
- const struct prop_handler *h;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
+ int i;
if (!test_bit(BTRFS_INODE_HAS_PROPS,
&BTRFS_I(parent)->runtime_flags))
return 0;
- for (h = &prop_handlers[0]; h->xattr_name; h++) {
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ const struct prop_handler *h = &prop_handlers[i];
const char *value;
u64 num_bytes;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d904ee1c5..5279fdae7 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -993,9 +993,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
- spin_lock(&fs_info->qgroup_lock);
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ btrfs_qgroup_wait_for_completion(fs_info);
+ spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -1461,6 +1462,8 @@ struct btrfs_qgroup_extent_record
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
+ assert_spin_locked(&delayed_refs->lock);
+
while (*p) {
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
@@ -1652,10 +1655,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
}
}
- /* For exclusive extent, free its reserved bytes too */
- if (nr_old_roots == 0 && nr_new_roots == 1 &&
- cur_new_count == nr_new_roots)
- qg->reserved -= num_bytes;
if (dirty)
qgroup_dirty(fs_info, qg);
}
@@ -2035,7 +2034,7 @@ out:
return ret;
}
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
@@ -2116,14 +2115,13 @@ out:
return ret;
}
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist_node *unode;
struct ulist_iterator uiter;
- u64 ref_root = root->root_key.objectid;
int ret = 0;
if (!is_fstree(ref_root))
@@ -2169,6 +2167,11 @@ out:
spin_unlock(&fs_info->qgroup_lock);
}
+static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+ return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+ num_bytes);
+}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
{
if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
@@ -2188,17 +2191,16 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans,
- struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans)
{
struct btrfs_key found;
+ struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
int slot;
int ret;
- path->leave_spinning = 1;
mutex_lock(&fs_info->qgroup_rescan_lock);
ret = btrfs_search_slot_for_read(fs_info->extent_root,
&fs_info->qgroup_rescan_progress,
@@ -2229,7 +2231,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
- memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+ scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!scratch_leaf) {
+ ret = -ENOMEM;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto out;
+ }
+ extent_buffer_get(scratch_leaf);
+ btrfs_tree_read_lock(scratch_leaf);
+ btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
slot = path->slots[0];
btrfs_release_path(path);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2255,6 +2265,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
goto out;
}
out:
+ if (scratch_leaf) {
+ btrfs_tree_read_unlock_blocking(scratch_leaf);
+ free_extent_buffer(scratch_leaf);
+ }
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
return ret;
@@ -2266,19 +2280,15 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
goto out;
- scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
- if (!scratch_leaf)
- goto out;
err = 0;
- while (!err) {
+ while (!err && !btrfs_fs_closing(fs_info)) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@@ -2287,8 +2297,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
if (!fs_info->quota_enabled) {
err = -EINTR;
} else {
- err = qgroup_rescan_leaf(fs_info, path, trans,
- scratch_leaf);
+ err = qgroup_rescan_leaf(fs_info, path, trans);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2297,11 +2306,11 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
}
out:
- kfree(scratch_leaf);
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
@@ -2330,7 +2339,9 @@ out:
}
btrfs_end_transaction(trans, fs_info->quota_root);
- if (err >= 0) {
+ if (btrfs_fs_closing(fs_info)) {
+ btrfs_info(fs_info, "qgroup scan paused");
+ } else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
} else {
@@ -2378,12 +2389,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+ init_completion(&fs_info->qgroup_rescan_completion);
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- init_completion(&fs_info->qgroup_rescan_completion);
-
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
@@ -2486,3 +2496,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
}
+
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or doing
+ * nothing if the range is already reserved.
+ *
+ * Return 0 for successful reserve
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: this function may sleep for memory allocation.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_changeset changeset;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+ len == 0)
+ return 0;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+ &changeset);
+ trace_btrfs_qgroup_reserve_data(inode, start, len,
+ changeset.bytes_changed,
+ QGROUP_RESERVE);
+ if (ret < 0)
+ goto cleanup;
+ ret = qgroup_reserve(root, changeset.bytes_changed);
+ if (ret < 0)
+ goto cleanup;
+
+ ulist_free(changeset.range_changed);
+ return ret;
+
+cleanup:
+ /* cleanup already reserved ranges */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(changeset.range_changed, &uiter)))
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
+ unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
+ GFP_NOFS);
+ ulist_free(changeset.range_changed);
+ return ret;
+}
+
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+ int free)
+{
+ struct extent_changeset changeset;
+ int trace_op = QGROUP_RELEASE;
+ int ret;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ if (!changeset.range_changed)
+ return -ENOMEM;
+
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+ &changeset);
+ if (ret < 0)
+ goto out;
+
+ if (free) {
+ qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+ trace_op = QGROUP_FREE;
+ }
+ trace_btrfs_qgroup_release_data(inode, start, len,
+ changeset.bytes_changed, trace_op);
+out:
+ ulist_free(changeset.range_changed);
+ return ret;
+}
+
+/*
+ * Free a reserved space range from io_tree and related qgroups
+ *
+ * Should be called when a range of pages get invalidated before reaching disk.
+ * Or for error cleanup case.
+ *
+ * For data written to disk, use btrfs_qgroup_release_data().
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+ return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Release a reserved space range from io_tree only.
+ *
+ * Should be called when a range of pages get written to disk and corresponding
+ * FILE_EXTENT is inserted into corresponding root.
+ *
+ * Since new qgroup accounting framework will only update qgroup numbers at
+ * commit_transaction() time, its reserved space shouldn't be freed from
+ * related qgroups.
+ *
+ * But we should release the range from io_tree, to allow further write to be
+ * COWed.
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+ return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+ int ret;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+ num_bytes == 0)
+ return 0;
+
+ BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ ret = qgroup_reserve(root, num_bytes);
+ if (ret < 0)
+ return ret;
+ atomic_add(num_bytes, &root->qgroup_meta_rsv);
+ return ret;
+}
+
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+ int reserved;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+ return;
+
+ reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
+ if (reserved == 0)
+ return;
+ qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+ return;
+
+ BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
+ atomic_sub(num_bytes, &root->qgroup_meta_rsv);
+ qgroup_free(root, num_bytes);
+}
+
+/*
+ * Check qgroup reserved space leaking, normally at destory inode
+ * time
+ */
+void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+{
+ struct extent_changeset changeset;
+ struct ulist_node *unode;
+ struct ulist_iterator iter;
+ int ret;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ if (WARN_ON(!changeset.range_changed))
+ return;
+
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+
+ WARN_ON(ret < 0);
+ if (WARN_ON(changeset.bytes_changed)) {
+ ULIST_ITER_INIT(&iter);
+ while ((unode = ulist_next(changeset.range_changed, &iter))) {
+ btrfs_warn(BTRFS_I(inode)->root->fs_info,
+ "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
+ inode->i_ino, unode->val, unode->aux);
+ }
+ qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+ }
+ ulist_free(changeset.range_changed);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 6387dcfa3..ecb2c143e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record {
struct ulist *old_roots;
};
+/*
+ * For qgroup event trace points only
+ */
+#define QGROUP_RESERVE (1<<0)
+#define QGROUP_RELEASE (1<<1)
+#define QGROUP_FREE (1<<2)
+
int btrfs_quota_enable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_trans_handle *trans,
@@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes);
+/*
+ * TODO: Add proper trace point for it, as btrfs_qgroup_free() is
+ * called by everywhere, can't provide good trace for delayed ref case.
+ */
+static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes)
+{
+ btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+ trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
+}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl);
#endif
+/* New io_tree based accurate qgroup reserve API */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_check_reserved_leak(struct inode *inode);
#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fcf7265ca..1a33d3eb3 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -810,7 +810,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
goto done_nolock;
- } else if (waitqueue_active(&h->wait)) {
+ /*
+ * The barrier for this waitqueue_active is not needed,
+ * we're protected by h->lock and can't miss a wakeup.
+ */
+ } else if (waitqueue_active(&h->wait)) {
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
wake_up(&h->wait);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 4645cd16d..619f92963 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
rec = kzalloc(sizeof(*rec), GFP_NOFS);
if (!rec) {
reada_extent_put(root->fs_info, re);
- return -1;
+ return -ENOMEM;
}
rec->rc = rc;
@@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
u64 start;
u64 generation;
int level;
+ int ret;
struct extent_buffer *node;
static struct btrfs_key max_key = {
.objectid = (u64)-1,
@@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
generation = btrfs_header_generation(node);
free_extent_buffer(node);
- if (reada_add_block(rc, start, &max_key, level, generation)) {
+ ret = reada_add_block(rc, start, &max_key, level, generation);
+ if (ret) {
kfree(rc);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
reada_start_machine(root->fs_info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ab507e3d5..b4ca5454e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2416,7 +2416,7 @@ again:
}
out:
if (ret) {
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
@@ -3032,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
BUG_ON(cluster->start != cluster->boundary[0]);
mutex_lock(&inode->i_mutex);
- ret = btrfs_check_data_free_space(inode, cluster->end +
- 1 - cluster->start, 0);
+ ret = btrfs_check_data_free_space(inode, cluster->start,
+ cluster->end + 1 - cluster->start);
if (ret)
goto out;
@@ -3054,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
break;
nr++;
}
- btrfs_free_reserved_data_space(inode, cluster->end +
- 1 - cluster->start);
+ btrfs_free_reserved_data_space(inode, cluster->start,
+ cluster->end + 1 - cluster->start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 360a728a6..7cf8509de 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
if (!need_reset && btrfs_root_generation(item)
!= btrfs_root_generation_v2(item)) {
if (btrfs_root_generation_v2(item) != 0) {
- printk(KERN_WARNING "BTRFS: mismatching "
+ btrfs_warn(eb->fs_info,
+ "mismatching "
"generation and generation_v2 "
"found in root item. This root "
"was probably mounted with an "
"older kernel. Resetting all "
- "new fields.\n");
+ "new fields.");
}
need_reset = 1;
}
@@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
int ret;
int slot;
unsigned long ptr;
- int old_len;
+ u32 old_len;
path = btrfs_alloc_path();
if (!path)
@@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- btrfs_error(tree_root->fs_info, err,
+ btrfs_std_error(tree_root->fs_info, err,
"Failed to start trans to delete "
"orphan item");
break;
@@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid);
btrfs_end_transaction(trans, tree_root);
if (err) {
- btrfs_error(tree_root->fs_info, err,
+ btrfs_std_error(tree_root->fs_info, err,
"Failed to delete root orphan "
"item");
break;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a39f5d114..b091d94ce 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -248,14 +248,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror);
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size);
+ struct scrub_block *sblock,
+ int retry_failed_mirror);
+static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
@@ -580,9 +575,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu, "
- "length %llu, links %u (path: %s)\n", swarn->errstr,
+ "length %llu, links %u (path: %s)", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset,
min(isize - offset, (u64)PAGE_SIZE), nlink,
@@ -592,9 +587,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
return 0;
err:
- printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
- "resolving failed with ret=%d\n", swarn->errstr,
+ "resolving failed with ret=%d", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset, ret);
@@ -649,10 +644,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level);
- printk_in_rcu(KERN_WARNING
- "BTRFS: %s at logical %llu on dev %s, "
+ btrfs_warn_in_rcu(fs_info,
+ "%s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
- "%llu\n", errstr, swarn.logical,
+ "%llu", errstr, swarn.logical,
rcu_str_deref(dev->name),
(unsigned long long)swarn.sector,
ref_level ? "node" : "leaf",
@@ -850,8 +845,8 @@ out:
btrfs_dev_replace_stats_inc(
&sctx->dev_root->fs_info->dev_replace.
num_uncorrectable_read_errors);
- printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
- "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "unable to fixup (nodatasum) error at logical %llu on dev %s",
fixup->logical, rcu_str_deref(fixup->dev->name));
}
@@ -889,11 +884,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
struct btrfs_fs_info *fs_info;
u64 length;
u64 logical;
- u64 generation;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
- u8 *csum;
struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
struct scrub_block *sblock_bad;
int ret;
@@ -918,13 +911,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
}
length = sblock_to_check->page_count * PAGE_SIZE;
logical = sblock_to_check->pagev[0]->logical;
- generation = sblock_to_check->pagev[0]->generation;
BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
is_metadata = !(sblock_to_check->pagev[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->pagev[0]->have_csum;
- csum = sblock_to_check->pagev[0]->csum;
dev = sblock_to_check->pagev[0]->dev;
if (sctx->is_dev_replace && !is_metadata && !have_csum) {
@@ -987,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_bad = sblocks_for_recheck + failed_mirror_index;
/* build and submit the bios for the failed mirror, check checksums */
- scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
- csum, generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
@@ -1101,9 +1091,7 @@ nodatasum_case:
sblock_other = sblocks_for_recheck + mirror_index;
/* build and submit the bios, check checksums */
- scrub_recheck_block(fs_info, sblock_other, is_metadata,
- have_csum, csum, generation,
- sctx->csum_size, 0);
+ scrub_recheck_block(fs_info, sblock_other, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
@@ -1215,9 +1203,7 @@ nodatasum_case:
* is verified, but most likely the data comes out
* of the page cache.
*/
- scrub_recheck_block(fs_info, sblock_bad,
- is_metadata, have_csum, csum,
- generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
@@ -1230,8 +1216,8 @@ corrected_error:
sctx->stat.corrected_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: fixed up error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(fs_info,
+ "fixed up error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
} else {
@@ -1239,8 +1225,8 @@ did_not_correct_error:
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(fs_info,
+ "unable to fixup (regular) error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
@@ -1318,6 +1304,9 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
u64 length = original_sblock->page_count * PAGE_SIZE;
u64 logical = original_sblock->pagev[0]->logical;
+ u64 generation = original_sblock->pagev[0]->generation;
+ u64 flags = original_sblock->pagev[0]->flags;
+ u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
u64 sublen;
@@ -1372,6 +1361,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
+
page = kzalloc(sizeof(*page), GFP_NOFS);
if (!page) {
leave_nomem:
@@ -1383,7 +1373,15 @@ leave_nomem:
}
scrub_page_get(page);
sblock->pagev[page_index] = page;
+ page->sblock = sblock;
+ page->flags = flags;
+ page->generation = generation;
page->logical = logical;
+ page->have_csum = have_csum;
+ if (have_csum)
+ memcpy(page->csum,
+ original_sblock->pagev[0]->csum,
+ sctx->csum_size);
scrub_stripe_index_and_offset(logical,
bbio->map_type,
@@ -1474,15 +1472,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
* the pages that are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror)
+ struct scrub_block *sblock,
+ int retry_failed_mirror)
{
int page_num;
sblock->no_io_error_seen = 1;
- sblock->header_error = 0;
- sblock->checksum_error = 0;
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
@@ -1518,9 +1513,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
if (sblock->no_io_error_seen)
- scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
- have_csum, csum, generation,
- csum_size);
+ scrub_recheck_block_checksum(sblock);
return;
}
@@ -1535,61 +1528,16 @@ static inline int scrub_check_fsid(u8 fsid[],
return !ret;
}
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size)
+static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
- int page_num;
- u8 calculated_csum[BTRFS_CSUM_SIZE];
- u32 crc = ~(u32)0;
- void *mapped_buffer;
-
- WARN_ON(!sblock->pagev[0]->page);
- if (is_metadata) {
- struct btrfs_header *h;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- h = (struct btrfs_header *)mapped_buffer;
-
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
- !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
- memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
- BTRFS_UUID_SIZE)) {
- sblock->header_error = 1;
- } else if (generation != btrfs_stack_header_generation(h)) {
- sblock->header_error = 1;
- sblock->generation_error = 1;
- }
- csum = h->csum;
- } else {
- if (!have_csum)
- return;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- }
-
- for (page_num = 0;;) {
- if (page_num == 0 && is_metadata)
- crc = btrfs_csum_data(
- ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
- crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
- else
- crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
-
- kunmap_atomic(mapped_buffer);
- page_num++;
- if (page_num >= sblock->page_count)
- break;
- WARN_ON(!sblock->pagev[page_num]->page);
-
- mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
- }
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
+ sblock->generation_error = 0;
- btrfs_csum_final(crc, calculated_csum);
- if (memcmp(calculated_csum, csum, csum_size))
- sblock->checksum_error = 1;
+ if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ scrub_checksum_data(sblock);
+ else
+ scrub_checksum_tree_block(sblock);
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
@@ -1626,9 +1574,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
int ret;
if (!page_bad->dev->bdev) {
- printk_ratelimited(KERN_WARNING "BTRFS: "
+ btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) "
- "is unexpected!\n");
+ "is unexpected");
return -EIO;
}
@@ -1833,6 +1781,18 @@ static int scrub_checksum(struct scrub_block *sblock)
u64 flags;
int ret;
+ /*
+ * No need to initialize these stats currently,
+ * because this function only use return value
+ * instead of these stats value.
+ *
+ * Todo:
+ * always use stats
+ */
+ sblock->header_error = 0;
+ sblock->generation_error = 0;
+ sblock->checksum_error = 0;
+
WARN_ON(sblock->page_count < 1);
flags = sblock->pagev[0]->flags;
ret = 0;
@@ -1858,7 +1818,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct page *page;
void *buffer;
u32 crc = ~(u32)0;
- int fail = 0;
u64 len;
int index;
@@ -1889,9 +1848,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
btrfs_csum_final(crc, csum);
if (memcmp(csum, on_disk_csum, sctx->csum_size))
- fail = 1;
+ sblock->checksum_error = 1;
- return fail;
+ return sblock->checksum_error;
}
static int scrub_checksum_tree_block(struct scrub_block *sblock)
@@ -1907,8 +1866,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
- int fail = 0;
- int crc_fail = 0;
u64 len;
int index;
@@ -1923,19 +1880,20 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
-
if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
- ++fail;
+ sblock->header_error = 1;
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
- ++fail;
+ if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ }
if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
- ++fail;
+ sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
- ++fail;
+ sblock->header_error = 1;
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1960,9 +1918,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
btrfs_csum_final(crc, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
- ++crc_fail;
+ sblock->checksum_error = 1;
- return fail || crc_fail;
+ return sblock->header_error || sblock->checksum_error;
}
static int scrub_checksum_super(struct scrub_block *sblock)
@@ -2176,40 +2134,28 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
- unsigned int is_metadata;
- unsigned int have_csum;
- u8 *csum;
- u64 generation;
u64 logical;
struct btrfs_device *dev;
- is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock->pagev[0]->have_csum;
- csum = sblock->pagev[0]->csum;
- generation = sblock->pagev[0]->generation;
logical = sblock->pagev[0]->logical;
dev = sblock->pagev[0]->dev;
- if (sblock->no_io_error_seen) {
- scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
- have_csum, csum, generation,
- sctx->csum_size);
- }
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(sblock);
if (!sblock->no_io_error_seen) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: I/O error rebulding logical %llu for dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "IO error rebuilding logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else if (sblock->header_error || sblock->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: failed to rebuild valid logical %llu for dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "failed to rebuild valid logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else {
scrub_write_block_to_dev_replace(sblock);
@@ -2500,8 +2446,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
}
-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
- u8 *csum)
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
unsigned long index;
@@ -2565,7 +2510,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
++sctx->stat.no_csum;
if (sctx->is_dev_replace && !have_csum) {
@@ -2703,7 +2648,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
goto skip;
}
@@ -3012,6 +2957,9 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
logic_start + map->stripe_len)) {
btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
key.objectid, logic_start);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
again:
@@ -3361,6 +3309,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
"scrub: tree block %llu spanning "
"stripes, ignored. logical=%llu",
key.objectid, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
@@ -3481,7 +3432,9 @@ out:
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
- u64 dev_offset, int is_dev_replace)
+ u64 dev_offset,
+ struct btrfs_block_group_cache *cache,
+ int is_dev_replace)
{
struct btrfs_mapping_tree *map_tree =
&sctx->dev_root->fs_info->mapping_tree;
@@ -3494,8 +3447,18 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
read_unlock(&map_tree->map_tree.lock);
- if (!em)
- return -EINVAL;
+ if (!em) {
+ /*
+ * Might have been an unused block group deleted by the cleaner
+ * kthread or relocation.
+ */
+ spin_lock(&cache->lock);
+ if (!cache->removed)
+ ret = -EINVAL;
+ spin_unlock(&cache->lock);
+
+ return ret;
+ }
map = (struct map_lookup *)em->bdev;
if (em->start != chunk_offset)
@@ -3532,6 +3495,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
u64 length;
u64 chunk_offset;
int ret = 0;
+ int ro_set;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
@@ -3617,7 +3581,21 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_on(fs_info);
ret = btrfs_inc_block_group_ro(root, cache);
scrub_pause_off(fs_info);
- if (ret) {
+
+ if (ret == 0) {
+ ro_set = 1;
+ } else if (ret == -ENOSPC) {
+ /*
+ * btrfs_inc_block_group_ro return -ENOSPC when it
+ * failed in creating new chunk for metadata.
+ * It is not a problem for scrub/replace, because
+ * metadata are always cowed, and our scrub paused
+ * commit_transactions.
+ */
+ ro_set = 0;
+ } else {
+ btrfs_warn(fs_info, "failed setting block group ro, ret=%d\n",
+ ret);
btrfs_put_block_group(cache);
break;
}
@@ -3626,7 +3604,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, is_dev_replace);
+ found_key.offset, cache, is_dev_replace);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3660,7 +3638,30 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
- btrfs_dec_block_group_ro(root, cache);
+ if (ro_set)
+ btrfs_dec_block_group_ro(root, cache);
+
+ /*
+ * We might have prevented the cleaner kthread from deleting
+ * this block group if it was already unused because we raced
+ * and set it to RO mode first. So add it back to the unused
+ * list, otherwise it might not ever be deleted unless a manual
+ * balance is triggered or it becomes used and unused again.
+ */
+ spin_lock(&cache->lock);
+ if (!cache->removed && !cache->ro && cache->reserved == 0 &&
+ btrfs_block_group_used(&cache->item) == 0) {
+ spin_unlock(&cache->lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &fs_info->unused_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ } else {
+ spin_unlock(&cache->lock);
+ }
btrfs_put_block_group(cache);
if (ret)
@@ -4375,8 +4376,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
if (!dev)
return -EIO;
if (!dev->bdev) {
- printk_ratelimited(KERN_WARNING
- "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+ btrfs_warn_rl(dev->dev_root->fs_info,
+ "scrub write_page_nocow(bdev == NULL) is unexpected");
return -EIO;
}
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 23bb2e4b9..355a458cb 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
}
if (cur_clone_root) {
- if (compressed != BTRFS_COMPRESS_NONE) {
- /*
- * Offsets given by iterate_extent_inodes() are relative
- * to the start of the extent, we need to add logical
- * offset from the file extent item.
- * (See why at backref.c:check_extent_in_eb())
- */
- cur_clone_root->offset += btrfs_file_extent_offset(eb,
- fi);
- }
*found = cur_clone_root;
ret = 0;
} else {
@@ -2570,7 +2560,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
} else if (S_ISSOCK(mode)) {
cmd = BTRFS_SEND_C_MKSOCK;
} else {
- printk(KERN_WARNING "btrfs: unexpected inode type %o",
+ btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
(int)(mode & S_IFMT));
ret = -ENOTSUPP;
goto out;
@@ -4693,6 +4683,171 @@ tlv_put_failure:
return ret;
}
+static int send_extent_data(struct send_ctx *sctx,
+ const u64 offset,
+ const u64 len)
+{
+ u64 sent = 0;
+
+ if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+ return send_update_extent(sctx, offset, len);
+
+ while (sent < len) {
+ u64 size = len - sent;
+ int ret;
+
+ if (size > BTRFS_SEND_READ_SIZE)
+ size = BTRFS_SEND_READ_SIZE;
+ ret = send_write(sctx, offset + sent, size);
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ break;
+ sent += ret;
+ }
+ return 0;
+}
+
+static int clone_range(struct send_ctx *sctx,
+ struct clone_root *clone_root,
+ const u64 disk_byte,
+ u64 data_offset,
+ u64 offset,
+ u64 len)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * We can't send a clone operation for the entire range if we find
+ * extent items in the respective range in the source file that
+ * refer to different extents or if we find holes.
+ * So check for that and do a mix of clone and regular write/copy
+ * operations if needed.
+ *
+ * Example:
+ *
+ * mkfs.btrfs -f /dev/sda
+ * mount /dev/sda /mnt
+ * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
+ * cp --reflink=always /mnt/foo /mnt/bar
+ * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
+ * btrfs subvolume snapshot -r /mnt /mnt/snap
+ *
+ * If when we send the snapshot and we are processing file bar (which
+ * has a higher inode number than foo) we blindly send a clone operation
+ * for the [0, 100K[ range from foo to bar, the receiver ends up getting
+ * a file bar that matches the content of file foo - iow, doesn't match
+ * the content from bar in the original filesystem.
+ */
+ key.objectid = clone_root->ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = clone_root->offset;
+ ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == clone_root->ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+ u64 ext_len;
+ u64 clone_len;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(clone_root->root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /*
+ * We might have an implicit trailing hole (NO_HOLES feature
+ * enabled). We deal with it after leaving this loop.
+ */
+ if (key.objectid != clone_root->ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, ei);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
+ ext_len = PAGE_CACHE_ALIGN(ext_len);
+ } else {
+ ext_len = btrfs_file_extent_num_bytes(leaf, ei);
+ }
+
+ if (key.offset + ext_len <= clone_root->offset)
+ goto next;
+
+ if (key.offset > clone_root->offset) {
+ /* Implicit hole, NO_HOLES feature enabled. */
+ u64 hole_len = key.offset - clone_root->offset;
+
+ if (hole_len > len)
+ hole_len = len;
+ ret = send_extent_data(sctx, offset, hole_len);
+ if (ret < 0)
+ goto out;
+
+ len -= hole_len;
+ if (len == 0)
+ break;
+ offset += hole_len;
+ clone_root->offset += hole_len;
+ data_offset += hole_len;
+ }
+
+ if (key.offset >= clone_root->offset + len)
+ break;
+
+ clone_len = min_t(u64, ext_len, len);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
+ btrfs_file_extent_offset(leaf, ei) == data_offset)
+ ret = send_clone(sctx, offset, clone_len, clone_root);
+ else
+ ret = send_extent_data(sctx, offset, clone_len);
+
+ if (ret < 0)
+ goto out;
+
+ len -= clone_len;
+ if (len == 0)
+ break;
+ offset += clone_len;
+ clone_root->offset += clone_len;
+ data_offset += clone_len;
+next:
+ path->slots[0]++;
+ }
+
+ if (len > 0)
+ ret = send_extent_data(sctx, offset, len);
+ else
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_path *path,
struct btrfs_key *key,
@@ -4701,9 +4856,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
int ret = 0;
struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 pos = 0;
u64 len;
- u32 l;
u8 type;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
@@ -4731,22 +4884,15 @@ static int send_write_or_clone(struct send_ctx *sctx,
}
if (clone_root && IS_ALIGNED(offset + len, bs)) {
- ret = send_clone(sctx, offset, len, clone_root);
- } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
- ret = send_update_extent(sctx, offset, len);
+ u64 disk_byte;
+ u64 data_offset;
+
+ disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+ data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+ ret = clone_range(sctx, clone_root, disk_byte, data_offset,
+ offset, len);
} else {
- while (pos < len) {
- l = len - pos;
- if (l > BTRFS_SEND_READ_SIZE)
- l = BTRFS_SEND_READ_SIZE;
- ret = send_write(sctx, pos + offset, l);
- if (ret < 0)
- goto out;
- if (!ret)
- break;
- pos += ret;
- }
- ret = 0;
+ ret = send_extent_data(sctx, offset, len);
}
out:
return ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 11d1eab92..24154e422 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
}
}
-#ifdef CONFIG_PRINTK
/*
* __btrfs_std_error decodes expected errors from the caller and
* invokes the approciate error response.
@@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
+#ifdef CONFIG_PRINTK
const char *errstr;
+#endif
/*
* Special case: if the error is EROFS, and we're already
@@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
return;
+#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
if (fmt) {
struct va_format vaf;
@@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
sb->s_id, function, line, errno, errstr);
}
+#endif
/* Don't go through full error handling during mount */
save_error_info(fs_info);
@@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
btrfs_handle_error(fs_info);
}
+#ifdef CONFIG_PRINTK
static const char * const logtypes[] = {
"emergency",
"alert",
@@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
va_end(args);
}
-
-#else
-
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
-{
- struct super_block *sb = fs_info->sb;
-
- /*
- * Special case: if the error is EROFS, and we're already
- * under MS_RDONLY, then it is safe here.
- */
- if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
- return;
-
- /* Don't go through full error handling during mount */
- if (sb->s_flags & MS_BORN) {
- save_error_info(fs_info);
- btrfs_handle_error(fs_info);
- }
-}
#endif
/*
@@ -320,6 +303,9 @@ enum {
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
Opt_datasum, Opt_treelog, Opt_noinode_cache,
+#ifdef CONFIG_BTRFS_DEBUG
+ Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+#endif
Opt_err,
};
@@ -372,6 +358,11 @@ static match_table_t tokens = {
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_fatal_errors, "fatal_errors=%s"},
{Opt_commit_interval, "commit=%d"},
+#ifdef CONFIG_BTRFS_DEBUG
+ {Opt_fragment_data, "fragment=data"},
+ {Opt_fragment_metadata, "fragment=metadata"},
+ {Opt_fragment_all, "fragment=all"},
+#endif
{Opt_err, NULL},
};
@@ -738,6 +729,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
}
break;
+#ifdef CONFIG_BTRFS_DEBUG
+ case Opt_fragment_all:
+ btrfs_info(root->fs_info, "fragmenting all space");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_metadata:
+ btrfs_info(root->fs_info, "fragmenting metadata");
+ btrfs_set_opt(info->mount_opt,
+ FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_data:
+ btrfs_info(root->fs_info, "fragmenting data");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ break;
+#endif
case Opt_err:
btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
@@ -1189,6 +1196,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_test_opt(root, FRAGMENT_DATA))
+ seq_puts(seq, ",fragment=data");
+ if (btrfs_test_opt(root, FRAGMENT_METADATA))
+ seq_puts(seq, ",fragment=metadata");
+#endif
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
seq_puts(seq, ",subvol=");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 603b0cc2b..e0ac85949 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = {
NULL,
};
-static void btrfs_release_super_kobj(struct kobject *kobj)
+static void btrfs_release_fsid_kobj(struct kobject *kobj)
{
struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
- memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+ memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject));
complete(&fs_devs->kobj_unregister);
}
static struct kobj_type btrfs_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = btrfs_release_super_kobj,
+ .release = btrfs_release_fsid_kobj,
};
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
{
if (kobj->ktype != &btrfs_ktype)
return NULL;
- return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+ return container_of(kobj, struct btrfs_fs_devices, fsid_kobj);
}
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
@@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
attrs[0] = &fa->kobj_attr.attr;
if (add) {
int ret;
- ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
+ ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj,
&agroup);
if (ret)
return ret;
} else
- sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
+ sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj,
&agroup);
}
@@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
fs_devs->device_dir_kobj = NULL;
}
- if (fs_devs->super_kobj.state_initialized) {
- kobject_del(&fs_devs->super_kobj);
- kobject_put(&fs_devs->super_kobj);
+ if (fs_devs->fsid_kobj.state_initialized) {
+ kobject_del(&fs_devs->fsid_kobj);
+ kobject_put(&fs_devs->fsid_kobj);
wait_for_completion(&fs_devs->kobj_unregister);
}
}
@@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
btrfs_reset_fs_info_ptr(fs_info);
@@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
kobject_put(fs_info->space_info_kobj);
}
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
- sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
- btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
+ sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
}
const char * const btrfs_feature_set_names[3] = {
@@ -637,7 +637,7 @@ static void init_feature_attrs(void)
/* when one_device is NULL, it removes all device links */
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
@@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
{
if (!fs_devs->device_dir_kobj)
fs_devs->device_dir_kobj = kobject_create_and_add("devices",
- &fs_devs->super_kobj);
+ &fs_devs->fsid_kobj);
if (!fs_devs->device_dir_kobj)
return -ENOMEM;
@@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
return 0;
}
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
int error = 0;
@@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
int error;
init_completion(&fs_devs->kobj_unregister);
- fs_devs->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->super_kobj,
+ fs_devs->fsid_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_devs->fsid_kobj,
&btrfs_ktype, parent, "%pU", fs_devs->fsid);
return error;
}
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
{
int error;
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
- struct kobject *super_kobj = &fs_devs->super_kobj;
+ struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
btrfs_set_fs_info_ptr(fs_info);
- error = btrfs_kobj_add_device(fs_devs, NULL);
+ error = btrfs_sysfs_add_device_link(fs_devs, NULL);
if (error)
return error;
- error = sysfs_create_files(super_kobj, btrfs_attrs);
+ error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_kobj_rm_device(fs_devs, NULL);
+ btrfs_sysfs_rm_device_link(fs_devs, NULL);
return error;
}
- error = sysfs_create_group(super_kobj,
+ error = sysfs_create_group(fsid_kobj,
&btrfs_feature_attr_group);
if (error)
goto failure;
@@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
- super_kobj);
+ fsid_kobj);
if (!fs_info->space_info_kobj) {
error = -ENOMEM;
goto failure;
@@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
return 0;
failure:
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
return error;
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 6392527bc..9c0952212 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
struct kobject *parent);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 2299bfde3..8b72b005b 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -19,6 +19,7 @@
#include <linux/slab.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../disk-io.h"
#include "../free-space-cache.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
@@ -35,6 +36,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
kfree(cache);
return NULL;
}
+ cache->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!cache->fs_info) {
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+ return NULL;
+ }
cache->key.objectid = 0;
cache->key.offset = 1024 * 1024 * 1024;
@@ -879,7 +886,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
int btrfs_test_free_space_cache(void)
{
struct btrfs_block_group_cache *cache;
- int ret;
+ struct btrfs_root *root = NULL;
+ int ret = -ENOMEM;
test_msg("Running btrfs free space cache tests\n");
@@ -889,6 +897,19 @@ int btrfs_test_free_space_cache(void)
return 0;
}
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info)
+ goto out;
+
+ root->fs_info->extent_root = root;
+ cache->fs_info = root->fs_info;
+
ret = test_extents(cache);
if (ret)
goto out;
@@ -904,6 +925,7 @@ out:
__btrfs_remove_free_space_cache(cache->free_space_ctl);
kfree(cache->free_space_ctl);
kfree(cache);
+ btrfs_free_dummy_root(root);
test_msg("Free space cache tests finished\n");
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a5b06442f..be8eae80f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
static void clear_btree_io_tree(struct extent_io_tree *tree)
{
spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, the state
+ * of the waitqueue should not change once clear_btree_io_tree is
+ * called.
+ */
+ smp_mb();
while (!RB_EMPTY_ROOT(&tree->state)) {
struct rb_node *node;
struct extent_state *state;
@@ -226,25 +232,22 @@ loop:
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
+ init_waitqueue_head(&cur_trans->pending_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
- cur_trans->have_free_bgs = 0;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ cur_trans->flags = 0;
cur_trans->start_time = get_seconds();
- cur_trans->dirty_bg_run = 0;
+
+ memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
cur_trans->delayed_refs.href_root = RB_ROOT;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
- cur_trans->delayed_refs.num_heads_ready = 0;
- cur_trans->delayed_refs.pending_csums = 0;
- cur_trans->delayed_refs.num_heads = 0;
- cur_trans->delayed_refs.flushing = 0;
- cur_trans->delayed_refs.run_delayed_start = 0;
- cur_trans->delayed_refs.qgroup_to_skip = 0;
/*
* although the tree mod log is per file system and not per transaction,
@@ -264,7 +267,6 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
- INIT_LIST_HEAD(&cur_trans->pending_ordered);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
INIT_LIST_HEAD(&cur_trans->io_bgs);
INIT_LIST_HEAD(&cur_trans->dropped_roots);
@@ -272,7 +274,6 @@ loop:
cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
- spin_lock_init(&cur_trans->deleted_bgs_lock);
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
@@ -447,8 +448,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
}
static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
- enum btrfs_reserve_flush_enum flush)
+start_transaction(struct btrfs_root *root, unsigned int num_items,
+ unsigned int type, enum btrfs_reserve_flush_enum flush)
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
@@ -478,13 +479,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
* the appropriate flushing if need be.
*/
if (num_items > 0 && root != root->fs_info->chunk_root) {
- if (root->fs_info->quota_enabled &&
- is_fstree(root->root_key.objectid)) {
- qgroup_reserved = num_items * root->nodesize;
- ret = btrfs_qgroup_reserve(root, qgroup_reserved);
- if (ret)
- return ERR_PTR(ret);
- }
+ qgroup_reserved = num_items * root->nodesize;
+ ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+ if (ret)
+ return ERR_PTR(ret);
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
/*
@@ -502,7 +500,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
goto reserve_fail;
}
again:
- h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h) {
ret = -ENOMEM;
goto alloc_fail;
@@ -543,26 +541,13 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
- h->blocks_used = 0;
- h->bytes_reserved = 0;
- h->chunk_bytes_reserved = 0;
h->root = root;
- h->delayed_ref_updates = 0;
h->use_count = 1;
- h->adding_csums = 0;
- h->block_rsv = NULL;
- h->orig_rsv = NULL;
- h->aborted = 0;
- h->qgroup_reserved = 0;
- h->delayed_ref_elem.seq = 0;
+
h->type = type;
- h->allocating_chunk = false;
h->can_flush_pending_bgs = true;
- h->reloc_reserved = false;
- h->sync = false;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
- INIT_LIST_HEAD(&h->ordered);
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -579,7 +564,6 @@ again:
h->bytes_reserved = num_bytes;
h->reloc_reserved = reloc_reserved;
}
- h->qgroup_reserved = qgroup_reserved;
got_it:
btrfs_record_root_in_trans(h, root);
@@ -597,20 +581,52 @@ alloc_fail:
btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
num_bytes);
reserve_fail:
- if (qgroup_reserved)
- btrfs_qgroup_free(root, qgroup_reserved);
+ btrfs_qgroup_free_meta(root, qgroup_reserved);
return ERR_PTR(ret);
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_items)
+ unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_ALL);
}
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items,
+ int min_factor)
+{
+ struct btrfs_trans_handle *trans;
+ u64 num_bytes;
+ int ret;
+
+ trans = btrfs_start_transaction(root, num_items);
+ if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+ return trans;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return trans;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ ret = btrfs_cond_migrate_bytes(root->fs_info,
+ &root->fs_info->trans_block_rsv,
+ num_bytes,
+ min_factor);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ERR_PTR(ret);
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->bytes_reserved = num_bytes;
+
+ return trans;
+}
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root, int num_items)
+ struct btrfs_root *root,
+ unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_LIMIT);
@@ -794,12 +810,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- if (!list_empty(&trans->ordered)) {
- spin_lock(&info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
- spin_unlock(&info->trans_lock);
- }
-
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
@@ -815,15 +825,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
must_run_delayed_refs = 2;
}
- if (trans->qgroup_reserved) {
- /*
- * the same root has to be passed here between start_transaction
- * and end_transaction. Subvolume quota depends on this.
- */
- btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -856,6 +857,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
atomic_dec(&cur_trans->num_writers);
extwriter_counter_dec(cur_trans, trans->type);
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
wake_up(&cur_trans->writer_wait);
@@ -1238,6 +1242,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
+ btrfs_qgroup_free_meta_all(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1795,25 +1800,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
}
static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
{
- struct btrfs_ordered_extent *ordered;
-
- spin_lock(&fs_info->trans_lock);
- while (!list_empty(&cur_trans->pending_ordered)) {
- ordered = list_first_entry(&cur_trans->pending_ordered,
- struct btrfs_ordered_extent,
- trans_list);
- list_del_init(&ordered->trans_list);
- spin_unlock(&fs_info->trans_lock);
-
- wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
- &ordered->flags));
- btrfs_put_ordered_extent(ordered);
- spin_lock(&fs_info->trans_lock);
- }
- spin_unlock(&fs_info->trans_lock);
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -1842,10 +1832,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
cur_trans = trans->transaction;
@@ -1865,7 +1851,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- if (!cur_trans->dirty_bg_run) {
+ if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
int run_it = 0;
/* this mutex is also taken before trying to set
@@ -1874,18 +1860,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* after a extents from that block group have been
* allocated for cache files. btrfs_set_block_group_ro
* will wait for the transaction to commit if it
- * finds dirty_bg_run = 1
+ * finds BTRFS_TRANS_DIRTY_BG_RUN set.
*
- * The dirty_bg_run flag is also used to make sure only
- * one process starts all the block group IO. It wouldn't
+ * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+ * only one process starts all the block group IO. It wouldn't
* hurt to have more than one go through, but there's no
* real advantage to it either.
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (!cur_trans->dirty_bg_run) {
+ if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+ &cur_trans->flags))
run_it = 1;
- cur_trans->dirty_bg_run = 1;
- }
mutex_unlock(&root->fs_info->ro_block_group_mutex);
if (run_it)
@@ -1897,7 +1882,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
spin_lock(&root->fs_info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
@@ -1956,7 +1940,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_wait_delalloc_flush(root->fs_info);
- btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+ btrfs_wait_pending_ordered(cur_trans);
btrfs_scrub_pause(root);
/*
@@ -2136,7 +2120,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_transaction(trans, root);
if (ret) {
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
goto scrub_continue;
@@ -2156,7 +2140,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
- if (cur_trans->have_free_bgs)
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(root->fs_info);
root->fs_info->last_trans_committed = cur_trans->transid;
@@ -2198,10 +2182,6 @@ cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index a994bb097..64c8221b6 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -32,6 +32,10 @@ enum btrfs_trans_state {
TRANS_STATE_MAX = 6,
};
+#define BTRFS_TRANS_HAVE_FREE_BGS 0
+#define BTRFS_TRANS_DIRTY_BG_RUN 1
+#define BTRFS_TRANS_CACHE_ENOSPC 2
+
struct btrfs_transaction {
u64 transid;
/*
@@ -46,11 +50,9 @@ struct btrfs_transaction {
*/
atomic_t num_writers;
atomic_t use_count;
+ atomic_t pending_ordered;
- /*
- * true if there is free bgs operations in this transaction
- */
- int have_free_bgs;
+ unsigned long flags;
/* Be protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
@@ -59,9 +61,9 @@ struct btrfs_transaction {
unsigned long start_time;
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
+ wait_queue_head_t pending_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
- struct list_head pending_ordered;
struct list_head switch_commits;
struct list_head dirty_bgs;
struct list_head io_bgs;
@@ -75,12 +77,11 @@ struct btrfs_transaction {
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
+ /* Protected by spin lock fs_info->unused_bgs_lock. */
struct list_head deleted_bgs;
- spinlock_t deleted_bgs_lock;
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
- int dirty_bg_run;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -107,7 +108,6 @@ struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
u64 chunk_bytes_reserved;
- u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
unsigned long blocks_used;
@@ -129,7 +129,6 @@ struct btrfs_trans_handle {
*/
struct btrfs_root *root;
struct seq_list delayed_ref_elem;
- struct list_head ordered;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
@@ -185,9 +184,14 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_items);
+ unsigned int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items,
+ int min_factor);
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root, int num_items);
+ struct btrfs_root *root,
+ unsigned int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6f8af2de5..323e12cc9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&root->log_writer_wait))
wake_up(&root->log_writer_wait);
}
@@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&log_root_tree->log_writer_wait))
wake_up(&log_root_tree->log_writer_wait);
}
@@ -2950,6 +2954,9 @@ out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
mutex_unlock(&log_root_tree->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
@@ -2961,6 +2968,9 @@ out:
atomic_set(&root->log_commit[index1], 0);
mutex_unlock(&root->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -5314,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_error(fs_info, ret, "Failed to pin buffers while "
+ btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
"recovering log root tree.");
goto error;
}
@@ -5328,7 +5338,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't find tree log root.");
goto error;
}
@@ -5346,7 +5356,7 @@ again:
log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't read tree log root.");
goto error;
}
@@ -5361,7 +5371,7 @@ again:
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
- btrfs_error(fs_info, ret, "Couldn't read target root "
+ btrfs_std_error(fs_info, ret, "Couldn't read target root "
"for tree log recovery.");
goto error;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6fc735869..a23399e8e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -42,6 +42,82 @@
#include "dev-replace.h"
#include "sysfs.h"
+const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+ .dev_stripes = 1,
+ .devs_max = 0, /* 0 == as many as possible */
+ .devs_min = 4,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID1] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 2,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_DUP] = {
+ .sub_stripes = 1,
+ .dev_stripes = 2,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID0] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_SINGLE] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_RAID5] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID6] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .tolerated_failures = 2,
+ .devs_increment = 1,
+ .ncopies = 3,
+ },
+};
+
+const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
+ [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
+ [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
+ [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0,
+ [BTRFS_RAID_SINGLE] = 0,
+ [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5,
+ [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
+};
+
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
@@ -156,8 +232,8 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
return dev;
}
@@ -198,7 +274,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
if (IS_ERR(*bdev)) {
ret = PTR_ERR(*bdev);
- printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
goto error;
}
@@ -211,8 +286,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
invalidate_bdev(*bdev);
*bh = btrfs_read_dev_super(*bdev);
- if (!*bh) {
- ret = -EINVAL;
+ if (IS_ERR(*bh)) {
+ ret = PTR_ERR(*bh);
blkdev_put(*bdev, flags);
goto error;
}
@@ -345,6 +420,9 @@ loop_lock:
pending = pending->bi_next;
cur->bi_next = NULL;
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
@@ -765,36 +843,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
- struct btrfs_device *new_device;
- struct rcu_string *name;
-
- if (device->bdev)
- fs_devices->open_devices--;
-
- if (device->writeable &&
- device->devid != BTRFS_DEV_REPLACE_DEVID) {
- list_del_init(&device->dev_alloc_list);
- fs_devices->rw_devices--;
- }
-
- if (device->missing)
- fs_devices->missing_devices--;
-
- new_device = btrfs_alloc_device(NULL, &device->devid,
- device->uuid);
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
- /* Safe because we are under uuid_mutex */
- if (device->name) {
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
- BUG_ON(!name); /* -ENOMEM */
- rcu_assign_pointer(new_device->name, name);
- }
-
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
- new_device->fs_devices = device->fs_devices;
-
- call_rcu(&device->rcu, free_device);
+ btrfs_close_one_device(device);
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -1402,7 +1451,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_error(root->fs_info, ret, "Slot search failed");
+ btrfs_std_error(root->fs_info, ret, "Slot search failed");
goto out;
}
@@ -1410,10 +1459,10 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret) {
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to remove dev extent item");
} else {
- trans->transaction->have_free_bgs = 1;
+ set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
}
out:
btrfs_free_path(path);
@@ -1801,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
- btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
}
call_rcu(&device->rcu, free_device);
@@ -1924,7 +1973,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->writeable) {
fs_devices->rw_devices--;
/* zero out the old super if it is writable */
- btrfs_scratch_superblock(srcdev);
+ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
if (srcdev->bdev)
@@ -1971,10 +2020,10 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
if (tgtdev->bdev) {
- btrfs_scratch_superblock(tgtdev);
+ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
@@ -2041,10 +2090,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
}
}
- if (!*device) {
- btrfs_err(root->fs_info, "no missing device found");
- return -ENOENT;
- }
+ if (!*device)
+ return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
return 0;
} else {
@@ -2309,7 +2356,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
tmp + 1);
/* add sysfs device entry */
- btrfs_kobj_add_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
@@ -2350,9 +2397,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
- if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+ if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj,
fsid_buf))
- pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
+ btrfs_warn(root->fs_info,
+ "sysfs: failed to create fsid for sprout");
}
root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2368,7 +2416,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_relocate_sys_chunks(root);
if (ret < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to relocate sys chunks after "
"device initialization. This can be fixed "
"using the \"btrfs balance\" command.");
@@ -2388,7 +2436,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
error_trans:
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
- btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2613,7 +2661,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_error(root->fs_info, -ENOENT,
+ btrfs_std_error(root->fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
@@ -2621,7 +2669,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
@@ -2803,10 +2851,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
if (ret)
return ret;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_trans_remove_block_group(root->fs_info,
+ chunk_offset);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
return ret;
}
@@ -3009,16 +3058,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
* (albeit full) chunks.
*/
if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->data.usage = 90;
}
if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->sys.usage = 90;
}
if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->meta.usage = 90;
@@ -3070,17 +3122,50 @@ static int chunk_profiles_filter(u64 chunk_type,
return 1;
}
-static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
+ u64 chunk_used;
+ u64 user_thresh_min;
+ u64 user_thresh_max;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = btrfs_block_group_used(&cache->item);
+
+ if (bargs->usage_min == 0)
+ user_thresh_min = 0;
+ else
+ user_thresh_min = div_factor_fine(cache->key.offset,
+ bargs->usage_min);
+
+ if (bargs->usage_max == 0)
+ user_thresh_max = 1;
+ else if (bargs->usage_max > 100)
+ user_thresh_max = cache->key.offset;
+ else
+ user_thresh_max = div_factor_fine(cache->key.offset,
+ bargs->usage_max);
+
+ if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset, struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group_cache *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
- if (bargs->usage == 0)
+ if (bargs->usage_min == 0)
user_thresh = 1;
else if (bargs->usage > 100)
user_thresh = cache->key.offset;
@@ -3170,6 +3255,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf,
return 1;
}
+static int chunk_stripes_range_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+ if (bargs->stripes_min <= num_stripes
+ && num_stripes <= bargs->stripes_max)
+ return 0;
+
+ return 1;
+}
+
static int chunk_soft_convert_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
@@ -3216,6 +3314,9 @@ static int should_balance_chunk(struct btrfs_root *root,
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
return 0;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+ chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) {
+ return 0;
}
/* devid filter */
@@ -3236,6 +3337,12 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
}
+ /* stripes filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
+ chunk_stripes_range_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
@@ -3250,6 +3357,16 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
else
bargs->limit--;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
+ /*
+ * Same logic as the 'limit' filter; the minimum cannot be
+ * determined here because we do not have the global informatoin
+ * about the count of all chunks that satisfy the filters.
+ */
+ if (bargs->limit_max == 0)
+ return 0;
+ else
+ bargs->limit_max--;
}
return 1;
@@ -3264,6 +3381,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
+ u64 chunk_type;
struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
@@ -3274,9 +3392,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
int ret;
int enospc_errors = 0;
bool counting = true;
+ /* The single value limit and min/max limits use the same bytes in the */
u64 limit_data = bctl->data.limit;
u64 limit_meta = bctl->meta.limit;
u64 limit_sys = bctl->sys.limit;
+ u32 count_data = 0;
+ u32 count_meta = 0;
+ u32 count_sys = 0;
+ int chunk_reserved = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -3317,6 +3440,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->balance_lock);
again:
if (!counting) {
+ /*
+ * The single value limit and min/max limits use the same bytes
+ * in the
+ */
bctl->data.limit = limit_data;
bctl->meta.limit = limit_meta;
bctl->sys.limit = limit_sys;
@@ -3364,6 +3491,7 @@ again:
}
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ chunk_type = btrfs_chunk_type(leaf, chunk);
if (!counting) {
spin_lock(&fs_info->balance_lock);
@@ -3373,6 +3501,7 @@ again:
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
+
btrfs_release_path(path);
if (!ret) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3384,9 +3513,49 @@ again:
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ count_data++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ count_sys++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ count_meta++;
+
goto loop;
}
+ /*
+ * Apply limit_min filter, no need to check if the LIMITS
+ * filter is used, limit_min is 0 by default
+ */
+ if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+ count_data < bctl->data.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
+ count_meta < bctl->meta.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ count_sys < bctl->sys.limit_min)) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto loop;
+ }
+
+ if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+ trans = btrfs_start_transaction(chunk_root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ ret = PTR_ERR(trans);
+ goto error;
+ }
+
+ ret = btrfs_force_chunk_alloc(trans, chunk_root,
+ BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans, chunk_root);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto error;
+ }
+ chunk_reserved = 1;
+ }
+
ret = btrfs_relocate_chunk(chunk_root,
found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3461,11 +3630,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
if (ret)
- btrfs_std_error(fs_info, ret);
+ btrfs_std_error(fs_info, ret, NULL);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
+/* Non-zero return value signifies invalidity */
+static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
+ u64 allowed)
+{
+ return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl_arg->target, 1) ||
+ (bctl_arg->target & ~allowed)));
+}
+
/*
* Should be called with both balance and volume mutexes held
*/
@@ -3523,27 +3701,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if (num_devices > 3)
allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID6);
- if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->data.target, 1) ||
- (bctl->data.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->data, allowed)) {
btrfs_err(fs_info, "unable to start balance with target "
"data profile %llu",
bctl->data.target);
ret = -EINVAL;
goto out;
}
- if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->meta.target, 1) ||
- (bctl->meta.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->meta, allowed)) {
btrfs_err(fs_info,
"unable to start balance with target metadata profile %llu",
bctl->meta.target);
ret = -EINVAL;
goto out;
}
- if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->sys.target, 1) ||
- (bctl->sys.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->sys, allowed)) {
btrfs_err(fs_info,
"unable to start balance with target system profile %llu",
bctl->sys.target);
@@ -4285,65 +4457,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
return 0;
}
-static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = {
- .sub_stripes = 2,
- .dev_stripes = 1,
- .devs_max = 0, /* 0 == as many as possible */
- .devs_min = 4,
- .devs_increment = 2,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID1] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 2,
- .devs_min = 2,
- .devs_increment = 2,
- .ncopies = 2,
- },
- [BTRFS_RAID_DUP] = {
- .sub_stripes = 1,
- .dev_stripes = 2,
- .devs_max = 1,
- .devs_min = 1,
- .devs_increment = 1,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID0] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 2,
- .devs_increment = 1,
- .ncopies = 1,
- },
- [BTRFS_RAID_SINGLE] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 1,
- .devs_min = 1,
- .devs_increment = 1,
- .ncopies = 1,
- },
- [BTRFS_RAID_RAID5] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 2,
- .devs_increment = 1,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID6] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 3,
- .devs_increment = 1,
- .ncopies = 3,
- },
-};
-
static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
/* TODO allow them to set a preferred stripe size */
@@ -6594,8 +6707,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
BUG_ON(!path);
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "error %d while searching for dev_stats item for device %s!\n",
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "error %d while searching for dev_stats item for device %s",
ret, rcu_str_deref(device->name));
goto out;
}
@@ -6605,8 +6718,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "delete too small dev_stats item for device %s failed %d!\n",
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "delete too small dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
}
@@ -6619,9 +6732,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "insert dev_stats item for device %s failed %d!\n",
- rcu_str_deref(device->name), ret);
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "insert dev_stats item for device %s failed %d",
+ rcu_str_deref(device->name), ret);
goto out;
}
}
@@ -6675,8 +6788,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
- printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
- "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6695,8 +6808,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- printk_in_rcu(KERN_INFO "BTRFS: "
- "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ btrfs_info_in_rcu(dev->dev_root->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6740,22 +6853,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
return 0;
}
-int btrfs_scratch_superblock(struct btrfs_device *device)
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path)
{
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
+ int copy_num;
- bh = btrfs_read_dev_super(device->bdev);
- if (!bh)
- return -EINVAL;
- disk_super = (struct btrfs_super_block *)bh->b_data;
+ if (!bdev)
+ return;
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- brelse(bh);
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
+ copy_num++) {
- return 0;
+ if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ brelse(bh);
+ }
+
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
}
/*
@@ -6823,3 +6948,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
fs_devices = fs_devices->seed;
}
}
+
+void btrfs_close_one_device(struct btrfs_device *device)
+{
+ struct btrfs_fs_devices *fs_devices = device->fs_devices;
+ struct btrfs_device *new_device;
+ struct rcu_string *name;
+
+ if (device->bdev)
+ fs_devices->open_devices--;
+
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ if (device->missing)
+ fs_devices->missing_devices--;
+
+ new_device = btrfs_alloc_device(NULL, &device->devid,
+ device->uuid);
+ BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ if (device->name) {
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(!name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
+ }
+
+ list_replace_rcu(&device->dev_list, &new_device->dev_list);
+ new_device->fs_devices = device->fs_devices;
+
+ call_rcu(&device->rcu, free_device);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 595279a8b..d5c84f6b1 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -256,7 +256,7 @@ struct btrfs_fs_devices {
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
- struct kobject super_kobj;
+ struct kobject fsid_kobj;
struct kobject *device_dir_kobj;
struct completion kobj_unregister;
};
@@ -334,10 +334,15 @@ struct btrfs_raid_attr {
int dev_stripes; /* stripes per dev */
int devs_max; /* max devs to use */
int devs_min; /* min devs needed */
+ int tolerated_failures; /* max tolerated fail devs */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
};
+extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
+
+extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
+
struct map_lookup {
u64 type;
int io_align;
@@ -375,6 +380,9 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
+#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
+#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
+#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10)
#define BTRFS_BALANCE_ARGS_MASK \
(BTRFS_BALANCE_ARGS_PROFILES | \
@@ -382,7 +390,10 @@ struct map_lookup {
BTRFS_BALANCE_ARGS_DEVID | \
BTRFS_BALANCE_ARGS_DRANGE | \
BTRFS_BALANCE_ARGS_VRANGE | \
- BTRFS_BALANCE_ARGS_LIMIT)
+ BTRFS_BALANCE_ARGS_LIMIT | \
+ BTRFS_BALANCE_ARGS_LIMIT_RANGE | \
+ BTRFS_BALANCE_ARGS_STRIPES_RANGE | \
+ BTRFS_BALANCE_ARGS_USAGE_RANGE)
/*
* Profile changing flags. When SOFT is set we won't relocate chunk if
@@ -482,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
-int btrfs_scratch_superblock(struct btrfs_device *device);
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path);
int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
u64 logical, u64 len, int mirror_num);
unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
@@ -555,5 +566,6 @@ static inline void unlock_chunks(struct btrfs_root *root)
struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
+void btrfs_close_one_device(struct btrfs_device *device);
#endif