summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig14
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/bcache/closure.h3
-rw-r--r--drivers/md/bcache/io.c3
-rw-r--r--drivers/md/bcache/journal.c4
-rw-r--r--drivers/md/bcache/request.c17
-rw-r--r--drivers/md/bcache/super.c10
-rw-r--r--drivers/md/bcache/util.h10
-rw-r--r--drivers/md/bitmap.c21
-rw-r--r--drivers/md/dm-bio-prison.c26
-rw-r--r--drivers/md/dm-bio-prison.h13
-rw-r--r--drivers/md/dm-cache-metadata.c133
-rw-r--r--drivers/md/dm-cache-metadata.h10
-rw-r--r--drivers/md/dm-cache-policy-cleaner.c3
-rw-r--r--drivers/md/dm-cache-policy-internal.h47
-rw-r--r--drivers/md/dm-cache-policy-mq.c54
-rw-r--r--drivers/md/dm-cache-policy-smq.c1793
-rw-r--r--drivers/md/dm-cache-policy.h15
-rw-r--r--drivers/md/dm-cache-target.c811
-rw-r--r--drivers/md/dm-crypt.c30
-rw-r--r--drivers/md/dm-log-writes.c4
-rw-r--r--drivers/md/dm-raid.c225
-rw-r--r--drivers/md/dm-raid1.c77
-rw-r--r--drivers/md/dm-snap.c1
-rw-r--r--drivers/md/dm-stats.c339
-rw-r--r--drivers/md/dm-stats.h4
-rw-r--r--drivers/md/dm-stripe.c4
-rw-r--r--drivers/md/dm-thin-metadata.c128
-rw-r--r--drivers/md/dm-thin-metadata.h11
-rw-r--r--drivers/md/dm-thin.c651
-rw-r--r--drivers/md/dm-verity.c2
-rw-r--r--drivers/md/dm.c123
-rw-r--r--drivers/md/dm.h1
-rw-r--r--drivers/md/md-cluster.c12
-rw-r--r--drivers/md/md-cluster.h2
-rw-r--r--drivers/md/md.c166
-rw-r--r--drivers/md/md.h1
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c6
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h1
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h6
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c142
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c37
-rw-r--r--drivers/md/persistent-data/dm-btree.c7
-rw-r--r--drivers/md/persistent-data/dm-btree.h9
-rw-r--r--drivers/md/raid1.c11
-rw-r--r--drivers/md/raid10.c25
-rw-r--r--drivers/md/raid5.c84
-rw-r--r--drivers/md/raid5.h6
48 files changed, 4266 insertions, 838 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index edcf4ab66..bfec3bdfe 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -259,7 +259,7 @@ config DM_CRYPT
the ciphers you're going to use in the cryptoapi configuration.
For further information on dm-crypt and userspace tools see:
- <http://code.google.com/p/cryptsetup/wiki/DMCrypt>
+ <https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt>
To compile this code as a module, choose M here: the module will
be called dm-crypt.
@@ -304,6 +304,18 @@ config DM_CACHE_MQ
This is meant to be a general purpose policy. It prioritises
reads over writes.
+config DM_CACHE_SMQ
+ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
+ depends on DM_CACHE
+ default y
+ ---help---
+ A cache policy that uses a multiqueue ordered by recent hits
+ to select which blocks should be promoted and demoted.
+ This is meant to be a general purpose policy. It prioritises
+ reads over writes. This SMQ policy (vs MQ) offers the promise
+ of less memory utilization, improved performance and increased
+ adaptability in the face of changing workloads.
+
config DM_CACHE_CLEANER
tristate "Cleaner Cache Policy (EXPERIMENTAL)"
depends on DM_CACHE
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index dba4db598..462f443a4 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -13,6 +13,7 @@ dm-log-userspace-y \
dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
dm-cache-mq-y += dm-cache-policy-mq.o
+dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
md-mod-y += md.o bitmap.o
@@ -54,6 +55,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index a08e3eeac..79a6d63e8 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -320,7 +320,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
do { \
set_closure_fn(_cl, _fn, _wq); \
closure_sub(_cl, CLOSURE_RUNNING + 1); \
- return; \
} while (0)
/**
@@ -349,7 +348,6 @@ do { \
do { \
set_closure_fn(_cl, _fn, _wq); \
closure_queue(_cl); \
- return; \
} while (0)
/**
@@ -365,7 +363,6 @@ do { \
do { \
set_closure_fn(_cl, _destructor, NULL); \
closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
- return; \
} while (0)
/**
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fa028fa82..bf6a9ca18 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -55,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl)
s->bio->bi_end_io = s->bi_end_io;
s->bio->bi_private = s->bi_private;
- bio_endio_nodec(s->bio, 0);
+ bio_endio(s->bio, 0);
closure_debug_destroy(&s->cl);
mempool_free(s, s->p->bio_split_hook);
@@ -105,6 +105,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
} while (n != bio);
continue_at(&s->cl, bch_bio_submit_split_done, NULL);
+ return;
submit:
generic_make_request(bio);
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index fe080ad0e..418607a6b 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -157,7 +157,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
for_each_cache(ca, c, iter) {
struct journal_device *ja = &ca->journal;
- unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
+ DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
unsigned i, l, r, m;
uint64_t seq;
@@ -592,12 +592,14 @@ static void journal_write_unlocked(struct closure *cl)
if (!w->need_write) {
closure_return_with_destructor(cl, journal_write_unlock);
+ return;
} else if (journal_full(&c->journal)) {
journal_reclaim(c);
spin_unlock(&c->journal.lock);
btree_flush_write(c);
continue_at(cl, journal_write, system_wq);
+ return;
}
c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ab43faddb..f29279099 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/hash.h>
#include <linux/random.h>
+#include <linux/backing-dev.h>
#include <trace/events/bcache.h>
@@ -87,8 +88,10 @@ static void bch_data_insert_keys(struct closure *cl)
if (journal_ref)
atomic_dec_bug(journal_ref);
- if (!op->insert_data_done)
+ if (!op->insert_data_done) {
continue_at(cl, bch_data_insert_start, op->wq);
+ return;
+ }
bch_keylist_free(&op->insert_keys);
closure_return(cl);
@@ -215,8 +218,10 @@ static void bch_data_insert_start(struct closure *cl)
/* 1 for the device pointer and 1 for the chksum */
if (bch_keylist_realloc(&op->insert_keys,
3 + (op->csum ? 1 : 0),
- op->c))
+ op->c)) {
continue_at(cl, bch_data_insert_keys, op->wq);
+ return;
+ }
k = op->insert_keys.top;
bkey_init(k);
@@ -254,6 +259,7 @@ static void bch_data_insert_start(struct closure *cl)
op->insert_data_done = true;
continue_at(cl, bch_data_insert_keys, op->wq);
+ return;
err:
/* bch_alloc_sectors() blocks if s->writeback = true */
BUG_ON(op->writeback);
@@ -575,8 +581,10 @@ static void cache_lookup(struct closure *cl)
ret = bch_btree_map_keys(&s->op, s->iop.c,
&KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
cache_lookup_fn, MAP_END_KEY);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
continue_at(cl, cache_lookup, bcache_wq);
+ return;
+ }
closure_return(cl);
}
@@ -619,7 +627,7 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio)
bio->bi_end_io = request_endio;
bio->bi_private = &s->cl;
- atomic_set(&bio->bi_cnt, 3);
+ bio_cnt_set(bio, 3);
}
static void search_free(struct closure *cl)
@@ -1084,6 +1092,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
continue_at_nobarrier(&s->cl,
flash_dev_nodata,
bcache_wq);
+ return;
} else if (rw) {
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
&KEY(d->id, bio->bi_iter.bi_sector, 0),
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 4dd2bb716..94980bfca 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -760,14 +760,8 @@ static void bcache_device_free(struct bcache_device *d)
bio_split_pool_free(&d->bio_split_hook);
if (d->bio_split)
bioset_free(d->bio_split);
- if (is_vmalloc_addr(d->full_dirty_stripes))
- vfree(d->full_dirty_stripes);
- else
- kfree(d->full_dirty_stripes);
- if (is_vmalloc_addr(d->stripe_sectors_dirty))
- vfree(d->stripe_sectors_dirty);
- else
- kfree(d->stripe_sectors_dirty);
+ kvfree(d->full_dirty_stripes);
+ kvfree(d->stripe_sectors_dirty);
closure_debug_destroy(&d->cl);
}
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 98df7572b..1d04c4859 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -52,10 +52,7 @@ struct closure;
#define free_heap(heap) \
do { \
- if (is_vmalloc_addr((heap)->data)) \
- vfree((heap)->data); \
- else \
- kfree((heap)->data); \
+ kvfree((heap)->data); \
(heap)->data = NULL; \
} while (0)
@@ -163,10 +160,7 @@ do { \
#define free_fifo(fifo) \
do { \
- if (is_vmalloc_addr((fifo)->data)) \
- vfree((fifo)->data); \
- else \
- kfree((fifo)->data); \
+ kvfree((fifo)->data); \
(fifo)->data = NULL; \
} while (0)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index a2ed982b0..e51de52ee 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -559,6 +559,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
unsigned long sectors_reserved = 0;
int err = -EINVAL;
struct page *sb_page;
+ loff_t offset = bitmap->mddev->bitmap_info.offset;
if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
chunksize = 128 * 1024 * 1024;
@@ -569,7 +570,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
goto out_no_sb;
}
/* page 0 is the superblock, read it... */
- sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ sb_page = alloc_page(GFP_KERNEL);
if (!sb_page)
return -ENOMEM;
bitmap->storage.sb_page = sb_page;
@@ -585,9 +586,9 @@ re_read:
bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
/* to 4k blocks */
bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
- bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+ offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
- bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+ bitmap->cluster_slot, offset);
}
if (bitmap->storage.file) {
@@ -598,7 +599,7 @@ re_read:
bitmap, bytes, sb_page);
} else {
err = read_sb_page(bitmap->mddev,
- bitmap->mddev->bitmap_info.offset,
+ offset,
sb_page,
0, sizeof(bitmap_super_t));
}
@@ -658,7 +659,7 @@ re_read:
goto out;
}
events = le64_to_cpu(sb->events);
- if (err == 0 && !nodes && (events < bitmap->mddev->events)) {
+ if (!nodes && (events < bitmap->mddev->events)) {
printk(KERN_INFO
"%s: bitmap file is out of date (%llu < %llu) "
"-- forcing full recovery\n",
@@ -680,7 +681,7 @@ out:
kunmap_atomic(sb);
/* Assiging chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
- if (nodes && (bitmap->cluster_slot < 0)) {
+ if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
err = md_setup_cluster(bitmap->mddev, nodes);
if (err) {
pr_err("%s: Could not setup cluster service (%d)\n",
@@ -848,7 +849,7 @@ static void bitmap_file_kick(struct bitmap *bitmap)
if (bitmap->storage.file) {
path = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (path)
- ptr = d_path(&bitmap->storage.file->f_path,
+ ptr = file_path(bitmap->storage.file,
path, PAGE_SIZE);
printk(KERN_ALERT
@@ -1875,10 +1876,6 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
if (IS_ERR(bitmap))
return PTR_ERR(bitmap);
- rv = bitmap_read_sb(bitmap);
- if (rv)
- goto err;
-
rv = bitmap_init_from_disk(bitmap, 0);
if (rv)
goto err;
@@ -1936,7 +1933,7 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
chunk_kb ? "KB" : "B");
if (bitmap->storage.file) {
seq_printf(seq, ", file: ");
- seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
+ seq_file_path(seq, bitmap->storage.file, " \t\n");
}
seq_printf(seq, "\n");
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index be065300e..cd6d1d21e 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -255,6 +255,32 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_cell_visit_release);
+static int __promote_or_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell)
+{
+ if (bio_list_empty(&cell->bios)) {
+ rb_erase(&cell->node, &prison->cells);
+ return 1;
+ }
+
+ cell->holder = bio_list_pop(&cell->bios);
+ return 0;
+}
+
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __promote_or_release(prison, cell);
+ spin_unlock_irqrestore(&prison->lock, flags);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_promote_or_release);
+
/*----------------------------------------------------------------*/
#define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 74cf01144..54352f009 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -101,6 +101,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
void (*visit_fn)(void *, struct dm_bio_prison_cell *),
void *context, struct dm_bio_prison_cell *cell);
+/*
+ * Rather than always releasing the prisoners in a cell, the client may
+ * want to promote one of them to be the new holder. There is a race here
+ * though between releasing an empty cell, and other threads adding new
+ * inmates. So this function makes the decision with its lock held.
+ *
+ * This function can have two outcomes:
+ * i) An inmate is promoted to be the holder of the cell (return value of 0).
+ * ii) The cell has no inmate for promotion and is released (return value of 1).
+ */
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell);
+
/*----------------------------------------------------------------*/
/*
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index c1c010498..20cc36b01 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -39,6 +39,8 @@
enum superblock_flag_bits {
/* for spotting crashes that would invalidate the dirty bitset */
CLEAN_SHUTDOWN,
+ /* metadata must be checked using the tools */
+ NEEDS_CHECK,
};
/*
@@ -107,6 +109,7 @@ struct dm_cache_metadata {
struct dm_disk_bitset discard_info;
struct rw_semaphore root_lock;
+ unsigned long flags;
dm_block_t root;
dm_block_t hint_root;
dm_block_t discard_root;
@@ -129,6 +132,14 @@ struct dm_cache_metadata {
* buffer before the superblock is locked and updated.
*/
__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+ /*
+ * Set if a transaction has to be aborted but the attempt to roll
+ * back to the previous (good) transaction failed. The only
+ * metadata operation permissible in this state is the closing of
+ * the device.
+ */
+ bool fail_io:1;
};
/*-------------------------------------------------------------------
@@ -527,6 +538,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
static void read_superblock_fields(struct dm_cache_metadata *cmd,
struct cache_disk_superblock *disk_super)
{
+ cmd->flags = le32_to_cpu(disk_super->flags);
cmd->root = le64_to_cpu(disk_super->mapping_root);
cmd->hint_root = le64_to_cpu(disk_super->hint_root);
cmd->discard_root = le64_to_cpu(disk_super->discard_root);
@@ -625,6 +637,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
if (mutator)
update_flags(disk_super, mutator);
+ disk_super->flags = cpu_to_le32(cmd->flags);
disk_super->mapping_root = cpu_to_le64(cmd->root);
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
@@ -693,6 +706,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
cmd->cache_blocks = 0;
cmd->policy_hint_size = policy_hint_size;
cmd->changed = true;
+ cmd->fail_io = false;
r = __create_persistent_data_objects(cmd, may_format_device);
if (r) {
@@ -796,7 +810,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
list_del(&cmd->list);
mutex_unlock(&table_lock);
- __destroy_persistent_data_objects(cmd);
+ if (!cmd->fail_io)
+ __destroy_persistent_data_objects(cmd);
kfree(cmd);
}
}
@@ -848,13 +863,26 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
return 0;
}
+#define WRITE_LOCK(cmd) \
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+ return -EINVAL; \
+ down_write(&cmd->root_lock)
+
+#define WRITE_LOCK_VOID(cmd) \
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+ return; \
+ down_write(&cmd->root_lock)
+
+#define WRITE_UNLOCK(cmd) \
+ up_write(&cmd->root_lock)
+
int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
{
int r;
bool clean;
__le64 null_mapping = pack_value(0, 0);
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
__dm_bless_for_disk(&null_mapping);
if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
@@ -880,7 +908,7 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
cmd->changed = true;
out:
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -891,7 +919,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = dm_bitset_resize(&cmd->discard_info,
cmd->discard_root,
from_dblock(cmd->discard_nr_blocks),
@@ -903,7 +931,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
}
cmd->changed = true;
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -946,9 +974,9 @@ int dm_cache_set_discard(struct dm_cache_metadata *cmd,
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __discard(cmd, dblock, discard);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1020,9 +1048,9 @@ int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __remove(cmd, cblock);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1048,9 +1076,9 @@ int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __insert(cmd, cblock, oblock);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1234,9 +1262,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __dirty(cmd, cblock, dirty);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1252,9 +1280,9 @@ void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
struct dm_cache_statistics *stats)
{
- down_write(&cmd->root_lock);
+ WRITE_LOCK_VOID(cmd);
cmd->stats = *stats;
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
}
int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
@@ -1263,7 +1291,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
clear_clean_shutdown);
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __commit_transaction(cmd, mutator);
if (r)
goto out;
@@ -1271,7 +1299,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
r = __begin_transaction(cmd);
out:
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1376,9 +1404,9 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = write_hints(cmd, policy);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1387,3 +1415,70 @@ int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
{
return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
}
+
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd)
+{
+ WRITE_LOCK_VOID(cmd);
+ dm_bm_set_read_only(cmd->bm);
+ WRITE_UNLOCK(cmd);
+}
+
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd)
+{
+ WRITE_LOCK_VOID(cmd);
+ dm_bm_set_read_write(cmd->bm);
+ WRITE_UNLOCK(cmd);
+}
+
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct cache_disk_superblock *disk_super;
+
+ /*
+ * We ignore fail_io for this function.
+ */
+ down_write(&cmd->root_lock);
+ set_bit(NEEDS_CHECK, &cmd->flags);
+
+ r = superblock_lock(cmd, &sblock);
+ if (r) {
+ DMERR("couldn't read superblock");
+ goto out;
+ }
+
+ disk_super = dm_block_data(sblock);
+ disk_super->flags = cpu_to_le32(cmd->flags);
+
+ dm_bm_unlock(sblock);
+
+out:
+ up_write(&cmd->root_lock);
+ return r;
+}
+
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd)
+{
+ bool needs_check;
+
+ down_read(&cmd->root_lock);
+ needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags);
+ up_read(&cmd->root_lock);
+
+ return needs_check;
+}
+
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
+{
+ int r;
+
+ WRITE_LOCK(cmd);
+ __destroy_persistent_data_objects(cmd);
+ r = __create_persistent_data_objects(cmd, false);
+ if (r)
+ cmd->fail_io = true;
+ WRITE_UNLOCK(cmd);
+
+ return r;
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 4ecc403be..2ffee21f3 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -102,6 +102,10 @@ struct dm_cache_statistics {
void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
struct dm_cache_statistics *stats);
+
+/*
+ * 'void' because it's no big deal if it fails.
+ */
void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
struct dm_cache_statistics *stats);
@@ -133,6 +137,12 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
*/
int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd);
+
/*----------------------------------------------------------------*/
#endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index 004e463c9..240c9f0e8 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -359,7 +359,8 @@ static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
static int wb_writeback_work(struct dm_cache_policy *pe,
dm_oblock_t *oblock,
- dm_cblock_t *cblock)
+ dm_cblock_t *cblock,
+ bool critical_only)
{
int r = -ENOENT;
struct policy *p = to_policy(pe);
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index c198e6def..2816018fa 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -7,6 +7,7 @@
#ifndef DM_CACHE_POLICY_INTERNAL_H
#define DM_CACHE_POLICY_INTERNAL_H
+#include <linux/vmalloc.h>
#include "dm-cache-policy.h"
/*----------------------------------------------------------------*/
@@ -55,9 +56,10 @@ static inline int policy_walk_mappings(struct dm_cache_policy *p,
static inline int policy_writeback_work(struct dm_cache_policy *p,
dm_oblock_t *oblock,
- dm_cblock_t *cblock)
+ dm_cblock_t *cblock,
+ bool critical_only)
{
- return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
+ return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT;
}
static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
@@ -81,19 +83,21 @@ static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
return p->residency(p);
}
-static inline void policy_tick(struct dm_cache_policy *p)
+static inline void policy_tick(struct dm_cache_policy *p, bool can_block)
{
if (p->tick)
- return p->tick(p);
+ return p->tick(p, can_block);
}
-static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result,
+ unsigned maxlen, ssize_t *sz_ptr)
{
- ssize_t sz = 0;
+ ssize_t sz = *sz_ptr;
if (p->emit_config_values)
- return p->emit_config_values(p, result, maxlen);
+ return p->emit_config_values(p, result, maxlen, sz_ptr);
- DMEMIT("0");
+ DMEMIT("0 ");
+ *sz_ptr = sz;
return 0;
}
@@ -106,6 +110,33 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
/*----------------------------------------------------------------*/
/*
+ * Some utility functions commonly used by policies and the core target.
+ */
+static inline size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+ return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static inline unsigned long *alloc_bitset(unsigned nr_entries)
+{
+ size_t s = bitset_size_in_bytes(nr_entries);
+ return vzalloc(s);
+}
+
+static inline void clear_bitset(void *bitset, unsigned nr_entries)
+{
+ size_t s = bitset_size_in_bytes(nr_entries);
+ memset(bitset, 0, s);
+}
+
+static inline void free_bitset(unsigned long *bits)
+{
+ vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
* Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
*/
struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 515d44bf2..aa1b41ca4 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1236,7 +1236,7 @@ static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
}
static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
- dm_cblock_t *cblock)
+ dm_cblock_t *cblock, bool critical_only)
{
int r;
struct mq_policy *mq = to_mq_policy(p);
@@ -1283,7 +1283,7 @@ static dm_cblock_t mq_residency(struct dm_cache_policy *p)
return r;
}
-static void mq_tick(struct dm_cache_policy *p)
+static void mq_tick(struct dm_cache_policy *p, bool can_block)
{
struct mq_policy *mq = to_mq_policy(p);
unsigned long flags;
@@ -1291,6 +1291,12 @@ static void mq_tick(struct dm_cache_policy *p)
spin_lock_irqsave(&mq->tick_lock, flags);
mq->tick_protected++;
spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+ if (can_block) {
+ mutex_lock(&mq->lock);
+ copy_tick(mq);
+ mutex_unlock(&mq->lock);
+ }
}
static int mq_set_config_value(struct dm_cache_policy *p,
@@ -1323,22 +1329,24 @@ static int mq_set_config_value(struct dm_cache_policy *p,
return 0;
}
-static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
+ unsigned maxlen, ssize_t *sz_ptr)
{
- ssize_t sz = 0;
+ ssize_t sz = *sz_ptr;
struct mq_policy *mq = to_mq_policy(p);
DMEMIT("10 random_threshold %u "
"sequential_threshold %u "
"discard_promote_adjustment %u "
"read_promote_adjustment %u "
- "write_promote_adjustment %u",
+ "write_promote_adjustment %u ",
mq->tracker.thresholds[PATTERN_RANDOM],
mq->tracker.thresholds[PATTERN_SEQUENTIAL],
mq->discard_promote_adjustment,
mq->read_promote_adjustment,
mq->write_promote_adjustment);
+ *sz_ptr = sz;
return 0;
}
@@ -1423,21 +1431,12 @@ bad_pre_cache_init:
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
};
-static struct dm_cache_policy_type default_policy_type = {
- .name = "default",
- .version = {1, 3, 0},
- .hint_size = 4,
- .owner = THIS_MODULE,
- .create = mq_create,
- .real = &mq_policy_type
-};
-
static int __init mq_init(void)
{
int r;
@@ -1447,36 +1446,21 @@ static int __init mq_init(void)
__alignof__(struct entry),
0, NULL);
if (!mq_entry_cache)
- goto bad;
+ return -ENOMEM;
r = dm_cache_policy_register(&mq_policy_type);
if (r) {
DMERR("register failed %d", r);
- goto bad_register_mq;
- }
-
- r = dm_cache_policy_register(&default_policy_type);
- if (!r) {
- DMINFO("version %u.%u.%u loaded",
- mq_policy_type.version[0],
- mq_policy_type.version[1],
- mq_policy_type.version[2]);
- return 0;
+ kmem_cache_destroy(mq_entry_cache);
+ return -ENOMEM;
}
- DMERR("register failed (as default) %d", r);
-
- dm_cache_policy_unregister(&mq_policy_type);
-bad_register_mq:
- kmem_cache_destroy(mq_entry_cache);
-bad:
- return -ENOMEM;
+ return 0;
}
static void __exit mq_exit(void)
{
dm_cache_policy_unregister(&mq_policy_type);
- dm_cache_policy_unregister(&default_policy_type);
kmem_cache_destroy(mq_entry_cache);
}
@@ -1487,5 +1471,3 @@ module_exit(mq_exit);
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("mq cache policy");
-
-MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
new file mode 100644
index 000000000..200366c62
--- /dev/null
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -0,0 +1,1793 @@
+/*
+ * Copyright (C) 2015 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm-cache-policy-internal.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/math64.h>
+
+#define DM_MSG_PREFIX "cache-policy-smq"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Safe division functions that return zero on divide by zero.
+ */
+static unsigned safe_div(unsigned n, unsigned d)
+{
+ return d ? n / d : 0u;
+}
+
+static unsigned safe_mod(unsigned n, unsigned d)
+{
+ return d ? n % d : 0u;
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry {
+ unsigned hash_next:28;
+ unsigned prev:28;
+ unsigned next:28;
+ unsigned level:7;
+ bool dirty:1;
+ bool allocated:1;
+ bool sentinel:1;
+
+ dm_oblock_t oblock;
+};
+
+/*----------------------------------------------------------------*/
+
+#define INDEXER_NULL ((1u << 28u) - 1u)
+
+/*
+ * An entry_space manages a set of entries that we use for the queues.
+ * The clean and dirty queues share entries, so this object is separate
+ * from the queue itself.
+ */
+struct entry_space {
+ struct entry *begin;
+ struct entry *end;
+};
+
+static int space_init(struct entry_space *es, unsigned nr_entries)
+{
+ if (!nr_entries) {
+ es->begin = es->end = NULL;
+ return 0;
+ }
+
+ es->begin = vzalloc(sizeof(struct entry) * nr_entries);
+ if (!es->begin)
+ return -ENOMEM;
+
+ es->end = es->begin + nr_entries;
+ return 0;
+}
+
+static void space_exit(struct entry_space *es)
+{
+ vfree(es->begin);
+}
+
+static struct entry *__get_entry(struct entry_space *es, unsigned block)
+{
+ struct entry *e;
+
+ e = es->begin + block;
+ BUG_ON(e >= es->end);
+
+ return e;
+}
+
+static unsigned to_index(struct entry_space *es, struct entry *e)
+{
+ BUG_ON(e < es->begin || e >= es->end);
+ return e - es->begin;
+}
+
+static struct entry *to_entry(struct entry_space *es, unsigned block)
+{
+ if (block == INDEXER_NULL)
+ return NULL;
+
+ return __get_entry(es, block);
+}
+
+/*----------------------------------------------------------------*/
+
+struct ilist {
+ unsigned nr_elts; /* excluding sentinel entries */
+ unsigned head, tail;
+};
+
+static void l_init(struct ilist *l)
+{
+ l->nr_elts = 0;
+ l->head = l->tail = INDEXER_NULL;
+}
+
+static struct entry *l_head(struct entry_space *es, struct ilist *l)
+{
+ return to_entry(es, l->head);
+}
+
+static struct entry *l_tail(struct entry_space *es, struct ilist *l)
+{
+ return to_entry(es, l->tail);
+}
+
+static struct entry *l_next(struct entry_space *es, struct entry *e)
+{
+ return to_entry(es, e->next);
+}
+
+static struct entry *l_prev(struct entry_space *es, struct entry *e)
+{
+ return to_entry(es, e->prev);
+}
+
+static bool l_empty(struct ilist *l)
+{
+ return l->head == INDEXER_NULL;
+}
+
+static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+ struct entry *head = l_head(es, l);
+
+ e->next = l->head;
+ e->prev = INDEXER_NULL;
+
+ if (head)
+ head->prev = l->head = to_index(es, e);
+ else
+ l->head = l->tail = to_index(es, e);
+
+ if (!e->sentinel)
+ l->nr_elts++;
+}
+
+static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+ struct entry *tail = l_tail(es, l);
+
+ e->next = INDEXER_NULL;
+ e->prev = l->tail;
+
+ if (tail)
+ tail->next = l->tail = to_index(es, e);
+ else
+ l->head = l->tail = to_index(es, e);
+
+ if (!e->sentinel)
+ l->nr_elts++;
+}
+
+static void l_add_before(struct entry_space *es, struct ilist *l,
+ struct entry *old, struct entry *e)
+{
+ struct entry *prev = l_prev(es, old);
+
+ if (!prev)
+ l_add_head(es, l, e);
+
+ else {
+ e->prev = old->prev;
+ e->next = to_index(es, old);
+ prev->next = old->prev = to_index(es, e);
+
+ if (!e->sentinel)
+ l->nr_elts++;
+ }
+}
+
+static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+ struct entry *prev = l_prev(es, e);
+ struct entry *next = l_next(es, e);
+
+ if (prev)
+ prev->next = e->next;
+ else
+ l->head = e->next;
+
+ if (next)
+ next->prev = e->prev;
+ else
+ l->tail = e->prev;
+
+ if (!e->sentinel)
+ l->nr_elts--;
+}
+
+static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
+{
+ struct entry *e;
+
+ for (e = l_tail(es, l); e; e = l_prev(es, e))
+ if (!e->sentinel) {
+ l_del(es, l, e);
+ return e;
+ }
+
+ return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The stochastic-multi-queue is a set of lru lists stacked into levels.
+ * Entries are moved up levels when they are used, which loosely orders the
+ * most accessed entries in the top levels and least in the bottom. This
+ * structure is *much* better than a single lru list.
+ */
+#define MAX_LEVELS 64u
+
+struct queue {
+ struct entry_space *es;
+
+ unsigned nr_elts;
+ unsigned nr_levels;
+ struct ilist qs[MAX_LEVELS];
+
+ /*
+ * We maintain a count of the number of entries we would like in each
+ * level.
+ */
+ unsigned last_target_nr_elts;
+ unsigned nr_top_levels;
+ unsigned nr_in_top_levels;
+ unsigned target_count[MAX_LEVELS];
+};
+
+static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
+{
+ unsigned i;
+
+ q->es = es;
+ q->nr_elts = 0;
+ q->nr_levels = nr_levels;
+
+ for (i = 0; i < q->nr_levels; i++) {
+ l_init(q->qs + i);
+ q->target_count[i] = 0u;
+ }
+
+ q->last_target_nr_elts = 0u;
+ q->nr_top_levels = 0u;
+ q->nr_in_top_levels = 0u;
+}
+
+static unsigned q_size(struct queue *q)
+{
+ return q->nr_elts;
+}
+
+/*
+ * Insert an entry to the back of the given level.
+ */
+static void q_push(struct queue *q, struct entry *e)
+{
+ if (!e->sentinel)
+ q->nr_elts++;
+
+ l_add_tail(q->es, q->qs + e->level, e);
+}
+
+static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
+{
+ if (!e->sentinel)
+ q->nr_elts++;
+
+ l_add_before(q->es, q->qs + e->level, old, e);
+}
+
+static void q_del(struct queue *q, struct entry *e)
+{
+ l_del(q->es, q->qs + e->level, e);
+ if (!e->sentinel)
+ q->nr_elts--;
+}
+
+/*
+ * Return the oldest entry of the lowest populated level.
+ */
+static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
+{
+ unsigned level;
+ struct entry *e;
+
+ max_level = min(max_level, q->nr_levels);
+
+ for (level = 0; level < max_level; level++)
+ for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+ if (e->sentinel) {
+ if (can_cross_sentinel)
+ continue;
+ else
+ break;
+ }
+
+ return e;
+ }
+
+ return NULL;
+}
+
+static struct entry *q_pop(struct queue *q)
+{
+ struct entry *e = q_peek(q, q->nr_levels, true);
+
+ if (e)
+ q_del(q, e);
+
+ return e;
+}
+
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct entry *q_pop_old(struct queue *q, unsigned max_level)
+{
+ struct entry *e = q_peek(q, max_level, false);
+
+ if (e)
+ q_del(q, e);
+
+ return e;
+}
+
+/*
+ * This function assumes there is a non-sentinel entry to pop. It's only
+ * used by redistribute, so we know this is true. It also doesn't adjust
+ * the q->nr_elts count.
+ */
+static struct entry *__redist_pop_from(struct queue *q, unsigned level)
+{
+ struct entry *e;
+
+ for (; level < q->nr_levels; level++)
+ for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
+ if (!e->sentinel) {
+ l_del(q->es, q->qs + e->level, e);
+ return e;
+ }
+
+ return NULL;
+}
+
+static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
+{
+ unsigned level, nr_levels, entries_per_level, remainder;
+
+ BUG_ON(lbegin > lend);
+ BUG_ON(lend > q->nr_levels);
+ nr_levels = lend - lbegin;
+ entries_per_level = safe_div(nr_elts, nr_levels);
+ remainder = safe_mod(nr_elts, nr_levels);
+
+ for (level = lbegin; level < lend; level++)
+ q->target_count[level] =
+ (level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
+}
+
+/*
+ * Typically we have fewer elements in the top few levels which allows us
+ * to adjust the promote threshold nicely.
+ */
+static void q_set_targets(struct queue *q)
+{
+ if (q->last_target_nr_elts == q->nr_elts)
+ return;
+
+ q->last_target_nr_elts = q->nr_elts;
+
+ if (q->nr_top_levels > q->nr_levels)
+ q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
+
+ else {
+ q_set_targets_subrange_(q, q->nr_in_top_levels,
+ q->nr_levels - q->nr_top_levels, q->nr_levels);
+
+ if (q->nr_in_top_levels < q->nr_elts)
+ q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
+ 0, q->nr_levels - q->nr_top_levels);
+ else
+ q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
+ }
+}
+
+static void q_redistribute(struct queue *q)
+{
+ unsigned target, level;
+ struct ilist *l, *l_above;
+ struct entry *e;
+
+ q_set_targets(q);
+
+ for (level = 0u; level < q->nr_levels - 1u; level++) {
+ l = q->qs + level;
+ target = q->target_count[level];
+
+ /*
+ * Pull down some entries from the level above.
+ */
+ while (l->nr_elts < target) {
+ e = __redist_pop_from(q, level + 1u);
+ if (!e) {
+ /* bug in nr_elts */
+ break;
+ }
+
+ e->level = level;
+ l_add_tail(q->es, l, e);
+ }
+
+ /*
+ * Push some entries up.
+ */
+ l_above = q->qs + level + 1u;
+ while (l->nr_elts > target) {
+ e = l_pop_tail(q->es, l);
+
+ if (!e)
+ /* bug in nr_elts */
+ break;
+
+ e->level = level + 1u;
+ l_add_head(q->es, l_above, e);
+ }
+ }
+}
+
+static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
+{
+ struct entry *de;
+ unsigned new_level;
+
+ q_del(q, e);
+
+ if (extra_levels && (e->level < q->nr_levels - 1u)) {
+ new_level = min(q->nr_levels - 1u, e->level + extra_levels);
+ for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
+ if (de->sentinel)
+ continue;
+
+ q_del(q, de);
+ de->level = e->level;
+
+ if (dest)
+ q_push_before(q, dest, de);
+ else
+ q_push(q, de);
+ break;
+ }
+
+ e->level = new_level;
+ }
+
+ q_push(q, e);
+}
+
+static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
+{
+ q_requeue_before(q, NULL, e, extra_levels);
+}
+
+/*----------------------------------------------------------------*/
+
+#define FP_SHIFT 8
+#define SIXTEENTH (1u << (FP_SHIFT - 4u))
+#define EIGHTH (1u << (FP_SHIFT - 3u))
+
+struct stats {
+ unsigned hit_threshold;
+ unsigned hits;
+ unsigned misses;
+};
+
+enum performance {
+ Q_POOR,
+ Q_FAIR,
+ Q_WELL
+};
+
+static void stats_init(struct stats *s, unsigned nr_levels)
+{
+ s->hit_threshold = (nr_levels * 3u) / 4u;
+ s->hits = 0u;
+ s->misses = 0u;
+}
+
+static void stats_reset(struct stats *s)
+{
+ s->hits = s->misses = 0u;
+}
+
+static void stats_level_accessed(struct stats *s, unsigned level)
+{
+ if (level >= s->hit_threshold)
+ s->hits++;
+ else
+ s->misses++;
+}
+
+static void stats_miss(struct stats *s)
+{
+ s->misses++;
+}
+
+/*
+ * There are times when we don't have any confidence in the hotspot queue.
+ * Such as when a fresh cache is created and the blocks have been spread
+ * out across the levels, or if an io load changes. We detect this by
+ * seeing how often a lookup is in the top levels of the hotspot queue.
+ */
+static enum performance stats_assess(struct stats *s)
+{
+ unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
+
+ if (confidence < SIXTEENTH)
+ return Q_POOR;
+
+ else if (confidence < EIGHTH)
+ return Q_FAIR;
+
+ else
+ return Q_WELL;
+}
+
+/*----------------------------------------------------------------*/
+
+struct hash_table {
+ struct entry_space *es;
+ unsigned long long hash_bits;
+ unsigned *buckets;
+};
+
+/*
+ * All cache entries are stored in a chained hash table. To save space we
+ * use indexing again, and only store indexes to the next entry.
+ */
+static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
+{
+ unsigned i, nr_buckets;
+
+ ht->es = es;
+ nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
+ ht->hash_bits = ffs(nr_buckets) - 1;
+
+ ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
+ if (!ht->buckets)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_buckets; i++)
+ ht->buckets[i] = INDEXER_NULL;
+
+ return 0;
+}
+
+static void h_exit(struct hash_table *ht)
+{
+ vfree(ht->buckets);
+}
+
+static struct entry *h_head(struct hash_table *ht, unsigned bucket)
+{
+ return to_entry(ht->es, ht->buckets[bucket]);
+}
+
+static struct entry *h_next(struct hash_table *ht, struct entry *e)
+{
+ return to_entry(ht->es, e->hash_next);
+}
+
+static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
+{
+ e->hash_next = ht->buckets[bucket];
+ ht->buckets[bucket] = to_index(ht->es, e);
+}
+
+static void h_insert(struct hash_table *ht, struct entry *e)
+{
+ unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+ __h_insert(ht, h, e);
+}
+
+static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
+ struct entry **prev)
+{
+ struct entry *e;
+
+ *prev = NULL;
+ for (e = h_head(ht, h); e; e = h_next(ht, e)) {
+ if (e->oblock == oblock)
+ return e;
+
+ *prev = e;
+ }
+
+ return NULL;
+}
+
+static void __h_unlink(struct hash_table *ht, unsigned h,
+ struct entry *e, struct entry *prev)
+{
+ if (prev)
+ prev->hash_next = e->hash_next;
+ else
+ ht->buckets[h] = e->hash_next;
+}
+
+/*
+ * Also moves each entry to the front of the bucket.
+ */
+static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
+{
+ struct entry *e, *prev;
+ unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
+
+ e = __h_lookup(ht, h, oblock, &prev);
+ if (e && prev) {
+ /*
+ * Move to the front because this entry is likely
+ * to be hit again.
+ */
+ __h_unlink(ht, h, e, prev);
+ __h_insert(ht, h, e);
+ }
+
+ return e;
+}
+
+static void h_remove(struct hash_table *ht, struct entry *e)
+{
+ unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+ struct entry *prev;
+
+ /*
+ * The down side of using a singly linked list is we have to
+ * iterate the bucket to remove an item.
+ */
+ e = __h_lookup(ht, h, e->oblock, &prev);
+ if (e)
+ __h_unlink(ht, h, e, prev);
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry_alloc {
+ struct entry_space *es;
+ unsigned begin;
+
+ unsigned nr_allocated;
+ struct ilist free;
+};
+
+static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
+ unsigned begin, unsigned end)
+{
+ unsigned i;
+
+ ea->es = es;
+ ea->nr_allocated = 0u;
+ ea->begin = begin;
+
+ l_init(&ea->free);
+ for (i = begin; i != end; i++)
+ l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
+}
+
+static void init_entry(struct entry *e)
+{
+ /*
+ * We can't memset because that would clear the hotspot and
+ * sentinel bits which remain constant.
+ */
+ e->hash_next = INDEXER_NULL;
+ e->next = INDEXER_NULL;
+ e->prev = INDEXER_NULL;
+ e->level = 0u;
+ e->allocated = true;
+}
+
+static struct entry *alloc_entry(struct entry_alloc *ea)
+{
+ struct entry *e;
+
+ if (l_empty(&ea->free))
+ return NULL;
+
+ e = l_pop_tail(ea->es, &ea->free);
+ init_entry(e);
+ ea->nr_allocated++;
+
+ return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
+{
+ struct entry *e = __get_entry(ea->es, ea->begin + i);
+
+ BUG_ON(e->allocated);
+
+ l_del(ea->es, &ea->free, e);
+ init_entry(e);
+ ea->nr_allocated++;
+
+ return e;
+}
+
+static void free_entry(struct entry_alloc *ea, struct entry *e)
+{
+ BUG_ON(!ea->nr_allocated);
+ BUG_ON(!e->allocated);
+
+ ea->nr_allocated--;
+ e->allocated = false;
+ l_add_tail(ea->es, &ea->free, e);
+}
+
+static bool allocator_empty(struct entry_alloc *ea)
+{
+ return l_empty(&ea->free);
+}
+
+static unsigned get_index(struct entry_alloc *ea, struct entry *e)
+{
+ return to_index(ea->es, e) - ea->begin;
+}
+
+static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
+{
+ return __get_entry(ea->es, ea->begin + index);
+}
+
+/*----------------------------------------------------------------*/
+
+#define NR_HOTSPOT_LEVELS 64u
+#define NR_CACHE_LEVELS 64u
+
+#define WRITEBACK_PERIOD (10 * HZ)
+#define DEMOTE_PERIOD (60 * HZ)
+
+#define HOTSPOT_UPDATE_PERIOD (HZ)
+#define CACHE_UPDATE_PERIOD (10u * HZ)
+
+struct smq_policy {
+ struct dm_cache_policy policy;
+
+ /* protects everything */
+ struct mutex lock;
+ dm_cblock_t cache_size;
+ sector_t cache_block_size;
+
+ sector_t hotspot_block_size;
+ unsigned nr_hotspot_blocks;
+ unsigned cache_blocks_per_hotspot_block;
+ unsigned hotspot_level_jump;
+
+ struct entry_space es;
+ struct entry_alloc writeback_sentinel_alloc;
+ struct entry_alloc demote_sentinel_alloc;
+ struct entry_alloc hotspot_alloc;
+ struct entry_alloc cache_alloc;
+
+ unsigned long *hotspot_hit_bits;
+ unsigned long *cache_hit_bits;
+
+ /*
+ * We maintain three queues of entries. The cache proper,
+ * consisting of a clean and dirty queue, containing the currently
+ * active mappings. The hotspot queue uses a larger block size to
+ * track blocks that are being hit frequently and potential
+ * candidates for promotion to the cache.
+ */
+ struct queue hotspot;
+ struct queue clean;
+ struct queue dirty;
+
+ struct stats hotspot_stats;
+ struct stats cache_stats;
+
+ /*
+ * Keeps track of time, incremented by the core. We use this to
+ * avoid attributing multiple hits within the same tick.
+ *
+ * Access to tick_protected should be done with the spin lock held.
+ * It's copied to tick at the start of the map function (within the
+ * mutex).
+ */
+ spinlock_t tick_lock;
+ unsigned tick_protected;
+ unsigned tick;
+
+ /*
+ * The hash tables allows us to quickly find an entry by origin
+ * block.
+ */
+ struct hash_table table;
+ struct hash_table hotspot_table;
+
+ bool current_writeback_sentinels;
+ unsigned long next_writeback_period;
+
+ bool current_demote_sentinels;
+ unsigned long next_demote_period;
+
+ unsigned write_promote_level;
+ unsigned read_promote_level;
+
+ unsigned long next_hotspot_period;
+ unsigned long next_cache_period;
+};
+
+/*----------------------------------------------------------------*/
+
+static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
+{
+ return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
+}
+
+static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
+{
+ return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
+}
+
+static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
+{
+ return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
+}
+
+static void __update_writeback_sentinels(struct smq_policy *mq)
+{
+ unsigned level;
+ struct queue *q = &mq->dirty;
+ struct entry *sentinel;
+
+ for (level = 0; level < q->nr_levels; level++) {
+ sentinel = writeback_sentinel(mq, level);
+ q_del(q, sentinel);
+ q_push(q, sentinel);
+ }
+}
+
+static void __update_demote_sentinels(struct smq_policy *mq)
+{
+ unsigned level;
+ struct queue *q = &mq->clean;
+ struct entry *sentinel;
+
+ for (level = 0; level < q->nr_levels; level++) {
+ sentinel = demote_sentinel(mq, level);
+ q_del(q, sentinel);
+ q_push(q, sentinel);
+ }
+}
+
+static void update_sentinels(struct smq_policy *mq)
+{
+ if (time_after(jiffies, mq->next_writeback_period)) {
+ __update_writeback_sentinels(mq);
+ mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+ mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+ }
+
+ if (time_after(jiffies, mq->next_demote_period)) {
+ __update_demote_sentinels(mq);
+ mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+ mq->current_demote_sentinels = !mq->current_demote_sentinels;
+ }
+}
+
+static void __sentinels_init(struct smq_policy *mq)
+{
+ unsigned level;
+ struct entry *sentinel;
+
+ for (level = 0; level < NR_CACHE_LEVELS; level++) {
+ sentinel = writeback_sentinel(mq, level);
+ sentinel->level = level;
+ q_push(&mq->dirty, sentinel);
+
+ sentinel = demote_sentinel(mq, level);
+ sentinel->level = level;
+ q_push(&mq->clean, sentinel);
+ }
+}
+
+static void sentinels_init(struct smq_policy *mq)
+{
+ mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+ mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+
+ mq->current_writeback_sentinels = false;
+ mq->current_demote_sentinels = false;
+ __sentinels_init(mq);
+
+ mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+ mq->current_demote_sentinels = !mq->current_demote_sentinels;
+ __sentinels_init(mq);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These methods tie together the dirty queue, clean queue and hash table.
+ */
+static void push_new(struct smq_policy *mq, struct entry *e)
+{
+ struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
+ h_insert(&mq->table, e);
+ q_push(q, e);
+}
+
+static void push(struct smq_policy *mq, struct entry *e)
+{
+ struct entry *sentinel;
+
+ h_insert(&mq->table, e);
+
+ /*
+ * Punch this into the queue just in front of the sentinel, to
+ * ensure it's cleaned straight away.
+ */
+ if (e->dirty) {
+ sentinel = writeback_sentinel(mq, e->level);
+ q_push_before(&mq->dirty, sentinel, e);
+ } else {
+ sentinel = demote_sentinel(mq, e->level);
+ q_push_before(&mq->clean, sentinel, e);
+ }
+}
+
+/*
+ * Removes an entry from cache. Removes from the hash table.
+ */
+static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
+{
+ q_del(q, e);
+ h_remove(&mq->table, e);
+}
+
+static void del(struct smq_policy *mq, struct entry *e)
+{
+ __del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
+}
+
+static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
+{
+ struct entry *e = q_pop_old(q, max_level);
+ if (e)
+ h_remove(&mq->table, e);
+ return e;
+}
+
+static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
+{
+ return to_cblock(get_index(&mq->cache_alloc, e));
+}
+
+static void requeue(struct smq_policy *mq, struct entry *e)
+{
+ struct entry *sentinel;
+
+ if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
+ if (e->dirty) {
+ sentinel = writeback_sentinel(mq, e->level);
+ q_requeue_before(&mq->dirty, sentinel, e, 1u);
+ } else {
+ sentinel = demote_sentinel(mq, e->level);
+ q_requeue_before(&mq->clean, sentinel, e, 1u);
+ }
+ }
+}
+
+static unsigned default_promote_level(struct smq_policy *mq)
+{
+ /*
+ * The promote level depends on the current performance of the
+ * cache.
+ *
+ * If the cache is performing badly, then we can't afford
+ * to promote much without causing performance to drop below that
+ * of the origin device.
+ *
+ * If the cache is performing well, then we don't need to promote
+ * much. If it isn't broken, don't fix it.
+ *
+ * If the cache is middling then we promote more.
+ *
+ * This scheme reminds me of a graph of entropy vs probability of a
+ * binary variable.
+ */
+ static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
+
+ unsigned hits = mq->cache_stats.hits;
+ unsigned misses = mq->cache_stats.misses;
+ unsigned index = safe_div(hits << 4u, hits + misses);
+ return table[index];
+}
+
+static void update_promote_levels(struct smq_policy *mq)
+{
+ /*
+ * If there are unused cache entries then we want to be really
+ * eager to promote.
+ */
+ unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
+ default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
+
+ /*
+ * If the hotspot queue is performing badly then we have little
+ * confidence that we know which blocks to promote. So we cut down
+ * the amount of promotions.
+ */
+ switch (stats_assess(&mq->hotspot_stats)) {
+ case Q_POOR:
+ threshold_level /= 4u;
+ break;
+
+ case Q_FAIR:
+ threshold_level /= 2u;
+ break;
+
+ case Q_WELL:
+ break;
+ }
+
+ mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
+ mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
+}
+
+/*
+ * If the hotspot queue is performing badly, then we try and move entries
+ * around more quickly.
+ */
+static void update_level_jump(struct smq_policy *mq)
+{
+ switch (stats_assess(&mq->hotspot_stats)) {
+ case Q_POOR:
+ mq->hotspot_level_jump = 4u;
+ break;
+
+ case Q_FAIR:
+ mq->hotspot_level_jump = 2u;
+ break;
+
+ case Q_WELL:
+ mq->hotspot_level_jump = 1u;
+ break;
+ }
+}
+
+static void end_hotspot_period(struct smq_policy *mq)
+{
+ clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+ update_promote_levels(mq);
+
+ if (time_after(jiffies, mq->next_hotspot_period)) {
+ update_level_jump(mq);
+ q_redistribute(&mq->hotspot);
+ stats_reset(&mq->hotspot_stats);
+ mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
+ }
+}
+
+static void end_cache_period(struct smq_policy *mq)
+{
+ if (time_after(jiffies, mq->next_cache_period)) {
+ clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+
+ q_redistribute(&mq->dirty);
+ q_redistribute(&mq->clean);
+ stats_reset(&mq->cache_stats);
+
+ mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
+ }
+}
+
+static int demote_cblock(struct smq_policy *mq,
+ struct policy_locker *locker,
+ dm_oblock_t *oblock)
+{
+ struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
+ if (!demoted)
+ /*
+ * We could get a block from mq->dirty, but that
+ * would add extra latency to the triggering bio as it
+ * waits for the writeback. Better to not promote this
+ * time and hope there's a clean block next time this block
+ * is hit.
+ */
+ return -ENOSPC;
+
+ if (locker->fn(locker, demoted->oblock))
+ /*
+ * We couldn't lock this block.
+ */
+ return -EBUSY;
+
+ del(mq, demoted);
+ *oblock = demoted->oblock;
+ free_entry(&mq->cache_alloc, demoted);
+
+ return 0;
+}
+
+enum promote_result {
+ PROMOTE_NOT,
+ PROMOTE_TEMPORARY,
+ PROMOTE_PERMANENT
+};
+
+/*
+ * Converts a boolean into a promote result.
+ */
+static enum promote_result maybe_promote(bool promote)
+{
+ return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
+}
+
+static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
+ bool fast_promote)
+{
+ if (bio_data_dir(bio) == WRITE) {
+ if (!allocator_empty(&mq->cache_alloc) && fast_promote)
+ return PROMOTE_TEMPORARY;
+
+ else
+ return maybe_promote(hs_e->level >= mq->write_promote_level);
+ } else
+ return maybe_promote(hs_e->level >= mq->read_promote_level);
+}
+
+static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
+ struct policy_locker *locker,
+ struct policy_result *result, enum promote_result pr)
+{
+ int r;
+ struct entry *e;
+
+ if (allocator_empty(&mq->cache_alloc)) {
+ result->op = POLICY_REPLACE;
+ r = demote_cblock(mq, locker, &result->old_oblock);
+ if (r) {
+ result->op = POLICY_MISS;
+ return;
+ }
+
+ } else
+ result->op = POLICY_NEW;
+
+ e = alloc_entry(&mq->cache_alloc);
+ BUG_ON(!e);
+ e->oblock = oblock;
+
+ if (pr == PROMOTE_TEMPORARY)
+ push(mq, e);
+ else
+ push_new(mq, e);
+
+ result->cblock = infer_cblock(mq, e);
+}
+
+static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
+{
+ sector_t r = from_oblock(b);
+ (void) sector_div(r, mq->cache_blocks_per_hotspot_block);
+ return to_oblock(r);
+}
+
+static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
+{
+ unsigned hi;
+ dm_oblock_t hb = to_hblock(mq, b);
+ struct entry *e = h_lookup(&mq->hotspot_table, hb);
+
+ if (e) {
+ stats_level_accessed(&mq->hotspot_stats, e->level);
+
+ hi = get_index(&mq->hotspot_alloc, e);
+ q_requeue(&mq->hotspot, e,
+ test_and_set_bit(hi, mq->hotspot_hit_bits) ?
+ 0u : mq->hotspot_level_jump);
+
+ } else {
+ stats_miss(&mq->hotspot_stats);
+
+ e = alloc_entry(&mq->hotspot_alloc);
+ if (!e) {
+ e = q_pop(&mq->hotspot);
+ if (e) {
+ h_remove(&mq->hotspot_table, e);
+ hi = get_index(&mq->hotspot_alloc, e);
+ clear_bit(hi, mq->hotspot_hit_bits);
+ }
+
+ }
+
+ if (e) {
+ e->oblock = hb;
+ q_push(&mq->hotspot, e);
+ h_insert(&mq->hotspot_table, e);
+ }
+ }
+
+ return e;
+}
+
+/*
+ * Looks the oblock up in the hash table, then decides whether to put in
+ * pre_cache, or cache etc.
+ */
+static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
+ bool can_migrate, bool fast_promote,
+ struct policy_locker *locker, struct policy_result *result)
+{
+ struct entry *e, *hs_e;
+ enum promote_result pr;
+
+ hs_e = update_hotspot_queue(mq, oblock, bio);
+
+ e = h_lookup(&mq->table, oblock);
+ if (e) {
+ stats_level_accessed(&mq->cache_stats, e->level);
+
+ requeue(mq, e);
+ result->op = POLICY_HIT;
+ result->cblock = infer_cblock(mq, e);
+
+ } else {
+ stats_miss(&mq->cache_stats);
+
+ pr = should_promote(mq, hs_e, bio, fast_promote);
+ if (pr == PROMOTE_NOT)
+ result->op = POLICY_MISS;
+
+ else {
+ if (!can_migrate) {
+ result->op = POLICY_MISS;
+ return -EWOULDBLOCK;
+ }
+
+ insert_in_cache(mq, oblock, locker, result, pr);
+ }
+ }
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Public interface, via the policy struct. See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
+{
+ return container_of(p, struct smq_policy, policy);
+}
+
+static void smq_destroy(struct dm_cache_policy *p)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ h_exit(&mq->hotspot_table);
+ h_exit(&mq->table);
+ free_bitset(mq->hotspot_hit_bits);
+ free_bitset(mq->cache_hit_bits);
+ space_exit(&mq->es);
+ kfree(mq);
+}
+
+static void copy_tick(struct smq_policy *mq)
+{
+ unsigned long flags, tick;
+
+ spin_lock_irqsave(&mq->tick_lock, flags);
+ tick = mq->tick_protected;
+ if (tick != mq->tick) {
+ update_sentinels(mq);
+ end_hotspot_period(mq);
+ end_cache_period(mq);
+ mq->tick = tick;
+ }
+ spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static bool maybe_lock(struct smq_policy *mq, bool can_block)
+{
+ if (can_block) {
+ mutex_lock(&mq->lock);
+ return true;
+ } else
+ return mutex_trylock(&mq->lock);
+}
+
+static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool fast_promote,
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result)
+{
+ int r;
+ struct smq_policy *mq = to_smq_policy(p);
+
+ result->op = POLICY_MISS;
+
+ if (!maybe_lock(mq, can_block))
+ return -EWOULDBLOCK;
+
+ copy_tick(mq);
+ r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+ int r;
+ struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e;
+
+ if (!mutex_trylock(&mq->lock))
+ return -EWOULDBLOCK;
+
+ e = h_lookup(&mq->table, oblock);
+ if (e) {
+ *cblock = infer_cblock(mq, e);
+ r = 0;
+ } else
+ r = -ENOENT;
+
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
+{
+ struct entry *e;
+
+ e = h_lookup(&mq->table, oblock);
+ BUG_ON(!e);
+
+ del(mq, e);
+ e->dirty = set;
+ push(mq, e);
+}
+
+static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __smq_set_clear_dirty(mq, oblock, true);
+ mutex_unlock(&mq->lock);
+}
+
+static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __smq_set_clear_dirty(mq, oblock, false);
+ mutex_unlock(&mq->lock);
+}
+
+static int smq_load_mapping(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ uint32_t hint, bool hint_valid)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e;
+
+ e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
+ e->oblock = oblock;
+ e->dirty = false; /* this gets corrected in a minute */
+ e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
+ push(mq, e);
+
+ return 0;
+}
+
+static int smq_save_hints(struct smq_policy *mq, struct queue *q,
+ policy_walk_fn fn, void *context)
+{
+ int r;
+ unsigned level;
+ struct entry *e;
+
+ for (level = 0; level < q->nr_levels; level++)
+ for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+ if (!e->sentinel) {
+ r = fn(context, infer_cblock(mq, e),
+ e->oblock, e->level);
+ if (r)
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+ void *context)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+ int r = 0;
+
+ mutex_lock(&mq->lock);
+
+ r = smq_save_hints(mq, &mq->clean, fn, context);
+ if (!r)
+ r = smq_save_hints(mq, &mq->dirty, fn, context);
+
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
+{
+ struct entry *e;
+
+ e = h_lookup(&mq->table, oblock);
+ BUG_ON(!e);
+
+ del(mq, e);
+ free_entry(&mq->cache_alloc, e);
+}
+
+static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __remove_mapping(mq, oblock);
+ mutex_unlock(&mq->lock);
+}
+
+static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
+{
+ struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
+
+ if (!e || !e->allocated)
+ return -ENODATA;
+
+ del(mq, e);
+ free_entry(&mq->cache_alloc, e);
+
+ return 0;
+}
+
+static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+ int r;
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __remove_cblock(mq, cblock);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+
+#define CLEAN_TARGET_CRITICAL 5u /* percent */
+
+static bool clean_target_met(struct smq_policy *mq, bool critical)
+{
+ if (critical) {
+ /*
+ * Cache entries may not be populated. So we're cannot rely on the
+ * size of the clean queue.
+ */
+ unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
+ unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
+
+ return nr_clean >= target;
+ } else
+ return !q_size(&mq->dirty);
+}
+
+static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
+ dm_cblock_t *cblock, bool critical_only)
+{
+ struct entry *e = NULL;
+ bool target_met = clean_target_met(mq, critical_only);
+
+ if (critical_only)
+ /*
+ * Always try and keep the bottom level clean.
+ */
+ e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
+
+ else
+ e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
+
+ if (!e)
+ return -ENODATA;
+
+ *oblock = e->oblock;
+ *cblock = infer_cblock(mq, e);
+ e->dirty = false;
+ push_new(mq, e);
+
+ return 0;
+}
+
+static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+ dm_cblock_t *cblock, bool critical_only)
+{
+ int r;
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __smq_writeback_work(mq, oblock, cblock, critical_only);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __force_mapping(struct smq_policy *mq,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct entry *e = h_lookup(&mq->table, current_oblock);
+
+ if (e) {
+ del(mq, e);
+ e->oblock = new_oblock;
+ e->dirty = true;
+ push(mq, e);
+ }
+}
+
+static void smq_force_mapping(struct dm_cache_policy *p,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __force_mapping(mq, current_oblock, new_oblock);
+ mutex_unlock(&mq->lock);
+}
+
+static dm_cblock_t smq_residency(struct dm_cache_policy *p)
+{
+ dm_cblock_t r;
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = to_cblock(mq->cache_alloc.nr_allocated);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void smq_tick(struct dm_cache_policy *p, bool can_block)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mq->tick_lock, flags);
+ mq->tick_protected++;
+ spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+ if (can_block) {
+ mutex_lock(&mq->lock);
+ copy_tick(mq);
+ mutex_unlock(&mq->lock);
+ }
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct smq_policy *mq)
+{
+ mq->policy.destroy = smq_destroy;
+ mq->policy.map = smq_map;
+ mq->policy.lookup = smq_lookup;
+ mq->policy.set_dirty = smq_set_dirty;
+ mq->policy.clear_dirty = smq_clear_dirty;
+ mq->policy.load_mapping = smq_load_mapping;
+ mq->policy.walk_mappings = smq_walk_mappings;
+ mq->policy.remove_mapping = smq_remove_mapping;
+ mq->policy.remove_cblock = smq_remove_cblock;
+ mq->policy.writeback_work = smq_writeback_work;
+ mq->policy.force_mapping = smq_force_mapping;
+ mq->policy.residency = smq_residency;
+ mq->policy.tick = smq_tick;
+}
+
+static bool too_many_hotspot_blocks(sector_t origin_size,
+ sector_t hotspot_block_size,
+ unsigned nr_hotspot_blocks)
+{
+ return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
+}
+
+static void calc_hotspot_params(sector_t origin_size,
+ sector_t cache_block_size,
+ unsigned nr_cache_blocks,
+ sector_t *hotspot_block_size,
+ unsigned *nr_hotspot_blocks)
+{
+ *hotspot_block_size = cache_block_size * 16u;
+ *nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
+
+ while ((*hotspot_block_size > cache_block_size) &&
+ too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
+ *hotspot_block_size /= 2u;
+}
+
+static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ unsigned i;
+ unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
+ unsigned total_sentinels = 2u * nr_sentinels_per_queue;
+ struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
+
+ if (!mq)
+ return NULL;
+
+ init_policy_functions(mq);
+ mq->cache_size = cache_size;
+ mq->cache_block_size = cache_block_size;
+
+ calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
+ &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
+
+ mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
+ mq->hotspot_level_jump = 1u;
+ if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
+ DMERR("couldn't initialize entry space");
+ goto bad_pool_init;
+ }
+
+ init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
+ for (i = 0; i < nr_sentinels_per_queue; i++)
+ get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
+
+ init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
+ for (i = 0; i < nr_sentinels_per_queue; i++)
+ get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
+
+ init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
+ total_sentinels + mq->nr_hotspot_blocks);
+
+ init_allocator(&mq->cache_alloc, &mq->es,
+ total_sentinels + mq->nr_hotspot_blocks,
+ total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
+
+ mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
+ if (!mq->hotspot_hit_bits) {
+ DMERR("couldn't allocate hotspot hit bitset");
+ goto bad_hotspot_hit_bits;
+ }
+ clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+
+ if (from_cblock(cache_size)) {
+ mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
+ if (!mq->cache_hit_bits) {
+ DMERR("couldn't allocate cache hit bitset");
+ goto bad_cache_hit_bits;
+ }
+ clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+ } else
+ mq->cache_hit_bits = NULL;
+
+ mq->tick_protected = 0;
+ mq->tick = 0;
+ mutex_init(&mq->lock);
+ spin_lock_init(&mq->tick_lock);
+
+ q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
+ mq->hotspot.nr_top_levels = 8;
+ mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
+ from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
+
+ q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
+ q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
+
+ stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
+ stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
+
+ if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
+ goto bad_alloc_table;
+
+ if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
+ goto bad_alloc_hotspot_table;
+
+ sentinels_init(mq);
+ mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
+
+ mq->next_hotspot_period = jiffies;
+ mq->next_cache_period = jiffies;
+
+ return &mq->policy;
+
+bad_alloc_hotspot_table:
+ h_exit(&mq->table);
+bad_alloc_table:
+ free_bitset(mq->cache_hit_bits);
+bad_cache_hit_bits:
+ free_bitset(mq->hotspot_hit_bits);
+bad_hotspot_hit_bits:
+ space_exit(&mq->es);
+bad_pool_init:
+ kfree(mq);
+
+ return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type smq_policy_type = {
+ .name = "smq",
+ .version = {1, 0, 0},
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = smq_create
+};
+
+static struct dm_cache_policy_type default_policy_type = {
+ .name = "default",
+ .version = {1, 4, 0},
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = smq_create,
+ .real = &smq_policy_type
+};
+
+static int __init smq_init(void)
+{
+ int r;
+
+ r = dm_cache_policy_register(&smq_policy_type);
+ if (r) {
+ DMERR("register failed %d", r);
+ return -ENOMEM;
+ }
+
+ r = dm_cache_policy_register(&default_policy_type);
+ if (r) {
+ DMERR("register failed (as default) %d", r);
+ dm_cache_policy_unregister(&smq_policy_type);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __exit smq_exit(void)
+{
+ dm_cache_policy_unregister(&smq_policy_type);
+ dm_cache_policy_unregister(&default_policy_type);
+}
+
+module_init(smq_init);
+module_exit(smq_exit);
+
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("smq cache policy");
+
+MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 5524e21e4..05db56eed 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -178,7 +178,9 @@ struct dm_cache_policy {
int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
/*
- * Provide a dirty block to be written back by the core target.
+ * Provide a dirty block to be written back by the core target. If
+ * critical_only is set then the policy should only provide work if
+ * it urgently needs it.
*
* Returns:
*
@@ -186,7 +188,8 @@ struct dm_cache_policy {
*
* -ENODATA: no dirty blocks available
*/
- int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+ int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock,
+ bool critical_only);
/*
* How full is the cache?
@@ -197,16 +200,16 @@ struct dm_cache_policy {
* Because of where we sit in the block layer, we can be asked to
* map a lot of little bios that are all in the same block (no
* queue merging has occurred). To stop the policy being fooled by
- * these the core target sends regular tick() calls to the policy.
+ * these, the core target sends regular tick() calls to the policy.
* The policy should only count an entry as hit once per tick.
*/
- void (*tick)(struct dm_cache_policy *p);
+ void (*tick)(struct dm_cache_policy *p, bool can_block);
/*
* Configuration.
*/
- int (*emit_config_values)(struct dm_cache_policy *p,
- char *result, unsigned maxlen);
+ int (*emit_config_values)(struct dm_cache_policy *p, char *result,
+ unsigned maxlen, ssize_t *sz_ptr);
int (*set_config_value)(struct dm_cache_policy *p,
const char *key, const char *value);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index e049becaa..1fe93cfea 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -25,44 +25,93 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
/*----------------------------------------------------------------*/
-/*
- * Glossary:
- *
- * oblock: index of an origin block
- * cblock: index of a cache block
- * promotion: movement of a block from origin to cache
- * demotion: movement of a block from cache to origin
- * migration: movement of a block between the origin and cache device,
- * either direction
- */
+#define IOT_RESOLUTION 4
-/*----------------------------------------------------------------*/
+struct io_tracker {
+ spinlock_t lock;
+
+ /*
+ * Sectors of in-flight IO.
+ */
+ sector_t in_flight;
+
+ /*
+ * The time, in jiffies, when this device became idle (if it is
+ * indeed idle).
+ */
+ unsigned long idle_time;
+ unsigned long last_update_time;
+};
+
+static void iot_init(struct io_tracker *iot)
+{
+ spin_lock_init(&iot->lock);
+ iot->in_flight = 0ul;
+ iot->idle_time = 0ul;
+ iot->last_update_time = jiffies;
+}
-static size_t bitset_size_in_bytes(unsigned nr_entries)
+static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
- return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+ if (iot->in_flight)
+ return false;
+
+ return time_after(jiffies, iot->idle_time + jifs);
}
-static unsigned long *alloc_bitset(unsigned nr_entries)
+static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
- size_t s = bitset_size_in_bytes(nr_entries);
- return vzalloc(s);
+ bool r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iot->lock, flags);
+ r = __iot_idle_for(iot, jifs);
+ spin_unlock_irqrestore(&iot->lock, flags);
+
+ return r;
+}
+
+static void iot_io_begin(struct io_tracker *iot, sector_t len)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iot->lock, flags);
+ iot->in_flight += len;
+ spin_unlock_irqrestore(&iot->lock, flags);
}
-static void clear_bitset(void *bitset, unsigned nr_entries)
+static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
- size_t s = bitset_size_in_bytes(nr_entries);
- memset(bitset, 0, s);
+ iot->in_flight -= len;
+ if (!iot->in_flight)
+ iot->idle_time = jiffies;
}
-static void free_bitset(unsigned long *bits)
+static void iot_io_end(struct io_tracker *iot, sector_t len)
{
- vfree(bits);
+ unsigned long flags;
+
+ spin_lock_irqsave(&iot->lock, flags);
+ __iot_io_end(iot, len);
+ spin_unlock_irqrestore(&iot->lock, flags);
}
/*----------------------------------------------------------------*/
/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ * either direction
+ */
+
+/*----------------------------------------------------------------*/
+
+/*
* There are a couple of places where we let a bio run, but want to do some
* work before calling its endio function. We do this by temporarily
* changing the endio fn.
@@ -86,12 +135,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
bio->bi_end_io = h->bi_end_io;
bio->bi_private = h->bi_private;
-
- /*
- * Must bump bi_remaining to allow bio to complete with
- * restored bi_end_io.
- */
- atomic_inc(&bio->bi_remaining);
}
/*----------------------------------------------------------------*/
@@ -107,12 +150,10 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
-/*
- * FIXME: the cache is read/write for the time being.
- */
enum cache_metadata_mode {
CM_WRITE, /* metadata may be changed */
CM_READ_ONLY, /* metadata may not be changed */
+ CM_FAIL
};
enum cache_io_mode {
@@ -214,6 +255,7 @@ struct cache {
int sectors_per_block_shift;
spinlock_t lock;
+ struct list_head deferred_cells;
struct bio_list deferred_bios;
struct bio_list deferred_flush_bios;
struct bio_list deferred_writethrough_bios;
@@ -288,6 +330,8 @@ struct cache {
*/
spinlock_t invalidation_lock;
struct list_head invalidation_requests;
+
+ struct io_tracker origin_tracker;
};
struct per_bio_data {
@@ -295,6 +339,7 @@ struct per_bio_data {
unsigned req_nr:2;
struct dm_deferred_entry *all_io_entry;
struct dm_hook_info hook_info;
+ sector_t len;
/*
* writethrough fields. These MUST remain at the end of this
@@ -338,6 +383,8 @@ struct prealloc {
struct dm_bio_prison_cell *cell2;
};
+static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+
static void wake_worker(struct cache *cache)
{
queue_work(cache->wq, &cache->worker);
@@ -371,10 +418,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
static void free_migration(struct dm_cache_migration *mg)
{
- if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
- wake_up(&mg->cache->migration_wait);
+ struct cache *cache = mg->cache;
+
+ if (atomic_dec_and_test(&cache->nr_allocated_migrations))
+ wake_up(&cache->migration_wait);
- mempool_free(mg, mg->cache->migration_pool);
+ mempool_free(mg, cache->migration_pool);
+ wake_worker(cache);
}
static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
@@ -649,6 +699,9 @@ static void save_stats(struct cache *cache)
{
struct dm_cache_statistics stats;
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return;
+
stats.read_hits = atomic_read(&cache->stats.read_hit);
stats.read_misses = atomic_read(&cache->stats.read_miss);
stats.write_hits = atomic_read(&cache->stats.write_hit);
@@ -701,6 +754,7 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
pb->tick = false;
pb->req_nr = dm_bio_get_target_bio_nr(bio);
pb->all_io_entry = NULL;
+ pb->len = 0;
return pb;
}
@@ -798,12 +852,43 @@ static void inc_ds(struct cache *cache, struct bio *bio,
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
}
+static bool accountable_bio(struct cache *cache, struct bio *bio)
+{
+ return ((bio->bi_bdev == cache->origin_dev->bdev) &&
+ !(bio->bi_rw & REQ_DISCARD));
+}
+
+static void accounted_begin(struct cache *cache, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ if (accountable_bio(cache, bio)) {
+ pb->len = bio_sectors(bio);
+ iot_io_begin(&cache->origin_tracker, pb->len);
+ }
+}
+
+static void accounted_complete(struct cache *cache, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ iot_io_end(&cache->origin_tracker, pb->len);
+}
+
+static void accounted_request(struct cache *cache, struct bio *bio)
+{
+ accounted_begin(cache, bio);
+ generic_make_request(bio);
+}
+
static void issue(struct cache *cache, struct bio *bio)
{
unsigned long flags;
if (!bio_triggers_commit(cache, bio)) {
- generic_make_request(bio);
+ accounted_request(cache, bio);
return;
}
@@ -876,6 +961,94 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
}
/*----------------------------------------------------------------
+ * Failure modes
+ *--------------------------------------------------------------*/
+static enum cache_metadata_mode get_cache_mode(struct cache *cache)
+{
+ return cache->features.mode;
+}
+
+static const char *cache_device_name(struct cache *cache)
+{
+ return dm_device_name(dm_table_get_md(cache->ti->table));
+}
+
+static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
+{
+ const char *descs[] = {
+ "write",
+ "read-only",
+ "fail"
+ };
+
+ dm_table_event(cache->ti->table);
+ DMINFO("%s: switching cache to %s mode",
+ cache_device_name(cache), descs[(int)mode]);
+}
+
+static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
+{
+ bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
+ enum cache_metadata_mode old_mode = get_cache_mode(cache);
+
+ if (new_mode == CM_WRITE && needs_check) {
+ DMERR("%s: unable to switch cache to write mode until repaired.",
+ cache_device_name(cache));
+ if (old_mode != new_mode)
+ new_mode = old_mode;
+ else
+ new_mode = CM_READ_ONLY;
+ }
+
+ /* Never move out of fail mode */
+ if (old_mode == CM_FAIL)
+ new_mode = CM_FAIL;
+
+ switch (new_mode) {
+ case CM_FAIL:
+ case CM_READ_ONLY:
+ dm_cache_metadata_set_read_only(cache->cmd);
+ break;
+
+ case CM_WRITE:
+ dm_cache_metadata_set_read_write(cache->cmd);
+ break;
+ }
+
+ cache->features.mode = new_mode;
+
+ if (new_mode != old_mode)
+ notify_mode_switch(cache, new_mode);
+}
+
+static void abort_transaction(struct cache *cache)
+{
+ const char *dev_name = cache_device_name(cache);
+
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return;
+
+ if (dm_cache_metadata_set_needs_check(cache->cmd)) {
+ DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+ set_cache_mode(cache, CM_FAIL);
+ }
+
+ DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+ if (dm_cache_metadata_abort(cache->cmd)) {
+ DMERR("%s: failed to abort metadata transaction", dev_name);
+ set_cache_mode(cache, CM_FAIL);
+ }
+}
+
+static void metadata_operation_failed(struct cache *cache, const char *op, int r)
+{
+ DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
+ cache_device_name(cache), op, r);
+ abort_transaction(cache);
+ set_cache_mode(cache, CM_READ_ONLY);
+}
+
+/*----------------------------------------------------------------
* Migration processing
*
* Migration covers moving data from the origin device to the cache, or
@@ -891,26 +1064,63 @@ static void dec_io_migrations(struct cache *cache)
atomic_dec(&cache->nr_io_migrations);
}
-static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
- bool holder)
+static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
+ bool holder, struct bio_list *bios)
{
(holder ? dm_cell_release : dm_cell_release_no_holder)
- (cache->prison, cell, &cache->deferred_bios);
+ (cache->prison, cell, bios);
free_prison_cell(cache, cell);
}
-static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
- bool holder)
+static bool discard_or_flush(struct bio *bio)
+{
+ return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+ if (discard_or_flush(cell->holder))
+ /*
+ * We have to handle these bios
+ * individually.
+ */
+ __cell_release(cache, cell, true, &cache->deferred_bios);
+
+ else
+ list_add_tail(&cell->user_list, &cache->deferred_cells);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
{
unsigned long flags;
+ if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
+ /*
+ * There was no prisoner to promote to holder, the
+ * cell has been released.
+ */
+ free_prison_cell(cache, cell);
+ return;
+ }
+
spin_lock_irqsave(&cache->lock, flags);
- __cell_defer(cache, cell, holder);
+ __cell_defer(cache, cell);
spin_unlock_irqrestore(&cache->lock, flags);
wake_worker(cache);
}
+static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
+{
+ dm_cell_error(cache->prison, cell, err);
+ dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+}
+
static void free_io_migration(struct dm_cache_migration *mg)
{
dec_io_migrations(mg->cache);
@@ -920,21 +1130,22 @@ static void free_io_migration(struct dm_cache_migration *mg)
static void migration_failure(struct dm_cache_migration *mg)
{
struct cache *cache = mg->cache;
+ const char *dev_name = cache_device_name(cache);
if (mg->writeback) {
- DMWARN_LIMIT("writeback failed; couldn't copy block");
+ DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
set_dirty(cache, mg->old_oblock, mg->cblock);
cell_defer(cache, mg->old_ocell, false);
} else if (mg->demote) {
- DMWARN_LIMIT("demotion failed; couldn't copy block");
+ DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
if (mg->promote)
cell_defer(cache, mg->new_ocell, true);
} else {
- DMWARN_LIMIT("promotion failed; couldn't copy block");
+ DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
policy_remove_mapping(cache->policy, mg->new_oblock);
cell_defer(cache, mg->new_ocell, true);
}
@@ -944,6 +1155,7 @@ static void migration_failure(struct dm_cache_migration *mg)
static void migration_success_pre_commit(struct dm_cache_migration *mg)
{
+ int r;
unsigned long flags;
struct cache *cache = mg->cache;
@@ -954,8 +1166,11 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
return;
} else if (mg->demote) {
- if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
- DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+ r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
+ if (r) {
+ DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
policy_force_mapping(cache->policy, mg->new_oblock,
mg->old_oblock);
if (mg->promote)
@@ -964,8 +1179,11 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
return;
}
} else {
- if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
- DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+ r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
+ if (r) {
+ DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
policy_remove_mapping(cache->policy, mg->new_oblock);
free_io_migration(mg);
return;
@@ -984,7 +1202,8 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
struct cache *cache = mg->cache;
if (mg->writeback) {
- DMWARN("writeback unexpectedly triggered commit");
+ DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
+ cache_device_name(cache));
return;
} else if (mg->demote) {
@@ -1060,7 +1279,7 @@ static void issue_copy(struct dm_cache_migration *mg)
}
if (r < 0) {
- DMERR_LIMIT("issuing migration failed");
+ DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
migration_failure(mg);
}
}
@@ -1099,7 +1318,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
* No need to inc_ds() here, since the cell will be held for the
* duration of the io.
*/
- generic_make_request(bio);
+ accounted_request(mg->cache, bio);
}
static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
@@ -1447,6 +1666,107 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
/*----------------------------------------------------------------*/
+struct inc_detail {
+ struct cache *cache;
+ struct bio_list bios_for_issue;
+ struct bio_list unhandled_bios;
+ bool any_writes;
+};
+
+static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+{
+ struct bio *bio;
+ struct inc_detail *detail = context;
+ struct cache *cache = detail->cache;
+
+ inc_ds(cache, cell->holder, cell);
+ if (bio_data_dir(cell->holder) == WRITE)
+ detail->any_writes = true;
+
+ while ((bio = bio_list_pop(&cell->bios))) {
+ if (discard_or_flush(bio)) {
+ bio_list_add(&detail->unhandled_bios, bio);
+ continue;
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ detail->any_writes = true;
+
+ bio_list_add(&detail->bios_for_issue, bio);
+ inc_ds(cache, bio, cell);
+ }
+}
+
+// FIXME: refactor these two
+static void remap_cell_to_origin_clear_discard(struct cache *cache,
+ struct dm_bio_prison_cell *cell,
+ dm_oblock_t oblock, bool issue_holder)
+{
+ struct bio *bio;
+ unsigned long flags;
+ struct inc_detail detail;
+
+ detail.cache = cache;
+ bio_list_init(&detail.bios_for_issue);
+ bio_list_init(&detail.unhandled_bios);
+ detail.any_writes = false;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+ bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ remap_to_origin(cache, cell->holder);
+ if (issue_holder)
+ issue(cache, cell->holder);
+ else
+ accounted_begin(cache, cell->holder);
+
+ if (detail.any_writes)
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
+
+ while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+ remap_to_origin(cache, bio);
+ issue(cache, bio);
+ }
+}
+
+static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
+ dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+{
+ struct bio *bio;
+ unsigned long flags;
+ struct inc_detail detail;
+
+ detail.cache = cache;
+ bio_list_init(&detail.bios_for_issue);
+ bio_list_init(&detail.unhandled_bios);
+ detail.any_writes = false;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+ bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ remap_to_cache(cache, cell->holder, cblock);
+ if (issue_holder)
+ issue(cache, cell->holder);
+ else
+ accounted_begin(cache, cell->holder);
+
+ if (detail.any_writes) {
+ set_dirty(cache, oblock, cblock);
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
+ }
+
+ while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+ remap_to_cache(cache, bio, cblock);
+ issue(cache, bio);
+ }
+}
+
+/*----------------------------------------------------------------*/
+
struct old_oblock_lock {
struct policy_locker locker;
struct cache *cache;
@@ -1471,36 +1791,26 @@ static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
l->structs, &l->cell);
}
-static void process_bio(struct cache *cache, struct prealloc *structs,
- struct bio *bio)
+static void process_cell(struct cache *cache, struct prealloc *structs,
+ struct dm_bio_prison_cell *new_ocell)
{
int r;
bool release_cell = true;
+ struct bio *bio = new_ocell->holder;
dm_oblock_t block = get_bio_block(cache, bio);
- struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
struct policy_result lookup_result;
bool passthrough = passthrough_mode(&cache->features);
- bool discarded_block, can_migrate;
+ bool fast_promotion, can_migrate;
struct old_oblock_lock ool;
- /*
- * Check to see if that block is currently migrating.
- */
- cell_prealloc = prealloc_get_cell(structs);
- r = bio_detain(cache, block, bio, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- structs, &new_ocell);
- if (r > 0)
- return;
-
- discarded_block = is_discarded_oblock(cache, block);
- can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+ fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
+ can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
ool.locker.fn = cell_locker;
ool.cache = cache;
ool.structs = structs;
ool.cell = NULL;
- r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
+ r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
bio, &ool.locker, &lookup_result);
if (r == -EWOULDBLOCK)
@@ -1537,9 +1847,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
inc_and_issue(cache, bio, new_ocell);
- } else {
- remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
- inc_and_issue(cache, bio, new_ocell);
+ } else {
+ remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
+ release_cell = false;
}
}
@@ -1547,8 +1857,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
case POLICY_MISS:
inc_miss_counter(cache, bio);
- remap_to_origin_clear_discard(cache, bio, block);
- inc_and_issue(cache, bio, new_ocell);
+ remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
+ release_cell = false;
break;
case POLICY_NEW:
@@ -1567,7 +1877,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
break;
default:
- DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+ DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
+ cache_device_name(cache), __func__,
(unsigned) lookup_result.op);
bio_io_error(bio);
}
@@ -1576,10 +1887,48 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
cell_defer(cache, new_ocell, false);
}
+static void process_bio(struct cache *cache, struct prealloc *structs,
+ struct bio *bio)
+{
+ int r;
+ dm_oblock_t block = get_bio_block(cache, bio);
+ struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+
+ /*
+ * Check to see if that block is currently migrating.
+ */
+ cell_prealloc = prealloc_get_cell(structs);
+ r = bio_detain(cache, block, bio, cell_prealloc,
+ (cell_free_fn) prealloc_put_cell,
+ structs, &new_ocell);
+ if (r > 0)
+ return;
+
+ process_cell(cache, structs, new_ocell);
+}
+
static int need_commit_due_to_time(struct cache *cache)
{
- return !time_in_range(jiffies, cache->last_commit_jiffies,
- cache->last_commit_jiffies + COMMIT_PERIOD);
+ return jiffies < cache->last_commit_jiffies ||
+ jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+}
+
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ */
+static int commit(struct cache *cache, bool clean_shutdown)
+{
+ int r;
+
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return -EINVAL;
+
+ atomic_inc(&cache->stats.commit_count);
+ r = dm_cache_commit(cache->cmd, clean_shutdown);
+ if (r)
+ metadata_operation_failed(cache, "dm_cache_commit", r);
+
+ return r;
}
static int commit_if_needed(struct cache *cache)
@@ -1588,9 +1937,8 @@ static int commit_if_needed(struct cache *cache)
if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
dm_cache_changed_this_transaction(cache->cmd)) {
- atomic_inc(&cache->stats.commit_count);
+ r = commit(cache, false);
cache->commit_requested = false;
- r = dm_cache_commit(cache->cmd, false);
cache->last_commit_jiffies = jiffies;
}
@@ -1599,6 +1947,7 @@ static int commit_if_needed(struct cache *cache)
static void process_deferred_bios(struct cache *cache)
{
+ bool prealloc_used = false;
unsigned long flags;
struct bio_list bios;
struct bio *bio;
@@ -1618,6 +1967,7 @@ static void process_deferred_bios(struct cache *cache)
* this bio might require one, we pause until there are some
* prepared mappings to process.
*/
+ prealloc_used = true;
if (prealloc_data_structs(cache, &structs)) {
spin_lock_irqsave(&cache->lock, flags);
bio_list_merge(&cache->deferred_bios, &bios);
@@ -1635,7 +1985,45 @@ static void process_deferred_bios(struct cache *cache)
process_bio(cache, &structs, bio);
}
- prealloc_free_structs(cache, &structs);
+ if (prealloc_used)
+ prealloc_free_structs(cache, &structs);
+}
+
+static void process_deferred_cells(struct cache *cache)
+{
+ bool prealloc_used = false;
+ unsigned long flags;
+ struct dm_bio_prison_cell *cell, *tmp;
+ struct list_head cells;
+ struct prealloc structs;
+
+ memset(&structs, 0, sizeof(structs));
+
+ INIT_LIST_HEAD(&cells);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_splice_init(&cache->deferred_cells, &cells);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ list_for_each_entry_safe(cell, tmp, &cells, user_list) {
+ /*
+ * If we've got no free migration structs, and processing
+ * this bio might require one, we pause until there are some
+ * prepared mappings to process.
+ */
+ prealloc_used = true;
+ if (prealloc_data_structs(cache, &structs)) {
+ spin_lock_irqsave(&cache->lock, flags);
+ list_splice(&cells, &cache->deferred_cells);
+ spin_unlock_irqrestore(&cache->lock, flags);
+ break;
+ }
+
+ process_cell(cache, &structs, cell);
+ }
+
+ if (prealloc_used)
+ prealloc_free_structs(cache, &structs);
}
static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
@@ -1655,7 +2043,7 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
* These bios have already been through inc_ds()
*/
while ((bio = bio_list_pop(&bios)))
- submit_bios ? generic_make_request(bio) : bio_io_error(bio);
+ submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
}
static void process_deferred_writethrough_bios(struct cache *cache)
@@ -1675,29 +2063,27 @@ static void process_deferred_writethrough_bios(struct cache *cache)
* These bios have already been through inc_ds()
*/
while ((bio = bio_list_pop(&bios)))
- generic_make_request(bio);
+ accounted_request(cache, bio);
}
static void writeback_some_dirty_blocks(struct cache *cache)
{
- int r = 0;
+ bool prealloc_used = false;
dm_oblock_t oblock;
dm_cblock_t cblock;
struct prealloc structs;
struct dm_bio_prison_cell *old_ocell;
+ bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
memset(&structs, 0, sizeof(structs));
while (spare_migration_bandwidth(cache)) {
- if (prealloc_data_structs(cache, &structs))
- break;
-
- r = policy_writeback_work(cache->policy, &oblock, &cblock);
- if (r)
- break;
+ if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
+ break; /* no work to do */
- r = get_cell(cache, oblock, &structs, &old_ocell);
- if (r) {
+ prealloc_used = true;
+ if (prealloc_data_structs(cache, &structs) ||
+ get_cell(cache, oblock, &structs, &old_ocell)) {
policy_set_dirty(cache->policy, oblock);
break;
}
@@ -1705,7 +2091,8 @@ static void writeback_some_dirty_blocks(struct cache *cache)
writeback(cache, &structs, oblock, cblock, old_ocell);
}
- prealloc_free_structs(cache, &structs);
+ if (prealloc_used)
+ prealloc_free_structs(cache, &structs);
}
/*----------------------------------------------------------------
@@ -1723,15 +2110,17 @@ static void process_invalidation_request(struct cache *cache, struct invalidatio
r = policy_remove_cblock(cache->policy, to_cblock(begin));
if (!r) {
r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
- if (r)
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
break;
+ }
} else if (r == -ENODATA) {
/* harmless, already unmapped */
r = 0;
} else {
- DMERR("policy_remove_cblock failed");
+ DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
break;
}
@@ -1804,7 +2193,22 @@ static void stop_worker(struct cache *cache)
flush_workqueue(cache->wq);
}
-static void requeue_deferred_io(struct cache *cache)
+static void requeue_deferred_cells(struct cache *cache)
+{
+ unsigned long flags;
+ struct list_head cells;
+ struct dm_bio_prison_cell *cell, *tmp;
+
+ INIT_LIST_HEAD(&cells);
+ spin_lock_irqsave(&cache->lock, flags);
+ list_splice_init(&cache->deferred_cells, &cells);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ list_for_each_entry_safe(cell, tmp, &cells, user_list)
+ cell_requeue(cache, cell);
+}
+
+static void requeue_deferred_bios(struct cache *cache)
{
struct bio *bio;
struct bio_list bios;
@@ -1825,6 +2229,7 @@ static int more_work(struct cache *cache)
!list_empty(&cache->need_commit_migrations);
else
return !bio_list_empty(&cache->deferred_bios) ||
+ !list_empty(&cache->deferred_cells) ||
!bio_list_empty(&cache->deferred_flush_bios) ||
!bio_list_empty(&cache->deferred_writethrough_bios) ||
!list_empty(&cache->quiesced_migrations) ||
@@ -1842,6 +2247,7 @@ static void do_worker(struct work_struct *ws)
writeback_some_dirty_blocks(cache);
process_deferred_writethrough_bios(cache);
process_deferred_bios(cache);
+ process_deferred_cells(cache);
process_invalidation_requests(cache);
}
@@ -1851,11 +2257,6 @@ static void do_worker(struct work_struct *ws)
if (commit_if_needed(cache)) {
process_deferred_flush_bios(cache, false);
process_migrations(cache, &cache->need_commit_migrations, migration_failure);
-
- /*
- * FIXME: rollback metadata or just go into a
- * failure mode and error everything
- */
} else {
process_deferred_flush_bios(cache, true);
process_migrations(cache, &cache->need_commit_migrations,
@@ -1874,7 +2275,7 @@ static void do_worker(struct work_struct *ws)
static void do_waker(struct work_struct *ws)
{
struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
- policy_tick(cache->policy);
+ policy_tick(cache->policy, true);
wake_worker(cache);
queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}
@@ -2428,6 +2829,12 @@ static int cache_create(struct cache_args *ca, struct cache **result)
goto bad;
}
cache->cmd = cmd;
+ set_cache_mode(cache, CM_WRITE);
+ if (get_cache_mode(cache) != CM_WRITE) {
+ *error = "Unable to get write access to metadata, please check/repair metadata.";
+ r = -EINVAL;
+ goto bad;
+ }
if (passthrough_mode(&cache->features)) {
bool all_clean;
@@ -2446,6 +2853,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
spin_lock_init(&cache->lock);
+ INIT_LIST_HEAD(&cache->deferred_cells);
bio_list_init(&cache->deferred_bios);
bio_list_init(&cache->deferred_flush_bios);
bio_list_init(&cache->deferred_writethrough_bios);
@@ -2535,6 +2943,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
spin_lock_init(&cache->invalidation_lock);
INIT_LIST_HEAD(&cache->invalidation_requests);
+ iot_init(&cache->origin_tracker);
+
*result = cache;
return 0;
@@ -2601,13 +3011,18 @@ out:
return r;
}
-static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
+/*----------------------------------------------------------------*/
+
+static int cache_map(struct dm_target *ti, struct bio *bio)
{
+ struct cache *cache = ti->private;
+
int r;
+ struct dm_bio_prison_cell *cell = NULL;
dm_oblock_t block = get_bio_block(cache, bio);
size_t pb_data_size = get_per_bio_data_size(cache);
bool can_migrate = false;
- bool discarded_block;
+ bool fast_promotion;
struct policy_result lookup_result;
struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
struct old_oblock_lock ool;
@@ -2621,10 +3036,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
* Just remap to the origin and carry on.
*/
remap_to_origin(cache, bio);
+ accounted_begin(cache, bio);
return DM_MAPIO_REMAPPED;
}
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
+ if (discard_or_flush(bio)) {
defer_bio(cache, bio);
return DM_MAPIO_SUBMITTED;
}
@@ -2632,15 +3048,15 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
/*
* Check to see if that block is currently migrating.
*/
- *cell = alloc_prison_cell(cache);
- if (!*cell) {
+ cell = alloc_prison_cell(cache);
+ if (!cell) {
defer_bio(cache, bio);
return DM_MAPIO_SUBMITTED;
}
- r = bio_detain(cache, block, bio, *cell,
+ r = bio_detain(cache, block, bio, cell,
(cell_free_fn) free_prison_cell,
- cache, cell);
+ cache, &cell);
if (r) {
if (r < 0)
defer_bio(cache, bio);
@@ -2648,17 +3064,18 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
return DM_MAPIO_SUBMITTED;
}
- discarded_block = is_discarded_oblock(cache, block);
+ fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
- r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
+ r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
bio, &ool.locker, &lookup_result);
if (r == -EWOULDBLOCK) {
- cell_defer(cache, *cell, true);
+ cell_defer(cache, cell, true);
return DM_MAPIO_SUBMITTED;
} else if (r) {
- DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
- cell_defer(cache, *cell, false);
+ DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
+ cache_device_name(cache), r);
+ cell_defer(cache, cell, false);
bio_io_error(bio);
return DM_MAPIO_SUBMITTED;
}
@@ -2672,21 +3089,30 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
* We need to invalidate this block, so
* defer for the worker thread.
*/
- cell_defer(cache, *cell, true);
+ cell_defer(cache, cell, true);
r = DM_MAPIO_SUBMITTED;
} else {
inc_miss_counter(cache, bio);
remap_to_origin_clear_discard(cache, bio, block);
+ accounted_begin(cache, bio);
+ inc_ds(cache, bio, cell);
+ // FIXME: we want to remap hits or misses straight
+ // away rather than passing over to the worker.
+ cell_defer(cache, cell, false);
}
} else {
inc_hit_counter(cache, bio);
if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
- !is_dirty(cache, lookup_result.cblock))
+ !is_dirty(cache, lookup_result.cblock)) {
remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- else
- remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ accounted_begin(cache, bio);
+ inc_ds(cache, bio, cell);
+ cell_defer(cache, cell, false);
+
+ } else
+ remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
}
break;
@@ -2698,18 +3124,19 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
* longer needed because the block has been demoted.
*/
bio_endio(bio, 0);
- cell_defer(cache, *cell, false);
+ // FIXME: remap everything as a miss
+ cell_defer(cache, cell, false);
r = DM_MAPIO_SUBMITTED;
} else
- remap_to_origin_clear_discard(cache, bio, block);
-
+ remap_cell_to_origin_clear_discard(cache, cell, block, false);
break;
default:
- DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
+ DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
+ cache_device_name(cache), __func__,
(unsigned) lookup_result.op);
- cell_defer(cache, *cell, false);
+ cell_defer(cache, cell, false);
bio_io_error(bio);
r = DM_MAPIO_SUBMITTED;
}
@@ -2717,21 +3144,6 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
return r;
}
-static int cache_map(struct dm_target *ti, struct bio *bio)
-{
- int r;
- struct dm_bio_prison_cell *cell = NULL;
- struct cache *cache = ti->private;
-
- r = __cache_map(cache, bio, &cell);
- if (r == DM_MAPIO_REMAPPED && cell) {
- inc_ds(cache, bio, cell);
- cell_defer(cache, cell, false);
- }
-
- return r;
-}
-
static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
struct cache *cache = ti->private;
@@ -2740,7 +3152,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
if (pb->tick) {
- policy_tick(cache->policy);
+ policy_tick(cache->policy, false);
spin_lock_irqsave(&cache->lock, flags);
cache->need_tick_bio = true;
@@ -2748,6 +3160,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
}
check_for_quiesced_migrations(cache, pb);
+ accounted_complete(cache, bio);
return 0;
}
@@ -2756,11 +3169,16 @@ static int write_dirty_bitset(struct cache *cache)
{
unsigned i, r;
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return -EINVAL;
+
for (i = 0; i < from_cblock(cache->cache_size); i++) {
r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
is_dirty(cache, to_cblock(i)));
- if (r)
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_set_dirty", r);
return r;
+ }
}
return 0;
@@ -2770,18 +3188,40 @@ static int write_discard_bitset(struct cache *cache)
{
unsigned i, r;
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return -EINVAL;
+
r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
cache->discard_nr_blocks);
if (r) {
- DMERR("could not resize on-disk discard bitset");
+ DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
return r;
}
for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
r = dm_cache_set_discard(cache->cmd, to_dblock(i),
is_discarded(cache, to_dblock(i)));
- if (r)
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_set_discard", r);
return r;
+ }
+ }
+
+ return 0;
+}
+
+static int write_hints(struct cache *cache)
+{
+ int r;
+
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return -EINVAL;
+
+ r = dm_cache_write_hints(cache->cmd, cache->policy);
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_write_hints", r);
+ return r;
}
return 0;
@@ -2796,26 +3236,26 @@ static bool sync_metadata(struct cache *cache)
r1 = write_dirty_bitset(cache);
if (r1)
- DMERR("could not write dirty bitset");
+ DMERR("%s: could not write dirty bitset", cache_device_name(cache));
r2 = write_discard_bitset(cache);
if (r2)
- DMERR("could not write discard bitset");
+ DMERR("%s: could not write discard bitset", cache_device_name(cache));
save_stats(cache);
- r3 = dm_cache_write_hints(cache->cmd, cache->policy);
+ r3 = write_hints(cache);
if (r3)
- DMERR("could not write hints");
+ DMERR("%s: could not write hints", cache_device_name(cache));
/*
* If writing the above metadata failed, we still commit, but don't
* set the clean shutdown flag. This will effectively force every
* dirty bit to be set on reload.
*/
- r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
+ r4 = commit(cache, !r1 && !r2 && !r3);
if (r4)
- DMERR("could not write cache metadata. Data loss may occur.");
+ DMERR("%s: could not write cache metadata", cache_device_name(cache));
return !r1 && !r2 && !r3 && !r4;
}
@@ -2827,10 +3267,12 @@ static void cache_postsuspend(struct dm_target *ti)
start_quiescing(cache);
wait_for_migrations(cache);
stop_worker(cache);
- requeue_deferred_io(cache);
+ requeue_deferred_bios(cache);
+ requeue_deferred_cells(cache);
stop_quiescing(cache);
- (void) sync_metadata(cache);
+ if (get_cache_mode(cache) == CM_WRITE)
+ (void) sync_metadata(cache);
}
static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
@@ -2953,7 +3395,8 @@ static bool can_resize(struct cache *cache, dm_cblock_t new_size)
while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
new_size = to_cblock(from_cblock(new_size) + 1);
if (is_dirty(cache, new_size)) {
- DMERR("unable to shrink cache; cache block %llu is dirty",
+ DMERR("%s: unable to shrink cache; cache block %llu is dirty",
+ cache_device_name(cache),
(unsigned long long) from_cblock(new_size));
return false;
}
@@ -2968,7 +3411,8 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
r = dm_cache_resize(cache->cmd, new_size);
if (r) {
- DMERR("could not resize cache metadata");
+ DMERR("%s: could not resize cache metadata", cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_resize", r);
return r;
}
@@ -3006,7 +3450,8 @@ static int cache_preresume(struct dm_target *ti)
r = dm_cache_load_mappings(cache->cmd, cache->policy,
load_mapping, cache);
if (r) {
- DMERR("could not load cache mappings");
+ DMERR("%s: could not load cache mappings", cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_load_mappings", r);
return r;
}
@@ -3026,7 +3471,8 @@ static int cache_preresume(struct dm_target *ti)
discard_load_info_init(cache, &li);
r = dm_cache_load_discards(cache->cmd, load_discard, &li);
if (r) {
- DMERR("could not load origin discards");
+ DMERR("%s: could not load origin discards", cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_load_discards", r);
return r;
}
set_discard_range(&li);
@@ -3054,7 +3500,7 @@ static void cache_resume(struct dm_target *ti)
* <#demotions> <#promotions> <#dirty>
* <#features> <features>*
* <#core args> <core args>
- * <policy name> <#policy args> <policy args>*
+ * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
*/
static void cache_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@@ -3070,23 +3516,26 @@ static void cache_status(struct dm_target *ti, status_type_t type,
switch (type) {
case STATUSTYPE_INFO:
- /* Commit to ensure statistics aren't out-of-date */
- if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
- r = dm_cache_commit(cache->cmd, false);
- if (r)
- DMERR("could not commit metadata for accurate status");
+ if (get_cache_mode(cache) == CM_FAIL) {
+ DMEMIT("Fail");
+ break;
}
- r = dm_cache_get_free_metadata_block_count(cache->cmd,
- &nr_free_blocks_metadata);
+ /* Commit to ensure statistics aren't out-of-date */
+ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+ (void) commit(cache, false);
+
+ r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
if (r) {
- DMERR("could not get metadata free block count");
+ DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
+ cache_device_name(cache), r);
goto err;
}
r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
if (r) {
- DMERR("could not get metadata device size");
+ DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
+ cache_device_name(cache), r);
goto err;
}
@@ -3117,7 +3566,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
DMEMIT("1 writeback ");
else {
- DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+ DMERR("%s: internal error: unknown io mode: %d",
+ cache_device_name(cache), (int) cache->features.io_mode);
goto err;
}
@@ -3125,11 +3575,22 @@ static void cache_status(struct dm_target *ti, status_type_t type,
DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
if (sz < maxlen) {
- r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
+ r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
if (r)
- DMERR("policy_emit_config_values returned %d", r);
+ DMERR("%s: policy_emit_config_values returned %d",
+ cache_device_name(cache), r);
}
+ if (get_cache_mode(cache) == CM_READ_ONLY)
+ DMEMIT("ro ");
+ else
+ DMEMIT("rw ");
+
+ if (dm_cache_metadata_needs_check(cache->cmd))
+ DMEMIT("needs_check ");
+ else
+ DMEMIT("- ");
+
break;
case STATUSTYPE_TABLE:
@@ -3191,7 +3652,7 @@ static int parse_cblock_range(struct cache *cache, const char *str,
return 0;
}
- DMERR("invalid cblock range '%s'", str);
+ DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
return -EINVAL;
}
@@ -3202,17 +3663,20 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
uint64_t n = from_cblock(cache->cache_size);
if (b >= n) {
- DMERR("begin cblock out of range: %llu >= %llu", b, n);
+ DMERR("%s: begin cblock out of range: %llu >= %llu",
+ cache_device_name(cache), b, n);
return -EINVAL;
}
if (e > n) {
- DMERR("end cblock out of range: %llu > %llu", e, n);
+ DMERR("%s: end cblock out of range: %llu > %llu",
+ cache_device_name(cache), e, n);
return -EINVAL;
}
if (b >= e) {
- DMERR("invalid cblock range: %llu >= %llu", b, e);
+ DMERR("%s: invalid cblock range: %llu >= %llu",
+ cache_device_name(cache), b, e);
return -EINVAL;
}
@@ -3246,7 +3710,8 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun
struct cblock_range range;
if (!passthrough_mode(&cache->features)) {
- DMERR("cache has to be in passthrough mode for invalidation");
+ DMERR("%s: cache has to be in passthrough mode for invalidation",
+ cache_device_name(cache));
return -EPERM;
}
@@ -3285,6 +3750,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
if (!argc)
return -EINVAL;
+ if (get_cache_mode(cache) >= CM_READ_ONLY) {
+ DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
+ cache_device_name(cache));
+ return -EOPNOTSUPP;
+ }
+
if (!strcasecmp(argv[0], "invalidate_cblocks"))
return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
@@ -3358,7 +3829,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 6, 0},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 5503e43e5..0f48fed44 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
/*
* Copyright (C) 2003 Jana Saout <jana@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved.
* Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
*
* This file is released under the GPL.
@@ -891,6 +891,11 @@ static void crypt_alloc_req(struct crypt_config *cc,
ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+
+ /*
+ * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
+ * requests if driver request queue is full.
+ */
ablkcipher_request_set_callback(ctx->req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
kcryptd_async_done, dmreq_of_req(cc, ctx->req));
@@ -924,24 +929,32 @@ static int crypt_convert(struct crypt_config *cc,
r = crypt_convert_block(cc, ctx, ctx->req);
switch (r) {
- /* async */
+ /*
+ * The request was queued by a crypto driver
+ * but the driver request queue is full, let's wait.
+ */
case -EBUSY:
wait_for_completion(&ctx->restart);
reinit_completion(&ctx->restart);
- /* fall through*/
+ /* fall through */
+ /*
+ * The request is queued and processed asynchronously,
+ * completion function kcryptd_async_done() will be called.
+ */
case -EINPROGRESS:
ctx->req = NULL;
ctx->cc_sector++;
continue;
-
- /* sync */
+ /*
+ * The request was already processed (synchronously).
+ */
case 0:
atomic_dec(&ctx->cc_pending);
ctx->cc_sector++;
cond_resched();
continue;
- /* error */
+ /* There was an error while processing the request. */
default:
atomic_dec(&ctx->cc_pending);
return r;
@@ -1346,6 +1359,11 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
struct crypt_config *cc = io->cc;
+ /*
+ * A request from crypto driver backlog is going to be processed now,
+ * finish the completion and continue in crypt_convert().
+ * (Callback will be called for the second time for this request.)
+ */
if (error == -EINPROGRESS) {
complete(&ctx->restart);
return;
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 93e08446a..ad1b049ae 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -55,8 +55,8 @@
#define LOG_DISCARD_FLAG (1 << 2)
#define LOG_MARK_FLAG (1 << 3)
-#define WRITE_LOG_VERSION 1
-#define WRITE_LOG_MAGIC 0x6a736677736872
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
/*
* The disk format for this is braindead simple.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 88e4c7f24..2daa67793 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@@ -17,6 +17,7 @@
#include <linux/device-mapper.h>
#define DM_MSG_PREFIX "raid"
+#define MAX_RAID_DEVICES 253 /* raid4/5/6 limit */
static bool devices_handle_discard_safely = false;
@@ -45,25 +46,25 @@ struct raid_dev {
};
/*
- * Flags for rs->print_flags field.
+ * Flags for rs->ctr_flags field.
*/
-#define DMPF_SYNC 0x1
-#define DMPF_NOSYNC 0x2
-#define DMPF_REBUILD 0x4
-#define DMPF_DAEMON_SLEEP 0x8
-#define DMPF_MIN_RECOVERY_RATE 0x10
-#define DMPF_MAX_RECOVERY_RATE 0x20
-#define DMPF_MAX_WRITE_BEHIND 0x40
-#define DMPF_STRIPE_CACHE 0x80
-#define DMPF_REGION_SIZE 0x100
-#define DMPF_RAID10_COPIES 0x200
-#define DMPF_RAID10_FORMAT 0x400
+#define CTR_FLAG_SYNC 0x1
+#define CTR_FLAG_NOSYNC 0x2
+#define CTR_FLAG_REBUILD 0x4
+#define CTR_FLAG_DAEMON_SLEEP 0x8
+#define CTR_FLAG_MIN_RECOVERY_RATE 0x10
+#define CTR_FLAG_MAX_RECOVERY_RATE 0x20
+#define CTR_FLAG_MAX_WRITE_BEHIND 0x40
+#define CTR_FLAG_STRIPE_CACHE 0x80
+#define CTR_FLAG_REGION_SIZE 0x100
+#define CTR_FLAG_RAID10_COPIES 0x200
+#define CTR_FLAG_RAID10_FORMAT 0x400
struct raid_set {
struct dm_target *ti;
uint32_t bitmap_loaded;
- uint32_t print_flags;
+ uint32_t ctr_flags;
struct mddev md;
struct raid_type *raid_type;
@@ -81,6 +82,7 @@ static struct raid_type {
const unsigned level; /* RAID level. */
const unsigned algorithm; /* RAID algorithm. */
} raid_types[] = {
+ {"raid0", "RAID0 (striping)", 0, 2, 0, 0 /* NONE */},
{"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
{"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
{"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
@@ -119,15 +121,15 @@ static int raid10_format_to_md_layout(char *format, unsigned copies)
{
unsigned n = 1, f = 1;
- if (!strcmp("near", format))
+ if (!strcasecmp("near", format))
n = copies;
else
f = copies;
- if (!strcmp("offset", format))
+ if (!strcasecmp("offset", format))
return 0x30000 | (f << 8) | n;
- if (!strcmp("far", format))
+ if (!strcasecmp("far", format))
return 0x20000 | (f << 8) | n;
return (f << 8) | n;
@@ -477,8 +479,6 @@ too_many:
* will form the "stripe"
* [[no]sync] Force or prevent recovery of the
* entire array
- * [devices_handle_discard_safely] Allow discards on RAID4/5/6; useful if RAID
- * member device(s) properly support TRIM/UNMAP
* [rebuild <idx>] Rebuild the drive indicated by the index
* [daemon_sleep <ms>] Time between bitmap daemon work to
* clear bits
@@ -555,12 +555,12 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
for (i = 0; i < num_raid_params; i++) {
if (!strcasecmp(argv[i], "nosync")) {
rs->md.recovery_cp = MaxSector;
- rs->print_flags |= DMPF_NOSYNC;
+ rs->ctr_flags |= CTR_FLAG_NOSYNC;
continue;
}
if (!strcasecmp(argv[i], "sync")) {
rs->md.recovery_cp = 0;
- rs->print_flags |= DMPF_SYNC;
+ rs->ctr_flags |= CTR_FLAG_SYNC;
continue;
}
@@ -585,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
raid10_format = argv[i];
- rs->print_flags |= DMPF_RAID10_FORMAT;
+ rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
continue;
}
@@ -602,7 +602,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
}
clear_bit(In_sync, &rs->dev[value].rdev.flags);
rs->dev[value].rdev.recovery_offset = 0;
- rs->print_flags |= DMPF_REBUILD;
+ rs->ctr_flags |= CTR_FLAG_REBUILD;
} else if (!strcasecmp(key, "write_mostly")) {
if (rs->raid_type->level != 1) {
rs->ti->error = "write_mostly option is only valid for RAID1";
@@ -618,7 +618,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->ti->error = "max_write_behind option is only valid for RAID1";
return -EINVAL;
}
- rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+ rs->ctr_flags |= CTR_FLAG_MAX_WRITE_BEHIND;
/*
* In device-mapper, we specify things in sectors, but
@@ -631,14 +631,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
}
rs->md.bitmap_info.max_write_behind = value;
} else if (!strcasecmp(key, "daemon_sleep")) {
- rs->print_flags |= DMPF_DAEMON_SLEEP;
+ rs->ctr_flags |= CTR_FLAG_DAEMON_SLEEP;
if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
rs->ti->error = "daemon sleep period out of range";
return -EINVAL;
}
rs->md.bitmap_info.daemon_sleep = value;
} else if (!strcasecmp(key, "stripe_cache")) {
- rs->print_flags |= DMPF_STRIPE_CACHE;
+ rs->ctr_flags |= CTR_FLAG_STRIPE_CACHE;
/*
* In device-mapper, we specify things in sectors, but
@@ -656,21 +656,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
} else if (!strcasecmp(key, "min_recovery_rate")) {
- rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+ rs->ctr_flags |= CTR_FLAG_MIN_RECOVERY_RATE;
if (value > INT_MAX) {
rs->ti->error = "min_recovery_rate out of range";
return -EINVAL;
}
rs->md.sync_speed_min = (int)value;
} else if (!strcasecmp(key, "max_recovery_rate")) {
- rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+ rs->ctr_flags |= CTR_FLAG_MAX_RECOVERY_RATE;
if (value > INT_MAX) {
rs->ti->error = "max_recovery_rate out of range";
return -EINVAL;
}
rs->md.sync_speed_max = (int)value;
} else if (!strcasecmp(key, "region_size")) {
- rs->print_flags |= DMPF_REGION_SIZE;
+ rs->ctr_flags |= CTR_FLAG_REGION_SIZE;
region_size = value;
} else if (!strcasecmp(key, "raid10_copies") &&
(rs->raid_type->level == 10)) {
@@ -678,7 +678,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->ti->error = "Bad value for 'raid10_copies'";
return -EINVAL;
}
- rs->print_flags |= DMPF_RAID10_COPIES;
+ rs->ctr_flags |= CTR_FLAG_RAID10_COPIES;
raid10_copies = value;
} else {
DMERR("Unable to parse RAID parameter: %s", key);
@@ -720,7 +720,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->md.layout = raid10_format_to_md_layout(raid10_format,
raid10_copies);
rs->md.new_layout = rs->md.layout;
- } else if ((rs->raid_type->level > 1) &&
+ } else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
sector_div(sectors_per_dev,
(rs->md.raid_disks - rs->raid_type->parity_devs))) {
rs->ti->error = "Target length not divisible by number of data devices";
@@ -947,7 +947,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
return -EINVAL;
}
- if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+ if (!(rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)))
mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
/*
@@ -1026,8 +1026,9 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
return 0;
}
-static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
{
+ struct mddev *mddev = &rs->md;
struct dm_raid_superblock *sb = page_address(rdev->sb_page);
/*
@@ -1037,8 +1038,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
if (!mddev->events && super_init_validation(mddev, rdev))
return -EINVAL;
- mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
- rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+ /* Enable bitmap creation for RAID levels != 0 */
+ mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+ rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+
if (!test_bit(FirstUse, &rdev->flags)) {
rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
if (rdev->recovery_offset != MaxSector)
@@ -1073,7 +1076,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
freshest = NULL;
rdev_for_each_safe(rdev, tmp, mddev) {
/*
- * Skipping super_load due to DMPF_SYNC will cause
+ * Skipping super_load due to CTR_FLAG_SYNC will cause
* the array to undergo initialization again as
* though it were new. This is the intended effect
* of the "sync" directive.
@@ -1082,7 +1085,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
* that the "sync" directive is disallowed during the
* reshape.
*/
- if (rs->print_flags & DMPF_SYNC)
+ rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
+
+ if (rs->ctr_flags & CTR_FLAG_SYNC)
continue;
if (!rdev->meta_bdev)
@@ -1140,11 +1145,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
* validation for the remaining devices.
*/
ti->error = "Unable to assemble array: Invalid superblocks";
- if (super_validate(mddev, freshest))
+ if (super_validate(rs, freshest))
return -EINVAL;
rdev_for_each(rdev, mddev)
- if ((rdev != freshest) && super_validate(mddev, rdev))
+ if ((rdev != freshest) && super_validate(rs, rdev))
return -EINVAL;
return 0;
@@ -1243,7 +1248,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
- (num_raid_devs >= INT_MAX)) {
+ (num_raid_devs > MAX_RAID_DEVICES)) {
ti->error = "Cannot understand number of raid devices";
return -EINVAL;
}
@@ -1282,10 +1287,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
*/
configure_discard_support(ti, rs);
- mutex_lock(&rs->md.reconfig_mutex);
+ /* Has to be held on running the array */
+ mddev_lock_nointr(&rs->md);
ret = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
- mutex_unlock(&rs->md.reconfig_mutex);
+ mddev_unlock(&rs->md);
if (ret) {
ti->error = "Fail to run raid array";
@@ -1368,34 +1374,40 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_INFO:
DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
- if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
- sync = rs->md.curr_resync_completed;
- else
- sync = rs->md.recovery_cp;
-
- if (sync >= rs->md.resync_max_sectors) {
- /*
- * Sync complete.
- */
+ if (rs->raid_type->level) {
+ if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+ sync = rs->md.curr_resync_completed;
+ else
+ sync = rs->md.recovery_cp;
+
+ if (sync >= rs->md.resync_max_sectors) {
+ /*
+ * Sync complete.
+ */
+ array_in_sync = 1;
+ sync = rs->md.resync_max_sectors;
+ } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+ /*
+ * If "check" or "repair" is occurring, the array has
+ * undergone and initial sync and the health characters
+ * should not be 'a' anymore.
+ */
+ array_in_sync = 1;
+ } else {
+ /*
+ * The array may be doing an initial sync, or it may
+ * be rebuilding individual components. If all the
+ * devices are In_sync, then it is the array that is
+ * being initialized.
+ */
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+ array_in_sync = 1;
+ }
+ } else {
+ /* RAID0 */
array_in_sync = 1;
sync = rs->md.resync_max_sectors;
- } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
- /*
- * If "check" or "repair" is occurring, the array has
- * undergone and initial sync and the health characters
- * should not be 'a' anymore.
- */
- array_in_sync = 1;
- } else {
- /*
- * The array may be doing an initial sync, or it may
- * be rebuilding individual components. If all the
- * devices are In_sync, then it is the array that is
- * being initialized.
- */
- for (i = 0; i < rs->md.raid_disks; i++)
- if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
- array_in_sync = 1;
}
/*
@@ -1446,7 +1458,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_TABLE:
/* The string you would use to construct this array */
for (i = 0; i < rs->md.raid_disks; i++) {
- if ((rs->print_flags & DMPF_REBUILD) &&
+ if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
rs->dev[i].data_dev &&
!test_bit(In_sync, &rs->dev[i].rdev.flags))
raid_param_cnt += 2; /* for rebuilds */
@@ -1455,33 +1467,33 @@ static void raid_status(struct dm_target *ti, status_type_t type,
raid_param_cnt += 2;
}
- raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
- if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+ raid_param_cnt += (hweight32(rs->ctr_flags & ~CTR_FLAG_REBUILD) * 2);
+ if (rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC))
raid_param_cnt--;
DMEMIT("%s %u %u", rs->raid_type->name,
raid_param_cnt, rs->md.chunk_sectors);
- if ((rs->print_flags & DMPF_SYNC) &&
+ if ((rs->ctr_flags & CTR_FLAG_SYNC) &&
(rs->md.recovery_cp == MaxSector))
DMEMIT(" sync");
- if (rs->print_flags & DMPF_NOSYNC)
+ if (rs->ctr_flags & CTR_FLAG_NOSYNC)
DMEMIT(" nosync");
for (i = 0; i < rs->md.raid_disks; i++)
- if ((rs->print_flags & DMPF_REBUILD) &&
+ if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
rs->dev[i].data_dev &&
!test_bit(In_sync, &rs->dev[i].rdev.flags))
DMEMIT(" rebuild %u", i);
- if (rs->print_flags & DMPF_DAEMON_SLEEP)
+ if (rs->ctr_flags & CTR_FLAG_DAEMON_SLEEP)
DMEMIT(" daemon_sleep %lu",
rs->md.bitmap_info.daemon_sleep);
- if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+ if (rs->ctr_flags & CTR_FLAG_MIN_RECOVERY_RATE)
DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
- if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+ if (rs->ctr_flags & CTR_FLAG_MAX_RECOVERY_RATE)
DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
for (i = 0; i < rs->md.raid_disks; i++)
@@ -1489,11 +1501,11 @@ static void raid_status(struct dm_target *ti, status_type_t type,
test_bit(WriteMostly, &rs->dev[i].rdev.flags))
DMEMIT(" write_mostly %u", i);
- if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+ if (rs->ctr_flags & CTR_FLAG_MAX_WRITE_BEHIND)
DMEMIT(" max_write_behind %lu",
rs->md.bitmap_info.max_write_behind);
- if (rs->print_flags & DMPF_STRIPE_CACHE) {
+ if (rs->ctr_flags & CTR_FLAG_STRIPE_CACHE) {
struct r5conf *conf = rs->md.private;
/* convert from kiB to sectors */
@@ -1501,15 +1513,15 @@ static void raid_status(struct dm_target *ti, status_type_t type,
conf ? conf->max_nr_stripes * 2 : 0);
}
- if (rs->print_flags & DMPF_REGION_SIZE)
+ if (rs->ctr_flags & CTR_FLAG_REGION_SIZE)
DMEMIT(" region_size %lu",
rs->md.bitmap_info.chunksize >> 9);
- if (rs->print_flags & DMPF_RAID10_COPIES)
+ if (rs->ctr_flags & CTR_FLAG_RAID10_COPIES)
DMEMIT(" raid10_copies %u",
raid10_md_layout_to_copies(rs->md.layout));
- if (rs->print_flags & DMPF_RAID10_FORMAT)
+ if (rs->ctr_flags & CTR_FLAG_RAID10_FORMAT)
DMEMIT(" raid10_format %s",
raid10_md_layout_to_format(rs->md.layout));
@@ -1684,26 +1696,48 @@ static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
- set_bit(MD_CHANGE_DEVS, &rs->md.flags);
- if (!rs->bitmap_loaded) {
- bitmap_load(&rs->md);
- rs->bitmap_loaded = 1;
- } else {
- /*
- * A secondary resume while the device is active.
- * Take this opportunity to check whether any failed
- * devices are reachable again.
- */
- attempt_restore_of_faulty_devices(rs);
+ if (rs->raid_type->level) {
+ set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+
+ if (!rs->bitmap_loaded) {
+ bitmap_load(&rs->md);
+ rs->bitmap_loaded = 1;
+ } else {
+ /*
+ * A secondary resume while the device is active.
+ * Take this opportunity to check whether any failed
+ * devices are reachable again.
+ */
+ attempt_restore_of_faulty_devices(rs);
+ }
+
+ clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
}
- clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
mddev_resume(&rs->md);
}
+static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct raid_set *rs = ti->private;
+ struct md_personality *pers = rs->md.pers;
+
+ if (pers && pers->mergeable_bvec)
+ return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
+
+ /*
+ * In case we can't request the personality because
+ * the raid set is not running yet
+ *
+ * -> return safe minimum
+ */
+ return rs->md.chunk_sectors;
+}
+
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 6, 0},
+ .version = {1, 7, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
@@ -1715,6 +1749,7 @@ static struct target_type raid_target = {
.presuspend = raid_presuspend,
.postsuspend = raid_postsuspend,
.resume = raid_resume,
+ .merge = raid_merge,
};
static int __init dm_raid_init(void)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 089d62751..d83696bf4 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -23,8 +23,10 @@
#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */
-#define DM_RAID1_HANDLE_ERRORS 0x01
+#define DM_RAID1_HANDLE_ERRORS 0x01
+#define DM_RAID1_KEEP_LOG 0x02
#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS)
+#define keep_log(p) ((p)->features & DM_RAID1_KEEP_LOG)
static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
@@ -229,7 +231,7 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
if (m != get_default_mirror(ms))
goto out;
- if (!ms->in_sync) {
+ if (!ms->in_sync && !keep_log(ms)) {
/*
* Better to issue requests to same failing device
* than to risk returning corrupt data.
@@ -370,6 +372,17 @@ static int recover(struct mirror_set *ms, struct dm_region *reg)
return r;
}
+static void reset_ms_flags(struct mirror_set *ms)
+{
+ unsigned int m;
+
+ ms->leg_failure = 0;
+ for (m = 0; m < ms->nr_mirrors; m++) {
+ atomic_set(&(ms->mirror[m].error_count), 0);
+ ms->mirror[m].error_type = 0;
+ }
+}
+
static void do_recovery(struct mirror_set *ms)
{
struct dm_region *reg;
@@ -398,6 +411,7 @@ static void do_recovery(struct mirror_set *ms)
/* the sync is complete */
dm_table_event(ms->ti->table);
ms->in_sync = 1;
+ reset_ms_flags(ms);
}
}
@@ -759,7 +773,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
dm_rh_delay(ms->rh, bio);
while ((bio = bio_list_pop(&nosync))) {
- if (unlikely(ms->leg_failure) && errors_handled(ms)) {
+ if (unlikely(ms->leg_failure) && errors_handled(ms) && !keep_log(ms)) {
spin_lock_irq(&ms->lock);
bio_list_add(&ms->failures, bio);
spin_unlock_irq(&ms->lock);
@@ -803,15 +817,21 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
/*
* If all the legs are dead, fail the I/O.
- * If we have been told to handle errors, hold the bio
- * and wait for userspace to deal with the problem.
+ * If the device has failed and keep_log is enabled,
+ * fail the I/O.
+ *
+ * If we have been told to handle errors, and keep_log
+ * isn't enabled, hold the bio and wait for userspace to
+ * deal with the problem.
+ *
* Otherwise pretend that the I/O succeeded. (This would
* be wrong if the failed leg returned after reboot and
* got replicated back to the good legs.)
*/
- if (!get_valid_mirror(ms))
+
+ if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure)))
bio_endio(bio, -EIO);
- else if (errors_handled(ms))
+ else if (errors_handled(ms) && !keep_log(ms))
hold_bio(ms, bio);
else
bio_endio(bio, 0);
@@ -987,6 +1007,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
unsigned num_features;
struct dm_target *ti = ms->ti;
char dummy;
+ int i;
*args_used = 0;
@@ -1007,15 +1028,25 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
return -EINVAL;
}
- if (!strcmp("handle_errors", argv[0]))
- ms->features |= DM_RAID1_HANDLE_ERRORS;
- else {
- ti->error = "Unrecognised feature requested";
+ for (i = 0; i < num_features; i++) {
+ if (!strcmp("handle_errors", argv[0]))
+ ms->features |= DM_RAID1_HANDLE_ERRORS;
+ else if (!strcmp("keep_log", argv[0]))
+ ms->features |= DM_RAID1_KEEP_LOG;
+ else {
+ ti->error = "Unrecognised feature requested";
+ return -EINVAL;
+ }
+
+ argc--;
+ argv++;
+ (*args_used)++;
+ }
+ if (!errors_handled(ms) && keep_log(ms)) {
+ ti->error = "keep_log feature requires the handle_errors feature";
return -EINVAL;
}
- (*args_used)++;
-
return 0;
}
@@ -1029,7 +1060,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
* log_type is "core" or "disk"
* #log_params is between 1 and 3
*
- * If present, features must be "handle_errors".
+ * If present, supported features are "handle_errors" and "keep_log".
*/
static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
@@ -1254,8 +1285,6 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
dm_bio_restore(bd, bio);
bio_record->details.bi_bdev = NULL;
- atomic_inc(&bio->bi_remaining);
-
queue_bio(ms, bio, rw);
return DM_ENDIO_INCOMPLETE;
}
@@ -1365,6 +1394,7 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
unsigned int m, sz = 0;
+ int num_feature_args = 0;
struct mirror_set *ms = (struct mirror_set *) ti->private;
struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
char buffer[ms->nr_mirrors + 1];
@@ -1394,8 +1424,17 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %s %llu", ms->mirror[m].dev->name,
(unsigned long long)ms->mirror[m].offset);
- if (ms->features & DM_RAID1_HANDLE_ERRORS)
- DMEMIT(" 1 handle_errors");
+ num_feature_args += !!errors_handled(ms);
+ num_feature_args += !!keep_log(ms);
+ if (num_feature_args) {
+ DMEMIT(" %d", num_feature_args);
+ if (errors_handled(ms))
+ DMEMIT(" handle_errors");
+ if (keep_log(ms))
+ DMEMIT(" keep_log");
+ }
+
+ break;
}
}
@@ -1415,7 +1454,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 13, 2},
+ .version = {1, 14, 0},
.module = THIS_MODULE,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f83a0f3fc..7c82d3ccc 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1478,7 +1478,6 @@ out:
if (full_bio) {
full_bio->bi_end_io = pe->full_bio_end_io;
full_bio->bi_private = pe->full_bio_private;
- atomic_inc(&full_bio->bi_remaining);
}
increment_pending_exceptions_done_count();
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 419bdd4fc..8a8b48fa9 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -29,30 +29,37 @@ struct dm_stat_percpu {
unsigned long long io_ticks[2];
unsigned long long io_ticks_total;
unsigned long long time_in_queue;
+ unsigned long long *histogram;
};
struct dm_stat_shared {
atomic_t in_flight[2];
- unsigned long stamp;
+ unsigned long long stamp;
struct dm_stat_percpu tmp;
};
struct dm_stat {
struct list_head list_entry;
int id;
+ unsigned stat_flags;
size_t n_entries;
sector_t start;
sector_t end;
sector_t step;
+ unsigned n_histogram_entries;
+ unsigned long long *histogram_boundaries;
const char *program_id;
const char *aux_data;
struct rcu_head rcu_head;
size_t shared_alloc_size;
size_t percpu_alloc_size;
+ size_t histogram_alloc_size;
struct dm_stat_percpu *stat_percpu[NR_CPUS];
struct dm_stat_shared stat_shared[0];
};
+#define STAT_PRECISE_TIMESTAMPS 1
+
struct dm_stats_last_position {
sector_t last_sector;
unsigned last_rw;
@@ -160,10 +167,7 @@ static void dm_kvfree(void *ptr, size_t alloc_size)
free_shared_memory(alloc_size);
- if (is_vmalloc_addr(ptr))
- vfree(ptr);
- else
- kfree(ptr);
+ kvfree(ptr);
}
static void dm_stat_free(struct rcu_head *head)
@@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
kfree(s->program_id);
kfree(s->aux_data);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+ }
+ dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
dm_kvfree(s, s->shared_alloc_size);
}
@@ -227,7 +234,10 @@ void dm_stats_cleanup(struct dm_stats *stats)
}
static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
- sector_t step, const char *program_id, const char *aux_data,
+ sector_t step, unsigned stat_flags,
+ unsigned n_histogram_entries,
+ unsigned long long *histogram_boundaries,
+ const char *program_id, const char *aux_data,
void (*suspend_callback)(struct mapped_device *),
void (*resume_callback)(struct mapped_device *),
struct mapped_device *md)
@@ -238,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
size_t ni;
size_t shared_alloc_size;
size_t percpu_alloc_size;
+ size_t histogram_alloc_size;
struct dm_stat_percpu *p;
int cpu;
int ret_id;
@@ -261,19 +272,34 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
return -EOVERFLOW;
- if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+ histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
+ if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
+ return -EOVERFLOW;
+
+ if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
+ num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
return -ENOMEM;
s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
if (!s)
return -ENOMEM;
+ s->stat_flags = stat_flags;
s->n_entries = n_entries;
s->start = start;
s->end = end;
s->step = step;
s->shared_alloc_size = shared_alloc_size;
s->percpu_alloc_size = percpu_alloc_size;
+ s->histogram_alloc_size = histogram_alloc_size;
+
+ s->n_histogram_entries = n_histogram_entries;
+ s->histogram_boundaries = kmemdup(histogram_boundaries,
+ s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+ if (!s->histogram_boundaries) {
+ r = -ENOMEM;
+ goto out;
+ }
s->program_id = kstrdup(program_id, GFP_KERNEL);
if (!s->program_id) {
@@ -291,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
}
+ if (s->n_histogram_entries) {
+ unsigned long long *hi;
+ hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
+ if (!hi) {
+ r = -ENOMEM;
+ goto out;
+ }
+ for (ni = 0; ni < n_entries; ni++) {
+ s->stat_shared[ni].tmp.histogram = hi;
+ hi += s->n_histogram_entries + 1;
+ }
+ }
+
for_each_possible_cpu(cpu) {
p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
if (!p) {
@@ -298,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
goto out;
}
s->stat_percpu[cpu] = p;
+ if (s->n_histogram_entries) {
+ unsigned long long *hi;
+ hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
+ if (!hi) {
+ r = -ENOMEM;
+ goto out;
+ }
+ for (ni = 0; ni < n_entries; ni++) {
+ p[ni].histogram = hi;
+ hi += s->n_histogram_entries + 1;
+ }
+ }
}
/*
@@ -375,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
* vfree can't be called from RCU callback
*/
for_each_possible_cpu(cpu)
- if (is_vmalloc_addr(s->stat_percpu))
+ if (is_vmalloc_addr(s->stat_percpu) ||
+ is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
goto do_sync_free;
- if (is_vmalloc_addr(s)) {
+ if (is_vmalloc_addr(s) ||
+ is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
do_sync_free:
synchronize_rcu_expedited();
dm_stat_free(&s->rcu_head);
@@ -417,18 +470,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program,
return 1;
}
-static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
+static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
+ struct dm_stat_percpu *p)
{
/*
* This is racy, but so is part_round_stats_single.
*/
- unsigned long now = jiffies;
- unsigned in_flight_read;
- unsigned in_flight_write;
- unsigned long difference = now - shared->stamp;
+ unsigned long long now, difference;
+ unsigned in_flight_read, in_flight_write;
+
+ if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
+ now = jiffies;
+ else
+ now = ktime_to_ns(ktime_get());
+ difference = now - shared->stamp;
if (!difference)
return;
+
in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
if (in_flight_read)
@@ -443,8 +502,9 @@ static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *
}
static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
- unsigned long bi_rw, sector_t len, bool merged,
- bool end, unsigned long duration)
+ unsigned long bi_rw, sector_t len,
+ struct dm_stats_aux *stats_aux, bool end,
+ unsigned long duration_jiffies)
{
unsigned long idx = bi_rw & REQ_WRITE;
struct dm_stat_shared *shared = &s->stat_shared[entry];
@@ -474,15 +534,35 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
p = &s->stat_percpu[smp_processor_id()][entry];
if (!end) {
- dm_stat_round(shared, p);
+ dm_stat_round(s, shared, p);
atomic_inc(&shared->in_flight[idx]);
} else {
- dm_stat_round(shared, p);
+ unsigned long long duration;
+ dm_stat_round(s, shared, p);
atomic_dec(&shared->in_flight[idx]);
p->sectors[idx] += len;
p->ios[idx] += 1;
- p->merges[idx] += merged;
- p->ticks[idx] += duration;
+ p->merges[idx] += stats_aux->merged;
+ if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
+ p->ticks[idx] += duration_jiffies;
+ duration = jiffies_to_msecs(duration_jiffies);
+ } else {
+ p->ticks[idx] += stats_aux->duration_ns;
+ duration = stats_aux->duration_ns;
+ }
+ if (s->n_histogram_entries) {
+ unsigned lo = 0, hi = s->n_histogram_entries + 1;
+ while (lo + 1 < hi) {
+ unsigned mid = (lo + hi) / 2;
+ if (s->histogram_boundaries[mid - 1] > duration) {
+ hi = mid;
+ } else {
+ lo = mid;
+ }
+
+ }
+ p->histogram[lo]++;
+ }
}
#if BITS_PER_LONG == 32
@@ -494,7 +574,7 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
sector_t bi_sector, sector_t end_sector,
- bool end, unsigned long duration,
+ bool end, unsigned long duration_jiffies,
struct dm_stats_aux *stats_aux)
{
sector_t rel_sector, offset, todo, fragment_len;
@@ -523,7 +603,7 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
if (fragment_len > s->step - offset)
fragment_len = s->step - offset;
dm_stat_for_entry(s, entry, bi_rw, fragment_len,
- stats_aux->merged, end, duration);
+ stats_aux, end, duration_jiffies);
todo -= fragment_len;
entry++;
offset = 0;
@@ -532,11 +612,13 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
sector_t bi_sector, unsigned bi_sectors, bool end,
- unsigned long duration, struct dm_stats_aux *stats_aux)
+ unsigned long duration_jiffies,
+ struct dm_stats_aux *stats_aux)
{
struct dm_stat *s;
sector_t end_sector;
struct dm_stats_last_position *last;
+ bool got_precise_time;
if (unlikely(!bi_sectors))
return;
@@ -560,8 +642,17 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
rcu_read_lock();
- list_for_each_entry_rcu(s, &stats->list, list_entry)
- __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
+ got_precise_time = false;
+ list_for_each_entry_rcu(s, &stats->list, list_entry) {
+ if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
+ if (!end)
+ stats_aux->duration_ns = ktime_to_ns(ktime_get());
+ else
+ stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
+ got_precise_time = true;
+ }
+ __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
+ }
rcu_read_unlock();
}
@@ -574,10 +665,25 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
local_irq_disable();
p = &s->stat_percpu[smp_processor_id()][x];
- dm_stat_round(shared, p);
+ dm_stat_round(s, shared, p);
local_irq_enable();
- memset(&shared->tmp, 0, sizeof(shared->tmp));
+ shared->tmp.sectors[READ] = 0;
+ shared->tmp.sectors[WRITE] = 0;
+ shared->tmp.ios[READ] = 0;
+ shared->tmp.ios[WRITE] = 0;
+ shared->tmp.merges[READ] = 0;
+ shared->tmp.merges[WRITE] = 0;
+ shared->tmp.ticks[READ] = 0;
+ shared->tmp.ticks[WRITE] = 0;
+ shared->tmp.io_ticks[READ] = 0;
+ shared->tmp.io_ticks[WRITE] = 0;
+ shared->tmp.io_ticks_total = 0;
+ shared->tmp.time_in_queue = 0;
+
+ if (s->n_histogram_entries)
+ memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
+
for_each_possible_cpu(cpu) {
p = &s->stat_percpu[cpu][x];
shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
@@ -592,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+ if (s->n_histogram_entries) {
+ unsigned i;
+ for (i = 0; i < s->n_histogram_entries + 1; i++)
+ shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+ }
}
}
@@ -621,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
p->io_ticks_total -= shared->tmp.io_ticks_total;
p->time_in_queue -= shared->tmp.time_in_queue;
local_irq_enable();
+ if (s->n_histogram_entries) {
+ unsigned i;
+ for (i = 0; i < s->n_histogram_entries + 1; i++) {
+ local_irq_disable();
+ p = &s->stat_percpu[smp_processor_id()][x];
+ p->histogram[i] -= shared->tmp.histogram[i];
+ local_irq_enable();
+ }
+ }
}
}
@@ -646,11 +766,15 @@ static int dm_stats_clear(struct dm_stats *stats, int id)
/*
* This is like jiffies_to_msec, but works for 64-bit values.
*/
-static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
+static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
{
- unsigned long long result = 0;
+ unsigned long long result;
unsigned mult;
+ if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
+ return j;
+
+ result = 0;
if (j)
result = jiffies_to_msecs(j & 0x3fffff);
if (j >= 1 << 22) {
@@ -706,22 +830,29 @@ static int dm_stats_print(struct dm_stats *stats, int id,
__dm_stat_init_temporary_percpu_totals(shared, s, x);
- DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+ DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
(unsigned long long)start,
(unsigned long long)step,
shared->tmp.ios[READ],
shared->tmp.merges[READ],
shared->tmp.sectors[READ],
- dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
+ dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
shared->tmp.ios[WRITE],
shared->tmp.merges[WRITE],
shared->tmp.sectors[WRITE],
- dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
+ dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
dm_stat_in_flight(shared),
- dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
- dm_jiffies_to_msec64(shared->tmp.time_in_queue),
- dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
- dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
+ dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
+ dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
+ dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
+ dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
+ if (s->n_histogram_entries) {
+ unsigned i;
+ for (i = 0; i < s->n_histogram_entries + 1; i++) {
+ DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
+ }
+ }
+ DMEMIT("\n");
if (unlikely(sz + 1 >= maxlen))
goto buffer_overflow;
@@ -763,38 +894,89 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
return 0;
}
+static int parse_histogram(const char *h, unsigned *n_histogram_entries,
+ unsigned long long **histogram_boundaries)
+{
+ const char *q;
+ unsigned n;
+ unsigned long long last;
+
+ *n_histogram_entries = 1;
+ for (q = h; *q; q++)
+ if (*q == ',')
+ (*n_histogram_entries)++;
+
+ *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+ if (!*histogram_boundaries)
+ return -ENOMEM;
+
+ n = 0;
+ last = 0;
+ while (1) {
+ unsigned long long hi;
+ int s;
+ char ch;
+ s = sscanf(h, "%llu%c", &hi, &ch);
+ if (!s || (s == 2 && ch != ','))
+ return -EINVAL;
+ if (hi <= last)
+ return -EINVAL;
+ last = hi;
+ (*histogram_boundaries)[n] = hi;
+ if (s == 1)
+ return 0;
+ h = strchr(h, ',') + 1;
+ n++;
+ }
+}
+
static int message_stats_create(struct mapped_device *md,
unsigned argc, char **argv,
char *result, unsigned maxlen)
{
+ int r;
int id;
char dummy;
unsigned long long start, end, len, step;
unsigned divisor;
const char *program_id, *aux_data;
+ unsigned stat_flags = 0;
+
+ unsigned n_histogram_entries = 0;
+ unsigned long long *histogram_boundaries = NULL;
+
+ struct dm_arg_set as, as_backup;
+ const char *a;
+ unsigned feature_args;
/*
* Input format:
- * <range> <step> [<program_id> [<aux_data>]]
+ * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
*/
- if (argc < 3 || argc > 5)
- return -EINVAL;
+ if (argc < 3)
+ goto ret_einval;
+
+ as.argc = argc;
+ as.argv = argv;
+ dm_consume_args(&as, 1);
- if (!strcmp(argv[1], "-")) {
+ a = dm_shift_arg(&as);
+ if (!strcmp(a, "-")) {
start = 0;
len = dm_get_size(md);
if (!len)
len = 1;
- } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
+ } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
start != (sector_t)start || len != (sector_t)len)
- return -EINVAL;
+ goto ret_einval;
end = start + len;
if (start >= end)
- return -EINVAL;
+ goto ret_einval;
- if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+ a = dm_shift_arg(&as);
+ if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
if (!divisor)
return -EINVAL;
step = end - start;
@@ -802,18 +984,44 @@ static int message_stats_create(struct mapped_device *md,
step++;
if (!step)
step = 1;
- } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
+ } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
step != (sector_t)step || !step)
- return -EINVAL;
+ goto ret_einval;
+
+ as_backup = as;
+ a = dm_shift_arg(&as);
+ if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
+ while (feature_args--) {
+ a = dm_shift_arg(&as);
+ if (!a)
+ goto ret_einval;
+ if (!strcasecmp(a, "precise_timestamps"))
+ stat_flags |= STAT_PRECISE_TIMESTAMPS;
+ else if (!strncasecmp(a, "histogram:", 10)) {
+ if (n_histogram_entries)
+ goto ret_einval;
+ if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
+ goto ret;
+ } else
+ goto ret_einval;
+ }
+ } else {
+ as = as_backup;
+ }
program_id = "-";
aux_data = "-";
- if (argc > 3)
- program_id = argv[3];
+ a = dm_shift_arg(&as);
+ if (a)
+ program_id = a;
- if (argc > 4)
- aux_data = argv[4];
+ a = dm_shift_arg(&as);
+ if (a)
+ aux_data = a;
+
+ if (as.argc)
+ goto ret_einval;
/*
* If a buffer overflow happens after we created the region,
@@ -822,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
* leaked). So we must detect buffer overflow in advance.
*/
snprintf(result, maxlen, "%d", INT_MAX);
- if (dm_message_test_buffer_overflow(result, maxlen))
- return 1;
+ if (dm_message_test_buffer_overflow(result, maxlen)) {
+ r = 1;
+ goto ret;
+ }
- id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
+ id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
+ n_histogram_entries, histogram_boundaries, program_id, aux_data,
dm_internal_suspend_fast, dm_internal_resume_fast, md);
- if (id < 0)
- return id;
+ if (id < 0) {
+ r = id;
+ goto ret;
+ }
snprintf(result, maxlen, "%d", id);
- return 1;
+ r = 1;
+ goto ret;
+
+ret_einval:
+ r = -EINVAL;
+ret:
+ kfree(histogram_boundaries);
+ return r;
}
static int message_stats_delete(struct mapped_device *md,
@@ -935,11 +1155,6 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
{
int r;
- if (dm_request_based(md)) {
- DMWARN("Statistics are only supported for bio-based devices");
- return -EOPNOTSUPP;
- }
-
/* All messages here must start with '@' */
if (!strcasecmp(argv[0], "@stats_create"))
r = message_stats_create(md, argc, argv, result, maxlen);
diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h
index e7c4984bf..f1c0956e3 100644
--- a/drivers/md/dm-stats.h
+++ b/drivers/md/dm-stats.h
@@ -18,6 +18,7 @@ struct dm_stats {
struct dm_stats_aux {
bool merged;
+ unsigned long long duration_ns;
};
void dm_stats_init(struct dm_stats *st);
@@ -30,7 +31,8 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
sector_t bi_sector, unsigned bi_sectors, bool end,
- unsigned long duration, struct dm_stats_aux *aux);
+ unsigned long duration_jiffies,
+ struct dm_stats_aux *aux);
static inline bool dm_stats_used(struct dm_stats *st)
{
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f8b37d4c0..a672a1502 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -451,10 +451,8 @@ int __init dm_stripe_init(void)
int r;
r = dm_register_target(&stripe_target);
- if (r < 0) {
+ if (r < 0)
DMWARN("target registration failed");
- return r;
- }
return r;
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 79f694120..6ba47cfb1 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -184,7 +184,6 @@ struct dm_pool_metadata {
uint64_t trans_id;
unsigned long flags;
sector_t data_block_size;
- bool read_only:1;
/*
* Set if a transaction has to be aborted but the attempt to roll back
@@ -836,7 +835,6 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
init_rwsem(&pmd->root_lock);
pmd->time = 0;
INIT_LIST_HEAD(&pmd->thin_devices);
- pmd->read_only = false;
pmd->fail_io = false;
pmd->bdev = bdev;
pmd->data_block_size = data_block_size;
@@ -880,7 +878,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return -EBUSY;
}
- if (!pmd->read_only && !pmd->fail_io) {
+ if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
r = __commit_transaction(pmd);
if (r < 0)
DMWARN("%s: __commit_transaction() failed, error = %d",
@@ -1295,8 +1293,8 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
return r;
disk_super = dm_block_data(copy);
- dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
- dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
+ dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
+ dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
dm_sm_dec_block(pmd->metadata_sm, held_root);
return dm_tm_unlock(pmd->tm, copy);
@@ -1392,10 +1390,11 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
dm_block_t keys[2] = { td->id, block };
struct dm_btree_info *info;
- if (pmd->fail_io)
- return -EINVAL;
-
down_read(&pmd->root_lock);
+ if (pmd->fail_io) {
+ up_read(&pmd->root_lock);
+ return -EINVAL;
+ }
if (can_issue_io) {
info = &pmd->info;
@@ -1419,6 +1418,63 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
return r;
}
+/* FIXME: write a more efficient one in btree */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end,
+ dm_block_t *thin_begin, dm_block_t *thin_end,
+ dm_block_t *pool_begin, bool *maybe_shared)
+{
+ int r;
+ dm_block_t pool_end;
+ struct dm_thin_lookup_result lookup;
+
+ if (end < begin)
+ return -ENODATA;
+
+ /*
+ * Find first mapped block.
+ */
+ while (begin < end) {
+ r = dm_thin_find_block(td, begin, true, &lookup);
+ if (r) {
+ if (r != -ENODATA)
+ return r;
+ } else
+ break;
+
+ begin++;
+ }
+
+ if (begin == end)
+ return -ENODATA;
+
+ *thin_begin = begin;
+ *pool_begin = lookup.block;
+ *maybe_shared = lookup.shared;
+
+ begin++;
+ pool_end = *pool_begin + 1;
+ while (begin != end) {
+ r = dm_thin_find_block(td, begin, true, &lookup);
+ if (r) {
+ if (r == -ENODATA)
+ break;
+ else
+ return r;
+ }
+
+ if ((lookup.block != pool_end) ||
+ (lookup.shared != *maybe_shared))
+ break;
+
+ pool_end++;
+ begin++;
+ }
+
+ *thin_end = begin;
+ return 0;
+}
+
static int __insert(struct dm_thin_device *td, dm_block_t block,
dm_block_t data_block)
{
@@ -1471,6 +1527,47 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
return 0;
}
+static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
+{
+ int r;
+ unsigned count;
+ struct dm_pool_metadata *pmd = td->pmd;
+ dm_block_t keys[1] = { td->id };
+ __le64 value;
+ dm_block_t mapping_root;
+
+ /*
+ * Find the mapping tree
+ */
+ r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
+ if (r)
+ return r;
+
+ /*
+ * Remove from the mapping tree, taking care to inc the
+ * ref count so it doesn't get deleted.
+ */
+ mapping_root = le64_to_cpu(value);
+ dm_tm_inc(pmd->tm, mapping_root);
+ r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
+ if (r)
+ return r;
+
+ r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
+ if (r)
+ return r;
+
+ td->mapped_blocks -= count;
+ td->changed = 1;
+
+ /*
+ * Reinsert the mapping tree.
+ */
+ value = cpu_to_le64(mapping_root);
+ __dm_bless_for_disk(&value);
+ return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
+}
+
int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
{
int r = -EINVAL;
@@ -1483,6 +1580,19 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
return r;
}
+int dm_thin_remove_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end)
+{
+ int r = -EINVAL;
+
+ down_write(&td->pmd->root_lock);
+ if (!td->pmd->fail_io)
+ r = __remove_range(td, begin, end);
+ up_write(&td->pmd->root_lock);
+
+ return r;
+}
+
int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
int r;
@@ -1739,7 +1849,6 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
down_write(&pmd->root_lock);
- pmd->read_only = true;
dm_bm_set_read_only(pmd->bm);
up_write(&pmd->root_lock);
}
@@ -1747,7 +1856,6 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
down_write(&pmd->root_lock);
- pmd->read_only = false;
dm_bm_set_read_write(pmd->bm);
up_write(&pmd->root_lock);
}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index fac01a96d..a938babe4 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -147,6 +147,15 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
int can_issue_io, struct dm_thin_lookup_result *result);
/*
+ * Retrieve the next run of contiguously mapped blocks. Useful for working
+ * out where to break up IO. Returns 0 on success, < 0 on error.
+ */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end,
+ dm_block_t *thin_begin, dm_block_t *thin_end,
+ dm_block_t *pool_begin, bool *maybe_shared);
+
+/*
* Obtain an unused block.
*/
int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result);
@@ -158,6 +167,8 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
dm_block_t data_block);
int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
+int dm_thin_remove_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end);
/*
* Queries.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e22e6c892..d2bbe8cc1 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -112,22 +112,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
/*
* Key building.
*/
-static void build_data_key(struct dm_thin_device *td,
- dm_block_t b, struct dm_cell_key *key)
+enum lock_space {
+ VIRTUAL,
+ PHYSICAL
+};
+
+static void build_key(struct dm_thin_device *td, enum lock_space ls,
+ dm_block_t b, dm_block_t e, struct dm_cell_key *key)
{
- key->virtual = 0;
+ key->virtual = (ls == VIRTUAL);
key->dev = dm_thin_dev_id(td);
key->block_begin = b;
- key->block_end = b + 1ULL;
+ key->block_end = e;
+}
+
+static void build_data_key(struct dm_thin_device *td, dm_block_t b,
+ struct dm_cell_key *key)
+{
+ build_key(td, PHYSICAL, b, b + 1llu, key);
}
static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
struct dm_cell_key *key)
{
- key->virtual = 1;
- key->dev = dm_thin_dev_id(td);
- key->block_begin = b;
- key->block_end = b + 1ULL;
+ build_key(td, VIRTUAL, b, b + 1llu, key);
}
/*----------------------------------------------------------------*/
@@ -313,6 +321,138 @@ struct thin_c {
/*----------------------------------------------------------------*/
+/**
+ * __blkdev_issue_discard_async - queue a discard with async completion
+ * @bdev: blockdev to issue discard for
+ * @sector: start sector
+ * @nr_sects: number of sectors to discard
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: BLKDEV_IFL_* flags to control behaviour
+ * @parent_bio: parent discard bio that all sub discards get chained to
+ *
+ * Description:
+ * Asynchronously issue a discard request for the sectors in question.
+ * NOTE: this variant of blk-core's blkdev_issue_discard() is a stop-gap
+ * that is being kept local to DM thinp until the block changes to allow
+ * late bio splitting land upstream.
+ */
+static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
+ struct bio *parent_bio)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ int type = REQ_WRITE | REQ_DISCARD;
+ unsigned int max_discard_sectors, granularity;
+ int alignment;
+ struct bio *bio;
+ int ret = 0;
+ struct blk_plug plug;
+
+ if (!q)
+ return -ENXIO;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ /* Zero-sector (unknown) and one-sector granularities are the same. */
+ granularity = max(q->limits.discard_granularity >> 9, 1U);
+ alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+ /*
+ * Ensure that max_discard_sectors is of the proper
+ * granularity, so that requests stay aligned after a split.
+ */
+ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+ max_discard_sectors -= max_discard_sectors % granularity;
+ if (unlikely(!max_discard_sectors)) {
+ /* Avoid infinite loop below. Being cautious never hurts. */
+ return -EOPNOTSUPP;
+ }
+
+ if (flags & BLKDEV_DISCARD_SECURE) {
+ if (!blk_queue_secdiscard(q))
+ return -EOPNOTSUPP;
+ type |= REQ_SECURE;
+ }
+
+ blk_start_plug(&plug);
+ while (nr_sects) {
+ unsigned int req_sects;
+ sector_t end_sect, tmp;
+
+ /*
+ * Required bio_put occurs in bio_endio thanks to bio_chain below
+ */
+ bio = bio_alloc(gfp_mask, 1);
+ if (!bio) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+ /*
+ * If splitting a request, and the next starting sector would be
+ * misaligned, stop the discard at the previous aligned sector.
+ */
+ end_sect = sector + req_sects;
+ tmp = end_sect;
+ if (req_sects < nr_sects &&
+ sector_div(tmp, granularity) != alignment) {
+ end_sect = end_sect - alignment;
+ sector_div(end_sect, granularity);
+ end_sect = end_sect * granularity + alignment;
+ req_sects = end_sect - sector;
+ }
+
+ bio_chain(bio, parent_bio);
+
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+
+ bio->bi_iter.bi_size = req_sects << 9;
+ nr_sects -= req_sects;
+ sector = end_sect;
+
+ submit_bio(type, bio);
+
+ /*
+ * We can loop for a long time in here, if someone does
+ * full device discards (like mkfs). Be nice and allow
+ * us to schedule out to avoid softlocking if preempt
+ * is disabled.
+ */
+ cond_resched();
+ }
+ blk_finish_plug(&plug);
+
+ return ret;
+}
+
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+ return pool->sectors_per_block_shift >= 0;
+}
+
+static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
+{
+ return block_size_is_power_of_two(pool) ?
+ (b << pool->sectors_per_block_shift) :
+ (b * pool->sectors_per_block);
+}
+
+static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e,
+ struct bio *parent_bio)
+{
+ sector_t s = block_to_sectors(tc->pool, data_b);
+ sector_t len = block_to_sectors(tc->pool, data_e - data_b);
+
+ return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len,
+ GFP_NOWAIT, 0, parent_bio);
+}
+
+/*----------------------------------------------------------------*/
+
/*
* wake_worker() is used when new work is queued and when pool_resume is
* ready to continue deferred IO processing.
@@ -462,6 +602,7 @@ struct dm_thin_endio_hook {
struct dm_deferred_entry *all_io_entry;
struct dm_thin_new_mapping *overwrite_mapping;
struct rb_node rb_node;
+ struct dm_bio_prison_cell *cell;
};
static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
@@ -525,16 +666,21 @@ static void requeue_io(struct thin_c *tc)
requeue_deferred_cells(tc);
}
-static void error_retry_list(struct pool *pool)
+static void error_retry_list_with_code(struct pool *pool, int error)
{
struct thin_c *tc;
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list)
- error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
+ error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
rcu_read_unlock();
}
+static void error_retry_list(struct pool *pool)
+{
+ return error_retry_list_with_code(pool, -EIO);
+}
+
/*
* This section of code contains the logic for processing a thin device's IO.
* Much of the code depends on pool object resources (lists, workqueues, etc)
@@ -542,11 +688,6 @@ static void error_retry_list(struct pool *pool)
* target.
*/
-static bool block_size_is_power_of_two(struct pool *pool)
-{
- return pool->sectors_per_block_shift >= 0;
-}
-
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
@@ -560,6 +701,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
return block_nr;
}
+/*
+ * Returns the _complete_ blocks that this bio covers.
+ */
+static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
+ dm_block_t *begin, dm_block_t *end)
+{
+ struct pool *pool = tc->pool;
+ sector_t b = bio->bi_iter.bi_sector;
+ sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+
+ b += pool->sectors_per_block - 1ull; /* so we round up */
+
+ if (block_size_is_power_of_two(pool)) {
+ b >>= pool->sectors_per_block_shift;
+ e >>= pool->sectors_per_block_shift;
+ } else {
+ (void) sector_div(b, pool->sectors_per_block);
+ (void) sector_div(e, pool->sectors_per_block);
+ }
+
+ if (e < b)
+ /* Can happen if the bio is within a single block. */
+ e = b;
+
+ *begin = b;
+ *end = e;
+}
+
static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
struct pool *pool = tc->pool;
@@ -648,7 +817,7 @@ struct dm_thin_new_mapping {
struct list_head list;
bool pass_discard:1;
- bool definitely_not_shared:1;
+ bool maybe_shared:1;
/*
* Track quiescing, copying and zeroing preparation actions. When this
@@ -659,9 +828,9 @@ struct dm_thin_new_mapping {
int err;
struct thin_c *tc;
- dm_block_t virt_block;
+ dm_block_t virt_begin, virt_end;
dm_block_t data_block;
- struct dm_bio_prison_cell *cell, *cell2;
+ struct dm_bio_prison_cell *cell;
/*
* If the bio covers the whole area of a block then we can avoid
@@ -706,6 +875,8 @@ static void overwrite_endio(struct bio *bio, int err)
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct dm_thin_new_mapping *m = h->overwrite_mapping;
+ bio->bi_end_io = m->saved_bi_end_io;
+
m->err = err;
complete_mapping_preparation(m);
}
@@ -794,10 +965,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
- if (m->bio) {
- m->bio->bi_end_io = m->saved_bi_end_io;
- atomic_inc(&m->bio->bi_remaining);
- }
cell_error(m->tc->pool, m->cell);
list_del(&m->list);
mempool_free(m, m->tc->pool->mapping_pool);
@@ -807,15 +974,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
struct pool *pool = tc->pool;
- struct bio *bio;
+ struct bio *bio = m->bio;
int r;
- bio = m->bio;
- if (bio) {
- bio->bi_end_io = m->saved_bi_end_io;
- atomic_inc(&bio->bi_remaining);
- }
-
if (m->err) {
cell_error(pool, m->cell);
goto out;
@@ -826,7 +987,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
* Any I/O for this block arriving after this point will get
* remapped to it directly.
*/
- r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+ r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
if (r) {
metadata_operation_failed(pool, "dm_thin_insert_block", r);
cell_error(pool, m->cell);
@@ -853,50 +1014,112 @@ out:
mempool_free(m, pool->mapping_pool);
}
-static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+/*----------------------------------------------------------------*/
+
+static void free_discard_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
+ if (m->cell)
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+{
bio_io_error(m->bio);
+ free_discard_mapping(m);
+}
+
+static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
+{
+ bio_endio(m->bio, 0);
+ free_discard_mapping(m);
+}
+
+static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
+ if (r) {
+ metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
+ bio_io_error(m->bio);
+ } else
+ bio_endio(m->bio, 0);
+
cell_defer_no_holder(tc, m->cell);
- cell_defer_no_holder(tc, m->cell2);
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
{
+ /*
+ * We've already unmapped this range of blocks, but before we
+ * passdown we have to check that these blocks are now unused.
+ */
+ int r;
+ bool used = true;
struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
+ dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
- inc_all_io_entry(tc->pool, m->bio);
- cell_defer_no_holder(tc, m->cell);
- cell_defer_no_holder(tc, m->cell2);
+ while (b != end) {
+ /* find start of unmapped run */
+ for (; b < end; b++) {
+ r = dm_pool_block_is_used(pool->pmd, b, &used);
+ if (r)
+ return r;
- if (m->pass_discard)
- if (m->definitely_not_shared)
- remap_and_issue(tc, m->bio, m->data_block);
- else {
- bool used = false;
- if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
- bio_endio(m->bio, 0);
- else
- remap_and_issue(tc, m->bio, m->data_block);
+ if (!used)
+ break;
}
- else
- bio_endio(m->bio, 0);
- mempool_free(m, tc->pool->mapping_pool);
+ if (b == end)
+ break;
+
+ /* find end of run */
+ for (e = b + 1; e != end; e++) {
+ r = dm_pool_block_is_used(pool->pmd, e, &used);
+ if (r)
+ return r;
+
+ if (used)
+ break;
+ }
+
+ r = issue_discard(tc, b, e, m->bio);
+ if (r)
+ return r;
+
+ b = e;
+ }
+
+ return 0;
}
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
int r;
struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
- r = dm_thin_remove_block(tc->td, m->virt_block);
+ r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
if (r)
- DMERR_LIMIT("dm_thin_remove_block() failed");
+ metadata_operation_failed(pool, "dm_thin_remove_range", r);
- process_prepared_discard_passdown(m);
+ else if (m->maybe_shared)
+ r = passdown_double_checking_shared_status(m);
+ else
+ r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio);
+
+ /*
+ * Even if r is set, there could be sub discards in flight that we
+ * need to wait for.
+ */
+ bio_endio(m->bio, r);
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, pool->mapping_pool);
}
static void process_prepared(struct pool *pool, struct list_head *head,
@@ -980,7 +1203,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
}
static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
- dm_block_t data_block,
+ dm_block_t data_begin,
struct dm_thin_new_mapping *m)
{
struct pool *pool = tc->pool;
@@ -990,7 +1213,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
inc_all_io_entry(pool, bio);
- remap_and_issue(tc, bio, data_block);
+ remap_and_issue(tc, bio, data_begin);
}
/*
@@ -1007,7 +1230,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_thin_new_mapping *m = get_next_mapping(pool);
m->tc = tc;
- m->virt_block = virt_block;
+ m->virt_begin = virt_block;
+ m->virt_end = virt_block + 1u;
m->data_block = data_dest;
m->cell = cell;
@@ -1086,7 +1310,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
m->tc = tc;
- m->virt_block = virt_block;
+ m->virt_begin = virt_block;
+ m->virt_end = virt_block + 1u;
m->data_block = data_block;
m->cell = cell;
@@ -1095,16 +1320,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
* zeroing pre-existing data, we can issue the bio immediately.
* Otherwise we use kcopyd to zero the data first.
*/
- if (!pool->pf.zero_new_blocks)
+ if (pool->pf.zero_new_blocks) {
+ if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_block, m);
+ else
+ ll_zero(tc, m, data_block * pool->sectors_per_block,
+ (data_block + 1) * pool->sectors_per_block);
+ } else
process_prepared_mapping(m);
-
- else if (io_overwrites_block(pool, bio))
- remap_and_issue_overwrite(tc, bio, data_block, m);
-
- else
- ll_zero(tc, m,
- data_block * pool->sectors_per_block,
- (data_block + 1) * pool->sectors_per_block);
}
static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -1295,99 +1518,149 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
retry_on_resume(bio);
}
-static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void process_discard_cell_no_passdown(struct thin_c *tc,
+ struct dm_bio_prison_cell *virt_cell)
{
- int r;
- struct bio *bio = cell->holder;
struct pool *pool = tc->pool;
- struct dm_bio_prison_cell *cell2;
- struct dm_cell_key key2;
- dm_block_t block = get_bio_block(tc, bio);
- struct dm_thin_lookup_result lookup_result;
- struct dm_thin_new_mapping *m;
+ struct dm_thin_new_mapping *m = get_next_mapping(pool);
- if (tc->requeue_mode) {
- cell_requeue(pool, cell);
- return;
- }
+ /*
+ * We don't need to lock the data blocks, since there's no
+ * passdown. We only lock data blocks for allocation and breaking sharing.
+ */
+ m->tc = tc;
+ m->virt_begin = virt_cell->key.block_begin;
+ m->virt_end = virt_cell->key.block_end;
+ m->cell = virt_cell;
+ m->bio = virt_cell->holder;
- r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
- switch (r) {
- case 0:
- /*
- * Check nobody is fiddling with this pool block. This can
- * happen if someone's in the process of breaking sharing
- * on this block.
- */
- build_data_key(tc->td, lookup_result.block, &key2);
- if (bio_detain(tc->pool, &key2, bio, &cell2)) {
- cell_defer_no_holder(tc, cell);
- break;
- }
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+}
- if (io_overlaps_block(pool, bio)) {
- /*
- * IO may still be going to the destination block. We must
- * quiesce before we can do the removal.
- */
- m = get_next_mapping(pool);
- m->tc = tc;
- m->pass_discard = pool->pf.discard_passdown;
- m->definitely_not_shared = !lookup_result.shared;
- m->virt_block = block;
- m->data_block = lookup_result.block;
- m->cell = cell;
- m->cell2 = cell2;
- m->bio = bio;
-
- if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
- pool->process_prepared_discard(m);
+/*
+ * FIXME: DM local hack to defer parent bios's end_io until we
+ * _know_ all chained sub range discard bios have completed.
+ * Will go away once late bio splitting lands upstream!
+ */
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+ bio->bi_flags |= (1 << BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
- } else {
- inc_all_io_entry(pool, bio);
- cell_defer_no_holder(tc, cell);
- cell_defer_no_holder(tc, cell2);
+static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
+ struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+
+ int r;
+ bool maybe_shared;
+ struct dm_cell_key data_key;
+ struct dm_bio_prison_cell *data_cell;
+ struct dm_thin_new_mapping *m;
+ dm_block_t virt_begin, virt_end, data_begin;
+
+ while (begin != end) {
+ r = ensure_next_mapping(pool);
+ if (r)
+ /* we did our best */
+ return;
+ r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
+ &data_begin, &maybe_shared);
+ if (r)
/*
- * The DM core makes sure that the discard doesn't span
- * a block boundary. So we submit the discard of a
- * partial block appropriately.
+ * Silently fail, letting any mappings we've
+ * created complete.
*/
- if ((!lookup_result.shared) && pool->pf.discard_passdown)
- remap_and_issue(tc, bio, lookup_result.block);
- else
- bio_endio(bio, 0);
+ break;
+
+ build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
+ if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+ /* contention, we'll give up with this range */
+ begin = virt_end;
+ continue;
}
- break;
- case -ENODATA:
/*
- * It isn't provisioned, just forget it.
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
*/
- cell_defer_no_holder(tc, cell);
- bio_endio(bio, 0);
- break;
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->maybe_shared = maybe_shared;
+ m->virt_begin = virt_begin;
+ m->virt_end = virt_end;
+ m->data_block = data_begin;
+ m->cell = data_cell;
+ m->bio = bio;
- default:
- DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
- __func__, r);
- cell_defer_no_holder(tc, cell);
- bio_io_error(bio);
- break;
+ /*
+ * The parent bio must not complete before sub discard bios are
+ * chained to it (see __blkdev_issue_discard_async's bio_chain)!
+ *
+ * This per-mapping bi_remaining increment is paired with
+ * the implicit decrement that occurs via bio_endio() in
+ * process_prepared_discard_{passdown,no_passdown}.
+ */
+ __bio_inc_remaining(bio);
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+
+ begin = virt_end;
}
}
+static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
+{
+ struct bio *bio = virt_cell->holder;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+ /*
+ * The virt_cell will only get freed once the origin bio completes.
+ * This means it will remain locked while all the individual
+ * passdown bios are in flight.
+ */
+ h->cell = virt_cell;
+ break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
+
+ /*
+ * We complete the bio now, knowing that the bi_remaining field
+ * will prevent completion until the sub range discards have
+ * completed.
+ */
+ bio_endio(bio, 0);
+}
+
static void process_discard_bio(struct thin_c *tc, struct bio *bio)
{
- struct dm_bio_prison_cell *cell;
- struct dm_cell_key key;
- dm_block_t block = get_bio_block(tc, bio);
+ dm_block_t begin, end;
+ struct dm_cell_key virt_key;
+ struct dm_bio_prison_cell *virt_cell;
- build_virtual_key(tc->td, block, &key);
- if (bio_detain(tc->pool, &key, bio, &cell))
+ get_bio_block_range(tc, bio, &begin, &end);
+ if (begin == end) {
+ /*
+ * The discard covers less than a block.
+ */
+ bio_endio(bio, 0);
return;
+ }
- process_discard_cell(tc, cell);
+ build_key(tc->td, VIRTUAL, begin, end, &virt_key);
+ if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
+ /*
+ * Potential starvation issue: We're relying on the
+ * fs/application being well behaved, and not trying to
+ * send IO to a region at the same time as discarding it.
+ * If they do this persistently then it's possible this
+ * cell will never be granted.
+ */
+ return;
+
+ tc->pool->process_discard_cell(tc, virt_cell);
}
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
@@ -2014,18 +2287,23 @@ static void do_waker(struct work_struct *ws)
queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool);
+
/*
* We're holding onto IO to allow userland time to react. After the
* timeout either the pool will have been resized (and thus back in
- * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
*/
static void do_no_space_timeout(struct work_struct *ws)
{
struct pool *pool = container_of(to_delayed_work(ws), struct pool,
no_space_timeout);
- if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
- set_pool_mode(pool, PM_READ_ONLY);
+ if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
+ pool->pf.error_if_no_space = true;
+ notify_of_pool_mode_change_to_oods(pool);
+ error_retry_list_with_code(pool, -ENOSPC);
+ }
}
/*----------------------------------------------------------------*/
@@ -2103,6 +2381,32 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
dm_device_name(pool->pool_md), new_mode);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool)
+{
+ if (!pool->pf.error_if_no_space)
+ notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)");
+ else
+ notify_of_pool_mode_change(pool, "out-of-data-space (error IO)");
+}
+
+static bool passdown_enabled(struct pool_c *pt)
+{
+ return pt->adjusted_pf.discard_passdown;
+}
+
+static void set_discard_callbacks(struct pool *pool)
+{
+ struct pool_c *pt = pool->ti->private;
+
+ if (passdown_enabled(pt)) {
+ pool->process_discard_cell = process_discard_cell_passdown;
+ pool->process_prepared_discard = process_prepared_discard_passdown;
+ } else {
+ pool->process_discard_cell = process_discard_cell_no_passdown;
+ pool->process_prepared_discard = process_prepared_discard_no_passdown;
+ }
+}
+
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
struct pool_c *pt = pool->ti->private;
@@ -2154,7 +2458,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_cell = process_cell_read_only;
pool->process_discard_cell = process_cell_success;
pool->process_prepared_mapping = process_prepared_mapping_fail;
- pool->process_prepared_discard = process_prepared_discard_passdown;
+ pool->process_prepared_discard = process_prepared_discard_success;
error_retry_list(pool);
break;
@@ -2169,13 +2473,12 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
* frequently seeing this mode.
*/
if (old_mode != new_mode)
- notify_of_pool_mode_change(pool, "out-of-data-space");
+ notify_of_pool_mode_change_to_oods(pool);
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell_read_only;
- pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
- pool->process_prepared_discard = process_prepared_discard;
+ set_discard_callbacks(pool);
if (!pool->pf.error_if_no_space && no_space_timeout)
queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -2188,9 +2491,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_bio = process_bio;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell;
- pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
- pool->process_prepared_discard = process_prepared_discard;
+ set_discard_callbacks(pool);
break;
}
@@ -2279,6 +2581,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
h->shared_read_entry = NULL;
h->all_io_entry = NULL;
h->overwrite_mapping = NULL;
+ h->cell = NULL;
}
/*
@@ -2426,7 +2729,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
struct pool *pool = pt->pool;
struct block_device *data_bdev = pt->data_dev->bdev;
struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
- sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
const char *reason = NULL;
char buf[BDEVNAME_SIZE];
@@ -2439,12 +2741,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
else if (data_limits->max_discard_sectors < pool->sectors_per_block)
reason = "max discard sectors smaller than a block";
- else if (data_limits->discard_granularity > block_size)
- reason = "discard granularity larger than a block";
-
- else if (!is_factor(block_size, data_limits->discard_granularity))
- reason = "discard granularity not a factor of block size";
-
if (reason) {
DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
pt->adjusted_pf.discard_passdown = false;
@@ -3389,7 +3685,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
if (get_pool_mode(pool) >= PM_READ_ONLY) {
DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
dm_device_name(pool->pool_md));
- return -EINVAL;
+ return -EOPNOTSUPP;
}
if (!strcasecmp(argv[0], "create_thin"))
@@ -3447,6 +3743,7 @@ static void emit_flags(struct pool_features *pf, char *result,
* Status line is:
* <transaction id> <used metadata sectors>/<total metadata sectors>
* <used data sectors>/<total data sectors> <held metadata root>
+ * <pool mode> <discard config> <no space config> <needs_check>
*/
static void pool_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@@ -3548,6 +3845,11 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("queue_if_no_space ");
+ if (dm_pool_metadata_needs_check(pool->pmd))
+ DMEMIT("needs_check ");
+ else
+ DMEMIT("- ");
+
break;
case STATUSTYPE_TABLE:
@@ -3587,24 +3889,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
-static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
-{
- struct pool *pool = pt->pool;
- struct queue_limits *data_limits;
-
- limits->max_discard_sectors = pool->sectors_per_block;
-
- /*
- * discard_granularity is just a hint, and not enforced.
- */
- if (pt->adjusted_pf.discard_passdown) {
- data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
- limits->discard_granularity = max(data_limits->discard_granularity,
- pool->sectors_per_block << SECTOR_SHIFT);
- } else
- limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-}
-
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -3659,14 +3943,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
disable_passdown_if_not_supported(pt);
- set_discard_limits(pt, limits);
+ /*
+ * The pool uses the same discard limits as the underlying data
+ * device. DM core has already set this up.
+ */
}
static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 14, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -3825,8 +4112,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
- /* Discard bios must be split on a block boundary */
- ti->split_discard_bios = true;
+ ti->split_discard_bios = false;
}
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3913,6 +4199,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
}
}
+ if (h->cell)
+ cell_defer_no_holder(h->tc, h->cell);
+
return 0;
}
@@ -4040,9 +4329,18 @@ static int thin_iterate_devices(struct dm_target *ti,
return 0;
}
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
+
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+ limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+}
+
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 14, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
@@ -4054,6 +4352,7 @@ static struct target_type thin_target = {
.status = thin_status,
.merge = thin_merge,
.iterate_devices = thin_iterate_devices,
+ .io_hints = thin_io_hints,
};
/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 66616db33..bb9c6a00e 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -459,7 +459,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
bio->bi_end_io = io->orig_bi_end_io;
bio->bi_private = io->orig_bi_private;
- bio_endio_nodec(bio, error);
+ bio_endio(bio, error);
}
static void verity_work(struct work_struct *w)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 697f34fba..0d7ab20c5 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -86,6 +86,9 @@ struct dm_rq_target_io {
struct kthread_work work;
int error;
union map_info info;
+ struct dm_stats_aux stats_aux;
+ unsigned long duration_jiffies;
+ unsigned n_sectors;
};
/*
@@ -1046,6 +1049,17 @@ static struct dm_rq_target_io *tio_from_request(struct request *rq)
return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
}
+static void rq_end_stats(struct mapped_device *md, struct request *orig)
+{
+ if (unlikely(dm_stats_used(&md->stats))) {
+ struct dm_rq_target_io *tio = tio_from_request(orig);
+ tio->duration_jiffies = jiffies - tio->duration_jiffies;
+ dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+ tio->n_sectors, true, tio->duration_jiffies,
+ &tio->stats_aux);
+ }
+}
+
/*
* Don't touch any member of the md after calling this function because
* the md may be freed in dm_put() at the end of this function.
@@ -1127,6 +1141,7 @@ static void dm_end_request(struct request *clone, int error)
}
free_rq_clone(clone);
+ rq_end_stats(md, rq);
if (!rq->q->mq_ops)
blk_end_request_all(rq, error);
else
@@ -1162,13 +1177,14 @@ static void old_requeue_request(struct request *rq)
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
- struct request *rq)
+static void dm_requeue_original_request(struct mapped_device *md,
+ struct request *rq)
{
int rw = rq_data_dir(rq);
dm_unprep_request(rq);
+ rq_end_stats(md, rq);
if (!rq->q->mq_ops)
old_requeue_request(rq);
else {
@@ -1179,13 +1195,6 @@ static void dm_requeue_unmapped_original_request(struct mapped_device *md,
rq_completed(md, rw, false);
}
-static void dm_requeue_unmapped_request(struct request *clone)
-{
- struct dm_rq_target_io *tio = clone->end_io_data;
-
- dm_requeue_unmapped_original_request(tio->md, tio->orig);
-}
-
static void old_stop_queue(struct request_queue *q)
{
unsigned long flags;
@@ -1249,7 +1258,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
return;
else if (r == DM_ENDIO_REQUEUE)
/* The target wants to requeue the I/O */
- dm_requeue_unmapped_request(clone);
+ dm_requeue_original_request(tio->md, tio->orig);
else {
DMWARN("unimplemented target endio return value: %d", r);
BUG();
@@ -1267,6 +1276,7 @@ static void dm_softirq_done(struct request *rq)
int rw;
if (!clone) {
+ rq_end_stats(tio->md, rq);
rw = rq_data_dir(rq);
if (!rq->q->mq_ops) {
blk_end_request_all(rq, tio->error);
@@ -1987,7 +1997,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
break;
case DM_MAPIO_REQUEUE:
/* The target wants to requeue the I/O */
- dm_requeue_unmapped_request(clone);
+ dm_requeue_original_request(md, tio->orig);
break;
default:
if (r > 0) {
@@ -2010,7 +2020,7 @@ static void map_tio_request(struct kthread_work *work)
struct mapped_device *md = tio->md;
if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
- dm_requeue_unmapped_original_request(md, rq);
+ dm_requeue_original_request(md, rq);
}
static void dm_start_request(struct mapped_device *md, struct request *orig)
@@ -2027,6 +2037,14 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
md->last_rq_start_time = ktime_get();
}
+ if (unlikely(dm_stats_used(&md->stats))) {
+ struct dm_rq_target_io *tio = tio_from_request(orig);
+ tio->duration_jiffies = jiffies;
+ tio->n_sectors = blk_rq_sectors(orig);
+ dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+ tio->n_sectors, false, 0, &tio->stats_aux);
+ }
+
/*
* Hold the md reference here for the in-flight I/O.
* We can't rely on the reference count by device opener,
@@ -2157,7 +2175,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
* the query about congestion status of request_queue
*/
if (dm_request_based(md))
- r = md->queue->backing_dev_info.state &
+ r = md->queue->backing_dev_info.wb.state &
bdi_bits;
else
r = dm_table_any_congested(map, bdi_bits);
@@ -2250,6 +2268,40 @@ static void dm_init_old_md_queue(struct mapped_device *md)
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
}
+static void cleanup_mapped_device(struct mapped_device *md)
+{
+ if (md->wq)
+ destroy_workqueue(md->wq);
+ if (md->kworker_task)
+ kthread_stop(md->kworker_task);
+ if (md->io_pool)
+ mempool_destroy(md->io_pool);
+ if (md->rq_pool)
+ mempool_destroy(md->rq_pool);
+ if (md->bs)
+ bioset_free(md->bs);
+
+ cleanup_srcu_struct(&md->io_barrier);
+
+ if (md->disk) {
+ spin_lock(&_minor_lock);
+ md->disk->private_data = NULL;
+ spin_unlock(&_minor_lock);
+ if (blk_get_integrity(md->disk))
+ blk_integrity_unregister(md->disk);
+ del_gendisk(md->disk);
+ put_disk(md->disk);
+ }
+
+ if (md->queue)
+ blk_cleanup_queue(md->queue);
+
+ if (md->bdev) {
+ bdput(md->bdev);
+ md->bdev = NULL;
+ }
+}
+
/*
* Allocate and initialise a blank device with a given minor.
*/
@@ -2295,13 +2347,13 @@ static struct mapped_device *alloc_dev(int minor)
md->queue = blk_alloc_queue(GFP_KERNEL);
if (!md->queue)
- goto bad_queue;
+ goto bad;
dm_init_md_queue(md);
md->disk = alloc_disk(1);
if (!md->disk)
- goto bad_disk;
+ goto bad;
atomic_set(&md->pending[0], 0);
atomic_set(&md->pending[1], 0);
@@ -2322,11 +2374,11 @@ static struct mapped_device *alloc_dev(int minor)
md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
if (!md->wq)
- goto bad_thread;
+ goto bad;
md->bdev = bdget_disk(md->disk, 0);
if (!md->bdev)
- goto bad_bdev;
+ goto bad;
bio_init(&md->flush_bio);
md->flush_bio.bi_bdev = md->bdev;
@@ -2343,15 +2395,8 @@ static struct mapped_device *alloc_dev(int minor)
return md;
-bad_bdev:
- destroy_workqueue(md->wq);
-bad_thread:
- del_gendisk(md->disk);
- put_disk(md->disk);
-bad_disk:
- blk_cleanup_queue(md->queue);
-bad_queue:
- cleanup_srcu_struct(&md->io_barrier);
+bad:
+ cleanup_mapped_device(md);
bad_io_barrier:
free_minor(minor);
bad_minor:
@@ -2368,32 +2413,13 @@ static void free_dev(struct mapped_device *md)
int minor = MINOR(disk_devt(md->disk));
unlock_fs(md);
- destroy_workqueue(md->wq);
- if (md->kworker_task)
- kthread_stop(md->kworker_task);
- if (md->io_pool)
- mempool_destroy(md->io_pool);
- if (md->rq_pool)
- mempool_destroy(md->rq_pool);
- if (md->bs)
- bioset_free(md->bs);
+ cleanup_mapped_device(md);
+ if (md->use_blk_mq)
+ blk_mq_free_tag_set(&md->tag_set);
- cleanup_srcu_struct(&md->io_barrier);
free_table_devices(&md->table_devices);
dm_stats_cleanup(&md->stats);
-
- spin_lock(&_minor_lock);
- md->disk->private_data = NULL;
- spin_unlock(&_minor_lock);
- if (blk_get_integrity(md->disk))
- blk_integrity_unregister(md->disk);
- del_gendisk(md->disk);
- put_disk(md->disk);
- blk_cleanup_queue(md->queue);
- if (md->use_blk_mq)
- blk_mq_free_tag_set(&md->tag_set);
- bdput(md->bdev);
free_minor(minor);
module_put(THIS_MODULE);
@@ -2754,6 +2780,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
/* Direct call is fine since .queue_rq allows allocations */
if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
/* Undo dm_start_request() before requeuing */
+ rq_end_stats(md, rq);
rq_completed(md, rq_data_dir(rq), false);
return BLK_MQ_RQ_QUEUE_BUSY;
}
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6123c2bf9..4e984993d 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -14,6 +14,7 @@
#include <linux/device-mapper.h>
#include <linux/list.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/hdreg.h>
#include <linux/completion.h>
#include <linux/kobject.h>
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index fcfc4b9b2..007219051 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -44,6 +44,7 @@ struct resync_info {
/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
+#define MD_CLUSTER_SUSPEND_READ_BALANCING 2
struct md_cluster_info {
@@ -275,6 +276,9 @@ clear_bit:
static void recover_prep(void *arg)
{
+ struct mddev *mddev = arg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
static void recover_slot(void *arg, struct dlm_slot *slot)
@@ -307,6 +311,7 @@ static void recover_done(void *arg, struct dlm_slot *slots,
cinfo->slot_number = our_slot;
complete(&cinfo->completion);
+ clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
static const struct dlm_lockspace_ops md_ls_ops = {
@@ -816,12 +821,17 @@ static void resync_finish(struct mddev *mddev)
resync_send(mddev, RESYNCING, 0, 0);
}
-static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
+static int area_resyncing(struct mddev *mddev, int direction,
+ sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
int ret = 0;
struct suspend_info *s;
+ if ((direction == READ) &&
+ test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
+ return 1;
+
spin_lock_irq(&cinfo->suspend_lock);
if (list_empty(&cinfo->suspend_list))
goto out;
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 6817ee00e..00defe2ba 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -18,7 +18,7 @@ struct md_cluster_operations {
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
int (*metadata_update_cancel)(struct mddev *mddev);
- int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
+ int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
int (*add_new_disk_finish)(struct mddev *mddev);
int (*new_disk_ack)(struct mddev *mddev, bool ack);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0c699709f..e25f00f01 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2024,7 +2024,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
char b[BDEVNAME_SIZE];
struct kobject *ko;
- char *s;
int err;
/* prevent duplicates */
@@ -2070,8 +2069,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return -EBUSY;
}
bdevname(rdev->bdev,b);
- while ( (s=strchr(b, '/')) != NULL)
- *s = '!';
+ strreplace(b, '/', '!');
rdev->mddev = mddev;
printk(KERN_INFO "md: bind<%s>\n", b);
@@ -2630,13 +2628,14 @@ errors_show(struct md_rdev *rdev, char *page)
static ssize_t
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&rdev->corrected_errors, n);
- return len;
- }
- return -EINVAL;
+ unsigned int n;
+ int rv;
+
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&rdev->corrected_errors, n);
+ return len;
}
static struct rdev_sysfs_entry rdev_errors =
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
@@ -2653,13 +2652,16 @@ slot_show(struct md_rdev *rdev, char *page)
static ssize_t
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
+ int slot;
int err;
- int slot = simple_strtoul(buf, &e, 10);
+
if (strncmp(buf, "none", 4)==0)
slot = -1;
- else if (e==buf || (*e && *e!= '\n'))
- return -EINVAL;
+ else {
+ err = kstrtouint(buf, 10, (unsigned int *)&slot);
+ if (err < 0)
+ return err;
+ }
if (rdev->mddev->pers && slot == -1) {
/* Setting 'slot' on an active array requires also
* updating the 'rd%d' link, and communicating
@@ -3544,12 +3546,12 @@ layout_show(struct mddev *mddev, char *page)
static ssize_t
layout_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
int err;
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
return err;
@@ -3593,12 +3595,12 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks);
static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
+ unsigned int n;
int err;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
@@ -3645,12 +3647,12 @@ chunk_size_show(struct mddev *mddev, char *page)
static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long n;
int err;
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtoul(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
@@ -3688,19 +3690,24 @@ resync_start_show(struct mddev *mddev, char *page)
static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long long n;
int err;
- char *e;
- unsigned long long n = simple_strtoull(buf, &e, 10);
+
+ if (cmd_match(buf, "none"))
+ n = MaxSector;
+ else {
+ err = kstrtoull(buf, 10, &n);
+ if (err < 0)
+ return err;
+ if (n != (sector_t)n)
+ return -EINVAL;
+ }
err = mddev_lock(mddev);
if (err)
return err;
if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
err = -EBUSY;
- else if (cmd_match(buf, "none"))
- n = MaxSector;
- else if (!*buf || (*e && *e != '\n'))
- err = -EINVAL;
if (!err) {
mddev->recovery_cp = n;
@@ -3936,14 +3943,14 @@ max_corrected_read_errors_show(struct mddev *mddev, char *page) {
static ssize_t
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
+ int rv;
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&mddev->max_corr_read_errors, n);
- return len;
- }
- return -EINVAL;
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&mddev->max_corr_read_errors, n);
+ return len;
}
static struct md_sysfs_entry max_corr_read_errors =
@@ -4302,15 +4309,18 @@ sync_min_show(struct mddev *mddev, char *page)
static ssize_t
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
{
- int min;
- char *e;
+ unsigned int min;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_min = 0;
- return len;
+ min = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &min);
+ if (rv < 0)
+ return rv;
+ if (min == 0)
+ return -EINVAL;
}
- min = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || min <= 0)
- return -EINVAL;
mddev->sync_speed_min = min;
return len;
}
@@ -4328,15 +4338,18 @@ sync_max_show(struct mddev *mddev, char *page)
static ssize_t
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
{
- int max;
- char *e;
+ unsigned int max;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_max = 0;
- return len;
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
}
- max = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || max <= 0)
- return -EINVAL;
mddev->sync_speed_max = max;
return len;
}
@@ -4519,12 +4532,13 @@ suspend_lo_show(struct mddev *mddev, char *page)
static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
@@ -4561,12 +4575,13 @@ suspend_hi_show(struct mddev *mddev, char *page)
static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
@@ -4608,11 +4623,13 @@ static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
struct md_rdev *rdev;
- char *e;
+ unsigned long long new;
int err;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
if (err)
@@ -5365,6 +5382,8 @@ static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
mddev_detach(mddev);
+ /* Ensure ->event_work is done */
+ flush_workqueue(md_misc_wq);
spin_lock(&mddev->lock);
mddev->ready = 0;
mddev->pers = NULL;
@@ -5749,7 +5768,7 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
/* bitmap disabled, zero the first byte and copy out */
if (!mddev->bitmap_info.file)
file->pathname[0] = '\0';
- else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+ else if ((ptr = file_path(mddev->bitmap_info.file,
file->pathname, sizeof(file->pathname))),
IS_ERR(ptr))
err = PTR_ERR(ptr);
@@ -8111,6 +8130,15 @@ void md_check_recovery(struct mddev *mddev)
int spares = 0;
if (mddev->ro) {
+ struct md_rdev *rdev;
+ if (!mddev->external && mddev->in_sync)
+ /* 'Blocked' flag not needed as failed devices
+ * will be recorded if array switched to read/write.
+ * Leaving it set will prevent the device
+ * from being removed.
+ */
+ rdev_for_each(rdev, mddev)
+ clear_bit(Blocked, &rdev->flags);
/* On a read-only array we can:
* - remove failed devices
* - add already-in_sync devices if the array itself
@@ -9018,13 +9046,7 @@ static int get_ro(char *buffer, struct kernel_param *kp)
}
static int set_ro(const char *val, struct kernel_param *kp)
{
- char *e;
- int num = simple_strtoul(val, &e, 10);
- if (*val && (*e == '\0' || *e == '\n')) {
- start_readonly = num;
- return 0;
- }
- return -EINVAL;
+ return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4046a6c6f..7da6e9c3c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -16,6 +16,7 @@
#define _MD_MD_H
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 087411c95..4d6c9b689 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -609,6 +609,12 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
dm_bufio_prefetch(bm->bufio, b, 1);
}
+bool dm_bm_is_read_only(struct dm_block_manager *bm)
+{
+ return bm->read_only;
+}
+EXPORT_SYMBOL_GPL(dm_bm_is_read_only);
+
void dm_bm_set_read_only(struct dm_block_manager *bm)
{
bm->read_only = true;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 1b95dfc17..84330f598 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -123,6 +123,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
* Additionally you should not use dm_bm_unlock_move, however no error will
* be returned if you do.
*/
+bool dm_bm_is_read_only(struct dm_block_manager *bm);
void dm_bm_set_read_only(struct dm_block_manager *bm);
void dm_bm_set_read_write(struct dm_block_manager *bm);
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index bf2b80d5c..8731b6ea0 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -138,4 +138,10 @@ int lower_bound(struct btree_node *n, uint64_t key);
extern struct dm_block_validator btree_node_validator;
+/*
+ * Value type for upper levels of multi-level btrees.
+ */
+extern void init_le64_type(struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt);
+
#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index a03178e91..4222f774c 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -544,14 +544,6 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
return r;
}
-static struct dm_btree_value_type le64_type = {
- .context = NULL,
- .size = sizeof(__le64),
- .inc = NULL,
- .dec = NULL,
- .equal = NULL
-};
-
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root)
{
@@ -559,12 +551,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
int index = 0, r = 0;
struct shadow_spine spine;
struct btree_node *n;
+ struct dm_btree_value_type le64_vt;
+ init_le64_type(info->tm, &le64_vt);
init_shadow_spine(&spine, info);
for (level = 0; level < info->levels; level++) {
r = remove_raw(&spine, info,
(level == last_level ?
- &info->value_type : &le64_type),
+ &info->value_type : &le64_vt),
root, keys[level], (unsigned *)&index);
if (r < 0)
break;
@@ -590,3 +584,133 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);
+
+/*----------------------------------------------------------------*/
+
+static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
+ struct dm_btree_value_type *vt, dm_block_t root,
+ uint64_t key, int *index)
+{
+ int i = *index, r;
+ struct btree_node *n;
+
+ for (;;) {
+ r = shadow_step(s, root, vt);
+ if (r < 0)
+ break;
+
+ /*
+ * We have to patch up the parent node, ugly, but I don't
+ * see a way to do this automatically as part of the spine
+ * op.
+ */
+ if (shadow_has_parent(s)) {
+ __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
+ memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
+ &location, sizeof(__le64));
+ }
+
+ n = dm_block_data(shadow_current(s));
+
+ if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+ *index = lower_bound(n, key);
+ return 0;
+ }
+
+ r = rebalance_children(s, info, vt, key);
+ if (r)
+ break;
+
+ n = dm_block_data(shadow_current(s));
+ if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+ *index = lower_bound(n, key);
+ return 0;
+ }
+
+ i = lower_bound(n, key);
+
+ /*
+ * We know the key is present, or else
+ * rebalance_children would have returned
+ * -ENODATA
+ */
+ root = value64(n, i);
+ }
+
+ return r;
+}
+
+static int remove_one(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t end_key,
+ dm_block_t *new_root, unsigned *nr_removed)
+{
+ unsigned level, last_level = info->levels - 1;
+ int index = 0, r = 0;
+ struct shadow_spine spine;
+ struct btree_node *n;
+ struct dm_btree_value_type le64_vt;
+ uint64_t k;
+
+ init_le64_type(info->tm, &le64_vt);
+ init_shadow_spine(&spine, info);
+ for (level = 0; level < last_level; level++) {
+ r = remove_raw(&spine, info, &le64_vt,
+ root, keys[level], (unsigned *) &index);
+ if (r < 0)
+ goto out;
+
+ n = dm_block_data(shadow_current(&spine));
+ root = value64(n, index);
+ }
+
+ r = remove_nearest(&spine, info, &info->value_type,
+ root, keys[last_level], &index);
+ if (r < 0)
+ goto out;
+
+ n = dm_block_data(shadow_current(&spine));
+
+ if (index < 0)
+ index = 0;
+
+ if (index >= le32_to_cpu(n->header.nr_entries)) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ k = le64_to_cpu(n->keys[index]);
+ if (k >= keys[last_level] && k < end_key) {
+ if (info->value_type.dec)
+ info->value_type.dec(info->value_type.context,
+ value_ptr(n, index));
+
+ delete_at(n, index);
+ keys[last_level] = k + 1ull;
+
+ } else
+ r = -ENODATA;
+
+out:
+ *new_root = shadow_root(&spine);
+ exit_shadow_spine(&spine);
+
+ return r;
+}
+
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *first_key, uint64_t end_key,
+ dm_block_t *new_root, unsigned *nr_removed)
+{
+ int r;
+
+ *nr_removed = 0;
+ do {
+ r = remove_one(info, root, first_key, end_key, &root, nr_removed);
+ if (!r)
+ (*nr_removed)++;
+ } while (!r);
+
+ *new_root = root;
+ return r == -ENODATA ? 0 : r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index 1b5e13ec7..0dee514ba 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -249,3 +249,40 @@ int shadow_root(struct shadow_spine *s)
{
return s->root;
}
+
+static void le64_inc(void *context, const void *value_le)
+{
+ struct dm_transaction_manager *tm = context;
+ __le64 v_le;
+
+ memcpy(&v_le, value_le, sizeof(v_le));
+ dm_tm_inc(tm, le64_to_cpu(v_le));
+}
+
+static void le64_dec(void *context, const void *value_le)
+{
+ struct dm_transaction_manager *tm = context;
+ __le64 v_le;
+
+ memcpy(&v_le, value_le, sizeof(v_le));
+ dm_tm_dec(tm, le64_to_cpu(v_le));
+}
+
+static int le64_equal(void *context, const void *value1_le, const void *value2_le)
+{
+ __le64 v1_le, v2_le;
+
+ memcpy(&v1_le, value1_le, sizeof(v1_le));
+ memcpy(&v2_le, value2_le, sizeof(v2_le));
+ return v1_le == v2_le;
+}
+
+void init_le64_type(struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt)
+{
+ vt->context = tm;
+ vt->size = sizeof(__le64);
+ vt->inc = le64_inc;
+ vt->dec = le64_dec;
+ vt->equal = le64_equal;
+}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index fdd3793e2..c7726cebc 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -667,12 +667,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
struct btree_node *n;
struct dm_btree_value_type le64_type;
- le64_type.context = NULL;
- le64_type.size = sizeof(__le64);
- le64_type.inc = NULL;
- le64_type.dec = NULL;
- le64_type.equal = NULL;
-
+ init_le64_type(info->tm, &le64_type);
init_shadow_spine(&spine, info);
for (level = 0; level < (info->levels - 1); level++) {
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index dacfc3418..11d8cf786 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -135,6 +135,15 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root);
/*
+ * Removes values between 'keys' and keys2, where keys2 is keys with the
+ * final key replaced with 'end_key'. 'end_key' is the one-past-the-end
+ * value. 'keys' may be altered.
+ */
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t end_key,
+ dm_block_t *new_root, unsigned *nr_removed);
+
+/*
* Returns < 0 on failure. Otherwise the number of key entries that have
* been filled out. Remember trees can have zero entries, and as such have
* no lowest key.
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5ce3cd5c4..967a4ed73 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -541,7 +541,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if ((conf->mddev->recovery_cp < this_sector + sectors) ||
(mddev_is_clustered(conf->mddev) &&
- md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+ md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
choose_first = 1;
else
@@ -745,7 +745,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
struct r1conf *conf = mddev->private;
int i, ret = 0;
- if ((bits & (1 << BDI_async_congested)) &&
+ if ((bits & (1 << WB_async_congested)) &&
conf->pending_count >= max_queued_requests)
return 1;
@@ -760,7 +760,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
/* Note the '|| 1' - when read_balance prefers
* non-congested targets, it can be removed
*/
- if ((bits & (1<<BDI_async_congested)) || 1)
+ if ((bits & (1 << WB_async_congested)) || 1)
ret |= bdi_congested(&q->backing_dev_info, bits);
else
ret &= bdi_congested(&q->backing_dev_info, bits);
@@ -1111,7 +1111,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
((bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
(mddev_is_clustered(mddev) &&
- md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
+ md_cluster_ops->area_resyncing(mddev, WRITE,
+ bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
@@ -1124,7 +1125,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
if (bio_end_sector(bio) <= mddev->suspend_lo ||
bio->bi_iter.bi_sector >= mddev->suspend_hi ||
(mddev_is_clustered(mddev) &&
- !md_cluster_ops->area_resyncing(mddev,
+ !md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))))
break;
schedule();
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f55c3f35b..38c58e19c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -914,7 +914,7 @@ static int raid10_congested(struct mddev *mddev, int bits)
struct r10conf *conf = mddev->private;
int i, ret = 0;
- if ((bits & (1 << BDI_async_congested)) &&
+ if ((bits & (1 << WB_async_congested)) &&
conf->pending_count >= max_queued_requests)
return 1;
@@ -2099,17 +2099,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
tbio->bi_rw = WRITE;
tbio->bi_private = r10_bio;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
-
- for (j=0; j < vcnt ; j++) {
- tbio->bi_io_vec[j].bv_offset = 0;
- tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
-
- memcpy(page_address(tbio->bi_io_vec[j].bv_page),
- page_address(fbio->bi_io_vec[j].bv_page),
- PAGE_SIZE);
- }
tbio->bi_end_io = end_sync_write;
+ bio_copy_data(tbio, fbio);
+
d = r10_bio->devs[i].devnum;
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
@@ -2124,17 +2117,14 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* that are active
*/
for (i = 0; i < conf->copies; i++) {
- int j, d;
+ int d;
tbio = r10_bio->devs[i].repl_bio;
if (!tbio || !tbio->bi_end_io)
continue;
if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
&& r10_bio->devs[i].bio != fbio)
- for (j = 0; j < vcnt; j++)
- memcpy(page_address(tbio->bi_io_vec[j].bv_page),
- page_address(fbio->bi_io_vec[j].bv_page),
- PAGE_SIZE);
+ bio_copy_data(tbio, fbio);
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
md_sync_acct(conf->mirrors[d].replacement->bdev,
@@ -3566,6 +3556,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
/* far_copies must be 1 */
conf->prev.stride = conf->dev_sectors;
}
+ conf->reshape_safe = conf->reshape_progress;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
@@ -3770,7 +3761,6 @@ static int run(struct mddev *mddev)
}
conf->offset_diff = min_offset_diff;
- conf->reshape_safe = conf->reshape_progress;
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -4113,6 +4103,7 @@ static int raid10_start_reshape(struct mddev *mddev)
conf->reshape_progress = size;
} else
conf->reshape_progress = 0;
+ conf->reshape_safe = conf->reshape_progress;
spin_unlock_irq(&conf->device_lock);
if (mddev->delta_disks && mddev->bitmap) {
@@ -4180,6 +4171,7 @@ abort:
rdev->new_data_offset = rdev->data_offset;
smp_wmb();
conf->reshape_progress = MaxSector;
+ conf->reshape_safe = MaxSector;
mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
return ret;
@@ -4534,6 +4526,7 @@ static void end_reshape(struct r10conf *conf)
md_finish_reshape(conf->mddev);
smp_wmb();
conf->reshape_progress = MaxSector;
+ conf->reshape_safe = MaxSector;
spin_unlock_irq(&conf->device_lock);
/* read-ahead size must cover two whole stripes, which is
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b6793d2e0..f757023fc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
int hash)
{
int size;
- bool do_wakeup = false;
+ unsigned long do_wakeup = 0;
+ int i = 0;
unsigned long flags;
if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,15 +366,21 @@ static void release_inactive_stripe_list(struct r5conf *conf,
!list_empty(list))
atomic_dec(&conf->empty_inactive_list_nr);
list_splice_tail_init(list, conf->inactive_list + hash);
- do_wakeup = true;
+ do_wakeup |= 1 << hash;
spin_unlock_irqrestore(conf->hash_locks + hash, flags);
}
size--;
hash--;
}
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+ if (do_wakeup & (1 << i))
+ wake_up(&conf->wait_for_stripe[i]);
+ }
+
if (do_wakeup) {
- wake_up(&conf->wait_for_stripe);
+ if (atomic_read(&conf->active_stripes) == 0)
+ wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
md_wakeup_thread(conf->mddev->thread);
}
@@ -667,15 +674,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
spin_lock_irq(conf->hash_locks + hash);
do {
- wait_event_lock_irq(conf->wait_for_stripe,
+ wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
sh = get_free_stripe(conf, hash);
- if (!sh && llist_empty(&conf->released_stripes) &&
- !test_bit(R5_DID_ALLOC, &conf->cache_state))
+ if (!sh && !test_bit(R5_DID_ALLOC,
+ &conf->cache_state))
set_bit(R5_ALLOC_MORE,
&conf->cache_state);
}
@@ -684,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
- wait_event_lock_irq(
- conf->wait_for_stripe,
+ wait_event_exclusive_cmd(
+ conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash) &&
(atomic_read(&conf->active_stripes)
< (conf->max_nr_stripes * 3 / 4)
|| !test_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state)),
- *(conf->hash_locks + hash));
+ spin_unlock_irq(conf->hash_locks + hash),
+ spin_lock_irq(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
} else {
@@ -716,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
}
} while (sh == NULL);
+ if (!list_empty(conf->inactive_list + hash))
+ wake_up(&conf->wait_for_stripe[hash]);
+
spin_unlock_irq(conf->hash_locks + hash);
return sh;
}
@@ -2151,6 +2162,9 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (!sc)
return -ENOMEM;
+ /* Need to ensure auto-resizing doesn't interfere */
+ mutex_lock(&conf->cache_size_mutex);
+
for (i = conf->max_nr_stripes; i; i--) {
nsh = alloc_stripe(sc, GFP_KERNEL);
if (!nsh)
@@ -2167,6 +2181,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
kmem_cache_free(sc, nsh);
}
kmem_cache_destroy(sc);
+ mutex_unlock(&conf->cache_size_mutex);
return -ENOMEM;
}
/* Step 2 - Must use GFP_NOIO now.
@@ -2177,7 +2192,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
cnt = 0;
list_for_each_entry(nsh, &newstripes, lru) {
lock_device_hash_lock(conf, hash);
- wait_event_cmd(conf->wait_for_stripe,
+ wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash),
unlock_device_hash_lock(conf, hash),
lock_device_hash_lock(conf, hash));
@@ -2213,6 +2228,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
} else
err = -ENOMEM;
+ mutex_unlock(&conf->cache_size_mutex);
/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2240,7 +2256,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
static int drop_one_stripe(struct r5conf *conf)
{
struct stripe_head *sh;
- int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
+ int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
spin_lock_irq(conf->hash_locks + hash);
sh = get_free_stripe(conf, hash);
@@ -4050,8 +4066,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
&first_bad, &bad_sectors))
set_bit(R5_ReadRepl, &dev->flags);
else {
- if (rdev)
+ if (rdev && !test_bit(Faulty, &rdev->flags))
set_bit(R5_NeedReplace, &dev->flags);
+ else
+ clear_bit(R5_NeedReplace, &dev->flags);
rdev = rcu_dereference(conf->disks[i].rdev);
clear_bit(R5_ReadRepl, &dev->flags);
}
@@ -4760,7 +4778,7 @@ static void raid5_align_endio(struct bio *bi, int error)
raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
- wake_up(&conf->wait_for_stripe);
+ wake_up(&conf->wait_for_quiescent);
return;
}
@@ -4855,7 +4873,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_iter.bi_sector += rdev->data_offset;
spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_for_stripe,
+ wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0,
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
@@ -5699,7 +5717,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
bio_endio(raid_bio, 0);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
- wake_up(&conf->wait_for_stripe);
+ wake_up(&conf->wait_for_quiescent);
return handled;
}
@@ -5846,12 +5864,14 @@ static void raid5d(struct md_thread *thread)
pr_debug("%d stripes handled\n", handled);
spin_unlock_irq(&conf->device_lock);
- if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+ if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
+ mutex_trylock(&conf->cache_size_mutex)) {
grow_one_stripe(conf, __GFP_NOWARN);
/* Set flag even if allocation failed. This helps
* slow down allocation requests when mem is short
*/
set_bit(R5_DID_ALLOC, &conf->cache_state);
+ mutex_unlock(&conf->cache_size_mutex);
}
async_tx_issue_pending_all();
@@ -5883,18 +5903,22 @@ raid5_set_cache_size(struct mddev *mddev, int size)
return -EINVAL;
conf->min_nr_stripes = size;
+ mutex_lock(&conf->cache_size_mutex);
while (size < conf->max_nr_stripes &&
drop_one_stripe(conf))
;
+ mutex_unlock(&conf->cache_size_mutex);
err = md_allow_write(mddev);
if (err)
return err;
+ mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
if (!grow_one_stripe(conf, GFP_KERNEL))
break;
+ mutex_unlock(&conf->cache_size_mutex);
return 0;
}
@@ -6360,11 +6384,19 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
- int ret = 0;
- while (ret < sc->nr_to_scan) {
- if (drop_one_stripe(conf) == 0)
- return SHRINK_STOP;
- ret++;
+ unsigned long ret = SHRINK_STOP;
+
+ if (mutex_trylock(&conf->cache_size_mutex)) {
+ ret= 0;
+ while (ret < sc->nr_to_scan &&
+ conf->max_nr_stripes > conf->min_nr_stripes) {
+ if (drop_one_stripe(conf) == 0) {
+ ret = SHRINK_STOP;
+ break;
+ }
+ ret++;
+ }
+ mutex_unlock(&conf->cache_size_mutex);
}
return ret;
}
@@ -6433,7 +6465,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
- init_waitqueue_head(&conf->wait_for_stripe);
+ mutex_init(&conf->cache_size_mutex);
+ init_waitqueue_head(&conf->wait_for_quiescent);
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+ init_waitqueue_head(&conf->wait_for_stripe[i]);
+ }
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->hold_list);
@@ -7466,7 +7502,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
* active stripes can drain
*/
conf->quiesce = 2;
- wait_event_cmd(conf->wait_for_stripe,
+ wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
unlock_all_device_hash_locks_irq(conf),
@@ -7480,7 +7516,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
case 0: /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
- wake_up(&conf->wait_for_stripe);
+ wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 896d603ad..d05144278 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -482,7 +482,8 @@ struct r5conf {
*/
int active_name;
char cache_name[2][32];
- struct kmem_cache *slab_cache; /* for allocating stripes */
+ struct kmem_cache *slab_cache; /* for allocating stripes */
+ struct mutex cache_size_mutex; /* Protect changes to cache size */
int seq_flush, seq_write;
int quiesce;
@@ -511,7 +512,8 @@ struct r5conf {
struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
atomic_t empty_inactive_list_nr;
struct llist_head released_stripes;
- wait_queue_head_t wait_for_stripe;
+ wait_queue_head_t wait_for_quiescent;
+ wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS];
wait_queue_head_t wait_for_overlap;
unsigned long cache_state;
#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked,