diff options
Diffstat (limited to 'drivers/md')
31 files changed, 799 insertions, 1989 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0a2e7273d..02a5345a4 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -249,6 +249,7 @@ config DM_DEBUG_BLOCK_STACK_TRACING block manager locking used by thin provisioning and caching. If unsure, say N. + config DM_BIO_PRISON tristate depends on BLK_DEV_DM @@ -304,16 +305,6 @@ config DM_CACHE algorithms used to select which blocks are promoted, demoted, cleaned etc. It supports writeback and writethrough modes. -config DM_CACHE_MQ - tristate "MQ Cache Policy (EXPERIMENTAL)" - depends on DM_CACHE - default y - ---help--- - A cache policy that uses a multiqueue ordered by recent hit - count to select which blocks should be promoted and demoted. - This is meant to be a general purpose policy. It prioritises - reads over writes. - config DM_CACHE_SMQ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)" depends on DM_CACHE diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 62a65764e..52ba8dd82 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -12,7 +12,6 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o -dm-cache-mq-y += dm-cache-policy-mq.o dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o @@ -55,7 +54,6 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o -obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index d80cce499..3fe86b54d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -98,7 +98,6 @@ __acquires(bitmap->lock) bitmap->bp[page].hijacked) { /* somebody beat us to getting the page */ kfree(mappage); - return 0; } else { /* no page was in place and we have one, so install it */ @@ -323,7 +322,7 @@ __clear_page_buffers(struct page *page) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } static void free_buffers(struct page *page) { @@ -510,8 +509,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) sb->chunksize = cpu_to_le32(chunksize); daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; - if (!daemon_sleep || - (daemon_sleep < 1) || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { + if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); daemon_sleep = 5 * HZ; } @@ -1675,6 +1673,9 @@ static void bitmap_free(struct bitmap *bitmap) if (!bitmap) /* there was no bitmap */ return; + if (bitmap->sysfs_can_clear) + sysfs_put(bitmap->sysfs_can_clear); + if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) md_cluster_stop(bitmap->mddev); @@ -1714,15 +1715,13 @@ void bitmap_destroy(struct mddev *mddev) if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - if (bitmap->sysfs_can_clear) - sysfs_put(bitmap->sysfs_can_clear); - bitmap_free(bitmap); } /* * initialize the bitmap structure * if this returns an error, bitmap_destroy must be called to do clean up + * once mddev->bitmap is set */ struct bitmap *bitmap_create(struct mddev *mddev, int slot) { @@ -1867,8 +1866,10 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, struct bitmap_counts *counts; struct bitmap *bitmap = bitmap_create(mddev, slot); - if (IS_ERR(bitmap)) + if (IS_ERR(bitmap)) { + bitmap_free(bitmap); return PTR_ERR(bitmap); + } rv = bitmap_init_from_disk(bitmap, 0); if (rv) @@ -2172,14 +2173,14 @@ location_store(struct mddev *mddev, const char *buf, size_t len) else { mddev->bitmap = bitmap; rv = bitmap_load(mddev); - if (rv) { - bitmap_destroy(mddev); + if (rv) mddev->bitmap_info.offset = 0; - } } mddev->pers->quiesce(mddev, 0); - if (rv) + if (rv) { + bitmap_destroy(mddev); return rv; + } } } } diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 7d5c3a610..5e3fcd6ec 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -49,8 +49,8 @@ * When we set a bit, or in the counter (to start a write), if the fields is * 0, we first set the disk bit and set the counter to 1. * - * If the counter is 0, the on-disk bit is clear and the stipe is clean - * Anything that dirties the stipe pushes the counter to 2 (at least) + * If the counter is 0, the on-disk bit is clear and the stripe is clean + * Anything that dirties the stripe pushes the counter to 2 (at least) * and sets the on-disk bit (lazily). * If a periodic sweep find the counter at 2, it is decremented to 1. * If the sweep find the counter at 1, the on-disk bit is cleared and the diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c deleted file mode 100644 index ddb26980c..000000000 --- a/drivers/md/dm-cache-policy-mq.c +++ /dev/null @@ -1,1473 +0,0 @@ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - * - * This file is released under the GPL. - */ - -#include "dm-cache-policy.h" -#include "dm.h" - -#include <linux/hash.h> -#include <linux/jiffies.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> - -#define DM_MSG_PREFIX "cache-policy-mq" - -static struct kmem_cache *mq_entry_cache; - -/*----------------------------------------------------------------*/ - -static unsigned next_power(unsigned n, unsigned min) -{ - return roundup_pow_of_two(max(n, min)); -} - -/*----------------------------------------------------------------*/ - -/* - * Large, sequential ios are probably better left on the origin device since - * spindles tend to have good bandwidth. - * - * The io_tracker tries to spot when the io is in one of these sequential - * modes. - * - * Two thresholds to switch between random and sequential io mode are defaulting - * as follows and can be adjusted via the constructor and message interfaces. - */ -#define RANDOM_THRESHOLD_DEFAULT 4 -#define SEQUENTIAL_THRESHOLD_DEFAULT 512 - -enum io_pattern { - PATTERN_SEQUENTIAL, - PATTERN_RANDOM -}; - -struct io_tracker { - enum io_pattern pattern; - - unsigned nr_seq_samples; - unsigned nr_rand_samples; - unsigned thresholds[2]; - - dm_oblock_t last_end_oblock; -}; - -static void iot_init(struct io_tracker *t, - int sequential_threshold, int random_threshold) -{ - t->pattern = PATTERN_RANDOM; - t->nr_seq_samples = 0; - t->nr_rand_samples = 0; - t->last_end_oblock = 0; - t->thresholds[PATTERN_RANDOM] = random_threshold; - t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold; -} - -static enum io_pattern iot_pattern(struct io_tracker *t) -{ - return t->pattern; -} - -static void iot_update_stats(struct io_tracker *t, struct bio *bio) -{ - if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1) - t->nr_seq_samples++; - else { - /* - * Just one non-sequential IO is enough to reset the - * counters. - */ - if (t->nr_seq_samples) { - t->nr_seq_samples = 0; - t->nr_rand_samples = 0; - } - - t->nr_rand_samples++; - } - - t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1); -} - -static void iot_check_for_pattern_switch(struct io_tracker *t) -{ - switch (t->pattern) { - case PATTERN_SEQUENTIAL: - if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) { - t->pattern = PATTERN_RANDOM; - t->nr_seq_samples = t->nr_rand_samples = 0; - } - break; - - case PATTERN_RANDOM: - if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) { - t->pattern = PATTERN_SEQUENTIAL; - t->nr_seq_samples = t->nr_rand_samples = 0; - } - break; - } -} - -static void iot_examine_bio(struct io_tracker *t, struct bio *bio) -{ - iot_update_stats(t, bio); - iot_check_for_pattern_switch(t); -} - -/*----------------------------------------------------------------*/ - - -/* - * This queue is divided up into different levels. Allowing us to push - * entries to the back of any of the levels. Think of it as a partially - * sorted queue. - */ -#define NR_QUEUE_LEVELS 16u -#define NR_SENTINELS NR_QUEUE_LEVELS * 3 - -#define WRITEBACK_PERIOD HZ - -struct queue { - unsigned nr_elts; - bool current_writeback_sentinels; - unsigned long next_writeback; - struct list_head qs[NR_QUEUE_LEVELS]; - struct list_head sentinels[NR_SENTINELS]; -}; - -static void queue_init(struct queue *q) -{ - unsigned i; - - q->nr_elts = 0; - q->current_writeback_sentinels = false; - q->next_writeback = 0; - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - INIT_LIST_HEAD(q->qs + i); - INIT_LIST_HEAD(q->sentinels + i); - INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i); - INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i); - } -} - -static unsigned queue_size(struct queue *q) -{ - return q->nr_elts; -} - -static bool queue_empty(struct queue *q) -{ - return q->nr_elts == 0; -} - -/* - * Insert an entry to the back of the given level. - */ -static void queue_push(struct queue *q, unsigned level, struct list_head *elt) -{ - q->nr_elts++; - list_add_tail(elt, q->qs + level); -} - -static void queue_remove(struct queue *q, struct list_head *elt) -{ - q->nr_elts--; - list_del(elt); -} - -static bool is_sentinel(struct queue *q, struct list_head *h) -{ - return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS)); -} - -/* - * Gives us the oldest entry of the lowest popoulated level. If the first - * level is emptied then we shift down one level. - */ -static struct list_head *queue_peek(struct queue *q) -{ - unsigned level; - struct list_head *h; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) - if (!is_sentinel(q, h)) - return h; - - return NULL; -} - -static struct list_head *queue_pop(struct queue *q) -{ - struct list_head *r = queue_peek(q); - - if (r) { - q->nr_elts--; - list_del(r); - } - - return r; -} - -/* - * Pops an entry from a level that is not past a sentinel. - */ -static struct list_head *queue_pop_old(struct queue *q) -{ - unsigned level; - struct list_head *h; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) { - if (is_sentinel(q, h)) - break; - - q->nr_elts--; - list_del(h); - return h; - } - - return NULL; -} - -static struct list_head *list_pop(struct list_head *lh) -{ - struct list_head *r = lh->next; - - BUG_ON(!r); - list_del_init(r); - - return r; -} - -static struct list_head *writeback_sentinel(struct queue *q, unsigned level) -{ - if (q->current_writeback_sentinels) - return q->sentinels + NR_QUEUE_LEVELS + level; - else - return q->sentinels + 2 * NR_QUEUE_LEVELS + level; -} - -static void queue_update_writeback_sentinels(struct queue *q) -{ - unsigned i; - struct list_head *h; - - if (time_after(jiffies, q->next_writeback)) { - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - h = writeback_sentinel(q, i); - list_del(h); - list_add_tail(h, q->qs + i); - } - - q->next_writeback = jiffies + WRITEBACK_PERIOD; - q->current_writeback_sentinels = !q->current_writeback_sentinels; - } -} - -/* - * Sometimes we want to iterate through entries that have been pushed since - * a certain event. We use sentinel entries on the queues to delimit these - * 'tick' events. - */ -static void queue_tick(struct queue *q) -{ - unsigned i; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - list_del(q->sentinels + i); - list_add_tail(q->sentinels + i, q->qs + i); - } -} - -typedef void (*iter_fn)(struct list_head *, void *); -static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context) -{ - unsigned i; - struct list_head *h; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - list_for_each_prev(h, q->qs + i) { - if (is_sentinel(q, h)) - break; - - fn(h, context); - } - } -} - -/*----------------------------------------------------------------*/ - -/* - * Describes a cache entry. Used in both the cache and the pre_cache. - */ -struct entry { - struct hlist_node hlist; - struct list_head list; - dm_oblock_t oblock; - - /* - * FIXME: pack these better - */ - bool dirty:1; - unsigned hit_count; -}; - -/* - * Rather than storing the cblock in an entry, we allocate all entries in - * an array, and infer the cblock from the entry position. - * - * Free entries are linked together into a list. - */ -struct entry_pool { - struct entry *entries, *entries_end; - struct list_head free; - unsigned nr_allocated; -}; - -static int epool_init(struct entry_pool *ep, unsigned nr_entries) -{ - unsigned i; - - ep->entries = vzalloc(sizeof(struct entry) * nr_entries); - if (!ep->entries) - return -ENOMEM; - - ep->entries_end = ep->entries + nr_entries; - - INIT_LIST_HEAD(&ep->free); - for (i = 0; i < nr_entries; i++) - list_add(&ep->entries[i].list, &ep->free); - - ep->nr_allocated = 0; - - return 0; -} - -static void epool_exit(struct entry_pool *ep) -{ - vfree(ep->entries); -} - -static struct entry *alloc_entry(struct entry_pool *ep) -{ - struct entry *e; - - if (list_empty(&ep->free)) - return NULL; - - e = list_entry(list_pop(&ep->free), struct entry, list); - INIT_LIST_HEAD(&e->list); - INIT_HLIST_NODE(&e->hlist); - ep->nr_allocated++; - - return e; -} - -/* - * This assumes the cblock hasn't already been allocated. - */ -static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) -{ - struct entry *e = ep->entries + from_cblock(cblock); - - list_del_init(&e->list); - INIT_HLIST_NODE(&e->hlist); - ep->nr_allocated++; - - return e; -} - -static void free_entry(struct entry_pool *ep, struct entry *e) -{ - BUG_ON(!ep->nr_allocated); - ep->nr_allocated--; - INIT_HLIST_NODE(&e->hlist); - list_add(&e->list, &ep->free); -} - -/* - * Returns NULL if the entry is free. - */ -static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) -{ - struct entry *e = ep->entries + from_cblock(cblock); - return !hlist_unhashed(&e->hlist) ? e : NULL; -} - -static bool epool_empty(struct entry_pool *ep) -{ - return list_empty(&ep->free); -} - -static bool in_pool(struct entry_pool *ep, struct entry *e) -{ - return e >= ep->entries && e < ep->entries_end; -} - -static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e) -{ - return to_cblock(e - ep->entries); -} - -/*----------------------------------------------------------------*/ - -struct mq_policy { - struct dm_cache_policy policy; - - /* protects everything */ - struct mutex lock; - dm_cblock_t cache_size; - struct io_tracker tracker; - - /* - * Entries come from two pools, one of pre-cache entries, and one - * for the cache proper. - */ - struct entry_pool pre_cache_pool; - struct entry_pool cache_pool; - - /* - * We maintain three queues of entries. The cache proper, - * consisting of a clean and dirty queue, contains the currently - * active mappings. Whereas the pre_cache tracks blocks that - * are being hit frequently and potential candidates for promotion - * to the cache. - */ - struct queue pre_cache; - struct queue cache_clean; - struct queue cache_dirty; - - /* - * Keeps track of time, incremented by the core. We use this to - * avoid attributing multiple hits within the same tick. - * - * Access to tick_protected should be done with the spin lock held. - * It's copied to tick at the start of the map function (within the - * mutex). - */ - spinlock_t tick_lock; - unsigned tick_protected; - unsigned tick; - - /* - * A count of the number of times the map function has been called - * and found an entry in the pre_cache or cache. Currently used to - * calculate the generation. - */ - unsigned hit_count; - - /* - * A generation is a longish period that is used to trigger some - * book keeping effects. eg, decrementing hit counts on entries. - * This is needed to allow the cache to evolve as io patterns - * change. - */ - unsigned generation; - unsigned generation_period; /* in lookups (will probably change) */ - - unsigned discard_promote_adjustment; - unsigned read_promote_adjustment; - unsigned write_promote_adjustment; - - /* - * The hash table allows us to quickly find an entry by origin - * block. Both pre_cache and cache entries are in here. - */ - unsigned nr_buckets; - dm_block_t hash_bits; - struct hlist_head *table; -}; - -#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 -#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 -#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 -#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128 - -/*----------------------------------------------------------------*/ - -/* - * Simple hash table implementation. Should replace with the standard hash - * table that's making its way upstream. - */ -static void hash_insert(struct mq_policy *mq, struct entry *e) -{ - unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits); - - hlist_add_head(&e->hlist, mq->table + h); -} - -static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock) -{ - unsigned h = hash_64(from_oblock(oblock), mq->hash_bits); - struct hlist_head *bucket = mq->table + h; - struct entry *e; - - hlist_for_each_entry(e, bucket, hlist) - if (e->oblock == oblock) { - hlist_del(&e->hlist); - hlist_add_head(&e->hlist, bucket); - return e; - } - - return NULL; -} - -static void hash_remove(struct entry *e) -{ - hlist_del(&e->hlist); -} - -/*----------------------------------------------------------------*/ - -static bool any_free_cblocks(struct mq_policy *mq) -{ - return !epool_empty(&mq->cache_pool); -} - -static bool any_clean_cblocks(struct mq_policy *mq) -{ - return !queue_empty(&mq->cache_clean); -} - -/*----------------------------------------------------------------*/ - -/* - * Now we get to the meat of the policy. This section deals with deciding - * when to to add entries to the pre_cache and cache, and move between - * them. - */ - -/* - * The queue level is based on the log2 of the hit count. - */ -static unsigned queue_level(struct entry *e) -{ - return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); -} - -static bool in_cache(struct mq_policy *mq, struct entry *e) -{ - return in_pool(&mq->cache_pool, e); -} - -/* - * Inserts the entry into the pre_cache or the cache. Ensures the cache - * block is marked as allocated if necc. Inserts into the hash table. - * Sets the tick which records when the entry was last moved about. - */ -static void push(struct mq_policy *mq, struct entry *e) -{ - hash_insert(mq, e); - - if (in_cache(mq, e)) - queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, - queue_level(e), &e->list); - else - queue_push(&mq->pre_cache, queue_level(e), &e->list); -} - -/* - * Removes an entry from pre_cache or cache. Removes from the hash table. - */ -static void del(struct mq_policy *mq, struct entry *e) -{ - if (in_cache(mq, e)) - queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list); - else - queue_remove(&mq->pre_cache, &e->list); - - hash_remove(e); -} - -/* - * Like del, except it removes the first entry in the queue (ie. the least - * recently used). - */ -static struct entry *pop(struct mq_policy *mq, struct queue *q) -{ - struct entry *e; - struct list_head *h = queue_pop(q); - - if (!h) - return NULL; - - e = container_of(h, struct entry, list); - hash_remove(e); - - return e; -} - -static struct entry *pop_old(struct mq_policy *mq, struct queue *q) -{ - struct entry *e; - struct list_head *h = queue_pop_old(q); - - if (!h) - return NULL; - - e = container_of(h, struct entry, list); - hash_remove(e); - - return e; -} - -static struct entry *peek(struct queue *q) -{ - struct list_head *h = queue_peek(q); - return h ? container_of(h, struct entry, list) : NULL; -} - -/* - * The promotion threshold is adjusted every generation. As are the counts - * of the entries. - * - * At the moment the threshold is taken by averaging the hit counts of some - * of the entries in the cache (the first 20 entries across all levels in - * ascending order, giving preference to the clean entries at each level). - * - * We can be much cleverer than this though. For example, each promotion - * could bump up the threshold helping to prevent churn. Much more to do - * here. - */ - -#define MAX_TO_AVERAGE 20 - -static void check_generation(struct mq_policy *mq) -{ - unsigned total = 0, nr = 0, count = 0, level; - struct list_head *head; - struct entry *e; - - if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) { - mq->hit_count = 0; - mq->generation++; - - for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { - head = mq->cache_clean.qs + level; - list_for_each_entry(e, head, list) { - nr++; - total += e->hit_count; - - if (++count >= MAX_TO_AVERAGE) - break; - } - - head = mq->cache_dirty.qs + level; - list_for_each_entry(e, head, list) { - nr++; - total += e->hit_count; - - if (++count >= MAX_TO_AVERAGE) - break; - } - } - } -} - -/* - * Whenever we use an entry we bump up it's hit counter, and push it to the - * back to it's current level. - */ -static void requeue(struct mq_policy *mq, struct entry *e) -{ - check_generation(mq); - del(mq, e); - push(mq, e); -} - -/* - * Demote the least recently used entry from the cache to the pre_cache. - * Returns the new cache entry to use, and the old origin block it was - * mapped to. - * - * We drop the hit count on the demoted entry back to 1 to stop it bouncing - * straight back into the cache if it's subsequently hit. There are - * various options here, and more experimentation would be good: - * - * - just forget about the demoted entry completely (ie. don't insert it - into the pre_cache). - * - divide the hit count rather that setting to some hard coded value. - * - set the hit count to a hard coded value other than 1, eg, is it better - * if it goes in at level 2? - */ -static int demote_cblock(struct mq_policy *mq, - struct policy_locker *locker, dm_oblock_t *oblock) -{ - struct entry *demoted = peek(&mq->cache_clean); - - if (!demoted) - /* - * We could get a block from mq->cache_dirty, but that - * would add extra latency to the triggering bio as it - * waits for the writeback. Better to not promote this - * time and hope there's a clean block next time this block - * is hit. - */ - return -ENOSPC; - - if (locker->fn(locker, demoted->oblock)) - /* - * We couldn't lock the demoted block. - */ - return -EBUSY; - - del(mq, demoted); - *oblock = demoted->oblock; - free_entry(&mq->cache_pool, demoted); - - /* - * We used to put the demoted block into the pre-cache, but I think - * it's simpler to just let it work it's way up from zero again. - * Stops blocks flickering in and out of the cache. - */ - - return 0; -} - -/* - * Entries in the pre_cache whose hit count passes the promotion - * threshold move to the cache proper. Working out the correct - * value for the promotion_threshold is crucial to this policy. - */ -static unsigned promote_threshold(struct mq_policy *mq) -{ - struct entry *e; - - if (any_free_cblocks(mq)) - return 0; - - e = peek(&mq->cache_clean); - if (e) - return e->hit_count; - - e = peek(&mq->cache_dirty); - if (e) - return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD; - - /* This should never happen */ - return 0; -} - -/* - * We modify the basic promotion_threshold depending on the specific io. - * - * If the origin block has been discarded then there's no cost to copy it - * to the cache. - * - * We bias towards reads, since they can be demoted at no cost if they - * haven't been dirtied. - */ -static unsigned adjusted_promote_threshold(struct mq_policy *mq, - bool discarded_oblock, int data_dir) -{ - if (data_dir == READ) - return promote_threshold(mq) + mq->read_promote_adjustment; - - if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { - /* - * We don't need to do any copying at all, so give this a - * very low threshold. - */ - return mq->discard_promote_adjustment; - } - - return promote_threshold(mq) + mq->write_promote_adjustment; -} - -static bool should_promote(struct mq_policy *mq, struct entry *e, - bool discarded_oblock, int data_dir) -{ - return e->hit_count >= - adjusted_promote_threshold(mq, discarded_oblock, data_dir); -} - -static int cache_entry_found(struct mq_policy *mq, - struct entry *e, - struct policy_result *result) -{ - requeue(mq, e); - - if (in_cache(mq, e)) { - result->op = POLICY_HIT; - result->cblock = infer_cblock(&mq->cache_pool, e); - } - - return 0; -} - -/* - * Moves an entry from the pre_cache to the cache. The main work is - * finding which cache block to use. - */ -static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, - struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct entry *new_e; - - /* Ensure there's a free cblock in the cache */ - if (epool_empty(&mq->cache_pool)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (r) { - result->op = POLICY_MISS; - return 0; - } - - } else - result->op = POLICY_NEW; - - new_e = alloc_entry(&mq->cache_pool); - BUG_ON(!new_e); - - new_e->oblock = e->oblock; - new_e->dirty = false; - new_e->hit_count = e->hit_count; - - del(mq, e); - free_entry(&mq->pre_cache_pool, e); - push(mq, new_e); - - result->cblock = infer_cblock(&mq->cache_pool, new_e); - - return 0; -} - -static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - int r = 0; - - if (!should_promote(mq, e, discarded_oblock, data_dir)) { - requeue(mq, e); - result->op = POLICY_MISS; - - } else if (!can_migrate) - r = -EWOULDBLOCK; - - else { - requeue(mq, e); - r = pre_cache_to_cache(mq, e, locker, result); - } - - return r; -} - -static void insert_in_pre_cache(struct mq_policy *mq, - dm_oblock_t oblock) -{ - struct entry *e = alloc_entry(&mq->pre_cache_pool); - - if (!e) - /* - * There's no spare entry structure, so we grab the least - * used one from the pre_cache. - */ - e = pop(mq, &mq->pre_cache); - - if (unlikely(!e)) { - DMWARN("couldn't pop from pre cache"); - return; - } - - e->dirty = false; - e->oblock = oblock; - e->hit_count = 1; - push(mq, e); -} - -static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, - struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct entry *e; - - if (epool_empty(&mq->cache_pool)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (unlikely(r)) { - result->op = POLICY_MISS; - insert_in_pre_cache(mq, oblock); - return; - } - - /* - * This will always succeed, since we've just demoted. - */ - e = alloc_entry(&mq->cache_pool); - BUG_ON(!e); - - } else { - e = alloc_entry(&mq->cache_pool); - result->op = POLICY_NEW; - } - - e->oblock = oblock; - e->dirty = false; - e->hit_count = 1; - push(mq, e); - - result->cblock = infer_cblock(&mq->cache_pool, e); -} - -static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) { - if (can_migrate) - insert_in_cache(mq, oblock, locker, result); - else - return -EWOULDBLOCK; - } else { - insert_in_pre_cache(mq, oblock); - result->op = POLICY_MISS; - } - - return 0; -} - -/* - * Looks the oblock up in the hash table, then decides whether to put in - * pre_cache, or cache etc. - */ -static int map(struct mq_policy *mq, dm_oblock_t oblock, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - int r = 0; - struct entry *e = hash_lookup(mq, oblock); - - if (e && in_cache(mq, e)) - r = cache_entry_found(mq, e, result); - - else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] && - iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) - result->op = POLICY_MISS; - - else if (e) - r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, - data_dir, locker, result); - - else - r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, - data_dir, locker, result); - - if (r == -EWOULDBLOCK) - result->op = POLICY_MISS; - - return r; -} - -/*----------------------------------------------------------------*/ - -/* - * Public interface, via the policy struct. See dm-cache-policy.h for a - * description of these. - */ - -static struct mq_policy *to_mq_policy(struct dm_cache_policy *p) -{ - return container_of(p, struct mq_policy, policy); -} - -static void mq_destroy(struct dm_cache_policy *p) -{ - struct mq_policy *mq = to_mq_policy(p); - - vfree(mq->table); - epool_exit(&mq->cache_pool); - epool_exit(&mq->pre_cache_pool); - kfree(mq); -} - -static void update_pre_cache_hits(struct list_head *h, void *context) -{ - struct entry *e = container_of(h, struct entry, list); - e->hit_count++; -} - -static void update_cache_hits(struct list_head *h, void *context) -{ - struct mq_policy *mq = context; - struct entry *e = container_of(h, struct entry, list); - e->hit_count++; - mq->hit_count++; -} - -static void copy_tick(struct mq_policy *mq) -{ - unsigned long flags, tick; - - spin_lock_irqsave(&mq->tick_lock, flags); - tick = mq->tick_protected; - if (tick != mq->tick) { - queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq); - queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq); - queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq); - mq->tick = tick; - } - - queue_tick(&mq->pre_cache); - queue_tick(&mq->cache_dirty); - queue_tick(&mq->cache_clean); - queue_update_writeback_sentinels(&mq->cache_dirty); - spin_unlock_irqrestore(&mq->tick_lock, flags); -} - -static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - - result->op = POLICY_MISS; - - if (can_block) - mutex_lock(&mq->lock); - else if (!mutex_trylock(&mq->lock)) - return -EWOULDBLOCK; - - copy_tick(mq); - - iot_examine_bio(&mq->tracker, bio); - r = map(mq, oblock, can_migrate, discarded_oblock, - bio_data_dir(bio), locker, result); - - mutex_unlock(&mq->lock); - - return r; -} - -static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - struct entry *e; - - if (!mutex_trylock(&mq->lock)) - return -EWOULDBLOCK; - - e = hash_lookup(mq, oblock); - if (e && in_cache(mq, e)) { - *cblock = infer_cblock(&mq->cache_pool, e); - r = 0; - } else - r = -ENOENT; - - mutex_unlock(&mq->lock); - - return r; -} - -static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set) -{ - struct entry *e; - - e = hash_lookup(mq, oblock); - BUG_ON(!e || !in_cache(mq, e)); - - del(mq, e); - e->dirty = set; - push(mq, e); -} - -static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __mq_set_clear_dirty(mq, oblock, true); - mutex_unlock(&mq->lock); -} - -static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __mq_set_clear_dirty(mq, oblock, false); - mutex_unlock(&mq->lock); -} - -static int mq_load_mapping(struct dm_cache_policy *p, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) -{ - struct mq_policy *mq = to_mq_policy(p); - struct entry *e; - - e = alloc_particular_entry(&mq->cache_pool, cblock); - e->oblock = oblock; - e->dirty = false; /* this gets corrected in a minute */ - e->hit_count = hint_valid ? hint : 1; - push(mq, e); - - return 0; -} - -static int mq_save_hints(struct mq_policy *mq, struct queue *q, - policy_walk_fn fn, void *context) -{ - int r; - unsigned level; - struct list_head *h; - struct entry *e; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) { - if (is_sentinel(q, h)) - continue; - - e = container_of(h, struct entry, list); - r = fn(context, infer_cblock(&mq->cache_pool, e), - e->oblock, e->hit_count); - if (r) - return r; - } - - return 0; -} - -static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, - void *context) -{ - struct mq_policy *mq = to_mq_policy(p); - int r = 0; - - mutex_lock(&mq->lock); - - r = mq_save_hints(mq, &mq->cache_clean, fn, context); - if (!r) - r = mq_save_hints(mq, &mq->cache_dirty, fn, context); - - mutex_unlock(&mq->lock); - - return r; -} - -static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) -{ - struct entry *e; - - e = hash_lookup(mq, oblock); - BUG_ON(!e || !in_cache(mq, e)); - - del(mq, e); - free_entry(&mq->cache_pool, e); -} - -static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __remove_mapping(mq, oblock); - mutex_unlock(&mq->lock); -} - -static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock) -{ - struct entry *e = epool_find(&mq->cache_pool, cblock); - - if (!e) - return -ENODATA; - - del(mq, e); - free_entry(&mq->cache_pool, e); - - return 0; -} - -static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - r = __remove_cblock(mq, cblock); - mutex_unlock(&mq->lock); - - return r; -} - -#define CLEAN_TARGET_PERCENTAGE 25 - -static bool clean_target_met(struct mq_policy *mq) -{ - /* - * Cache entries may not be populated. So we're cannot rely on the - * size of the clean queue. - */ - unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty); - unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100; - - return nr_clean >= target; -} - -static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, - dm_cblock_t *cblock) -{ - struct entry *e = pop_old(mq, &mq->cache_dirty); - - if (!e && !clean_target_met(mq)) - e = pop(mq, &mq->cache_dirty); - - if (!e) - return -ENODATA; - - *oblock = e->oblock; - *cblock = infer_cblock(&mq->cache_pool, e); - e->dirty = false; - push(mq, e); - - return 0; -} - -static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - r = __mq_writeback_work(mq, oblock, cblock); - mutex_unlock(&mq->lock); - - return r; -} - -static void __force_mapping(struct mq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct entry *e = hash_lookup(mq, current_oblock); - - if (e && in_cache(mq, e)) { - del(mq, e); - e->oblock = new_oblock; - e->dirty = true; - push(mq, e); - } -} - -static void mq_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __force_mapping(mq, current_oblock, new_oblock); - mutex_unlock(&mq->lock); -} - -static dm_cblock_t mq_residency(struct dm_cache_policy *p) -{ - dm_cblock_t r; - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - r = to_cblock(mq->cache_pool.nr_allocated); - mutex_unlock(&mq->lock); - - return r; -} - -static void mq_tick(struct dm_cache_policy *p, bool can_block) -{ - struct mq_policy *mq = to_mq_policy(p); - unsigned long flags; - - spin_lock_irqsave(&mq->tick_lock, flags); - mq->tick_protected++; - spin_unlock_irqrestore(&mq->tick_lock, flags); - - if (can_block) { - mutex_lock(&mq->lock); - copy_tick(mq); - mutex_unlock(&mq->lock); - } -} - -static int mq_set_config_value(struct dm_cache_policy *p, - const char *key, const char *value) -{ - struct mq_policy *mq = to_mq_policy(p); - unsigned long tmp; - - if (kstrtoul(value, 10, &tmp)) - return -EINVAL; - - if (!strcasecmp(key, "random_threshold")) { - mq->tracker.thresholds[PATTERN_RANDOM] = tmp; - - } else if (!strcasecmp(key, "sequential_threshold")) { - mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp; - - } else if (!strcasecmp(key, "discard_promote_adjustment")) - mq->discard_promote_adjustment = tmp; - - else if (!strcasecmp(key, "read_promote_adjustment")) - mq->read_promote_adjustment = tmp; - - else if (!strcasecmp(key, "write_promote_adjustment")) - mq->write_promote_adjustment = tmp; - - else - return -EINVAL; - - return 0; -} - -static int mq_emit_config_values(struct dm_cache_policy *p, char *result, - unsigned maxlen, ssize_t *sz_ptr) -{ - ssize_t sz = *sz_ptr; - struct mq_policy *mq = to_mq_policy(p); - - DMEMIT("10 random_threshold %u " - "sequential_threshold %u " - "discard_promote_adjustment %u " - "read_promote_adjustment %u " - "write_promote_adjustment %u ", - mq->tracker.thresholds[PATTERN_RANDOM], - mq->tracker.thresholds[PATTERN_SEQUENTIAL], - mq->discard_promote_adjustment, - mq->read_promote_adjustment, - mq->write_promote_adjustment); - - *sz_ptr = sz; - return 0; -} - -/* Init the policy plugin interface function pointers. */ -static void init_policy_functions(struct mq_policy *mq) -{ - mq->policy.destroy = mq_destroy; - mq->policy.map = mq_map; - mq->policy.lookup = mq_lookup; - mq->policy.set_dirty = mq_set_dirty; - mq->policy.clear_dirty = mq_clear_dirty; - mq->policy.load_mapping = mq_load_mapping; - mq->policy.walk_mappings = mq_walk_mappings; - mq->policy.remove_mapping = mq_remove_mapping; - mq->policy.remove_cblock = mq_remove_cblock; - mq->policy.writeback_work = mq_writeback_work; - mq->policy.force_mapping = mq_force_mapping; - mq->policy.residency = mq_residency; - mq->policy.tick = mq_tick; - mq->policy.emit_config_values = mq_emit_config_values; - mq->policy.set_config_value = mq_set_config_value; -} - -static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) -{ - struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); - - if (!mq) - return NULL; - - init_policy_functions(mq); - iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); - mq->cache_size = cache_size; - - if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) { - DMERR("couldn't initialize pool of pre-cache entries"); - goto bad_pre_cache_init; - } - - if (epool_init(&mq->cache_pool, from_cblock(cache_size))) { - DMERR("couldn't initialize pool of cache entries"); - goto bad_cache_init; - } - - mq->tick_protected = 0; - mq->tick = 0; - mq->hit_count = 0; - mq->generation = 0; - mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; - mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; - mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; - mutex_init(&mq->lock); - spin_lock_init(&mq->tick_lock); - - queue_init(&mq->pre_cache); - queue_init(&mq->cache_clean); - queue_init(&mq->cache_dirty); - - mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); - - mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); - mq->hash_bits = __ffs(mq->nr_buckets); - mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); - if (!mq->table) - goto bad_alloc_table; - - return &mq->policy; - -bad_alloc_table: - epool_exit(&mq->cache_pool); -bad_cache_init: - epool_exit(&mq->pre_cache_pool); -bad_pre_cache_init: - kfree(mq); - - return NULL; -} - -/*----------------------------------------------------------------*/ - -static struct dm_cache_policy_type mq_policy_type = { - .name = "mq", - .version = {1, 4, 0}, - .hint_size = 4, - .owner = THIS_MODULE, - .create = mq_create -}; - -static int __init mq_init(void) -{ - int r; - - mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry", - sizeof(struct entry), - __alignof__(struct entry), - 0, NULL); - if (!mq_entry_cache) - return -ENOMEM; - - r = dm_cache_policy_register(&mq_policy_type); - if (r) { - DMERR("register failed %d", r); - kmem_cache_destroy(mq_entry_cache); - return -ENOMEM; - } - - return 0; -} - -static void __exit mq_exit(void) -{ - dm_cache_policy_unregister(&mq_policy_type); - - kmem_cache_destroy(mq_entry_cache); -} - -module_init(mq_init); -module_exit(mq_exit); - -MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("mq cache policy"); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 28d458674..cf48a617a 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1567,8 +1567,48 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block) spin_unlock_irqrestore(&mq->lock, flags); } +/* + * smq has no config values, but the old mq policy did. To avoid breaking + * software we continue to accept these configurables for the mq policy, + * but they have no effect. + */ +static int mq_set_config_value(struct dm_cache_policy *p, + const char *key, const char *value) +{ + unsigned long tmp; + + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + if (!strcasecmp(key, "random_threshold") || + !strcasecmp(key, "sequential_threshold") || + !strcasecmp(key, "discard_promote_adjustment") || + !strcasecmp(key, "read_promote_adjustment") || + !strcasecmp(key, "write_promote_adjustment")) { + DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key); + return 0; + } + + return -EINVAL; +} + +static int mq_emit_config_values(struct dm_cache_policy *p, char *result, + unsigned maxlen, ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + + DMEMIT("10 random_threshold 0 " + "sequential_threshold 0 " + "discard_promote_adjustment 0 " + "read_promote_adjustment 0 " + "write_promote_adjustment 0 "); + + *sz_ptr = sz; + return 0; +} + /* Init the policy plugin interface function pointers. */ -static void init_policy_functions(struct smq_policy *mq) +static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) { mq->policy.destroy = smq_destroy; mq->policy.map = smq_map; @@ -1583,6 +1623,11 @@ static void init_policy_functions(struct smq_policy *mq) mq->policy.force_mapping = smq_force_mapping; mq->policy.residency = smq_residency; mq->policy.tick = smq_tick; + + if (mimic_mq) { + mq->policy.set_config_value = mq_set_config_value; + mq->policy.emit_config_values = mq_emit_config_values; + } } static bool too_many_hotspot_blocks(sector_t origin_size, @@ -1606,9 +1651,10 @@ static void calc_hotspot_params(sector_t origin_size, *hotspot_block_size /= 2u; } -static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) +static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size, + bool mimic_mq) { unsigned i; unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; @@ -1618,7 +1664,7 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, if (!mq) return NULL; - init_policy_functions(mq); + init_policy_functions(mq, mimic_mq); mq->cache_size = cache_size; mq->cache_block_size = cache_block_size; @@ -1706,19 +1752,41 @@ bad_pool_init: return NULL; } +static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, false); +} + +static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, true); +} + /*----------------------------------------------------------------*/ static struct dm_cache_policy_type smq_policy_type = { .name = "smq", - .version = {1, 0, 0}, + .version = {1, 5, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create }; +static struct dm_cache_policy_type mq_policy_type = { + .name = "mq", + .version = {1, 5, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = mq_create, +}; + static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create, @@ -1735,9 +1803,17 @@ static int __init smq_init(void) return -ENOMEM; } + r = dm_cache_policy_register(&mq_policy_type); + if (r) { + DMERR("register failed (as mq) %d", r); + dm_cache_policy_unregister(&smq_policy_type); + return -ENOMEM; + } + r = dm_cache_policy_register(&default_policy_type); if (r) { DMERR("register failed (as default) %d", r); + dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&smq_policy_type); return -ENOMEM; } @@ -1748,6 +1824,7 @@ static int __init smq_init(void) static void __exit smq_exit(void) { dm_cache_policy_unregister(&smq_policy_type); + dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&default_policy_type); } @@ -1759,3 +1836,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("smq cache policy"); MODULE_ALIAS("dm-cache-default"); +MODULE_ALIAS("dm-cache-mq"); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index bb9b92ebb..ee0510f9a 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2776,7 +2776,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->split_discard_bios = false; cache->features = ca->features; - ti->per_bio_data_size = get_per_bio_data_size(cache); + ti->per_io_data_size = get_per_bio_data_size(cache); cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -3814,7 +3814,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 8, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3147c8d09..4f3cb3554 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -28,6 +28,7 @@ #include <crypto/hash.h> #include <crypto/md5.h> #include <crypto/algapi.h> +#include <crypto/skcipher.h> #include <linux/device-mapper.h> @@ -44,7 +45,7 @@ struct convert_context { struct bvec_iter iter_out; sector_t cc_sector; atomic_t cc_pending; - struct ablkcipher_request *req; + struct skcipher_request *req; }; /* @@ -86,7 +87,7 @@ struct crypt_iv_operations { }; struct iv_essiv_private { - struct crypto_hash *hash_tfm; + struct crypto_ahash *hash_tfm; u8 *salt; }; @@ -153,13 +154,13 @@ struct crypt_config { /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; - struct crypto_ablkcipher **tfms; + struct crypto_skcipher **tfms; unsigned tfms_count; /* * Layout of each crypto request: * - * struct ablkcipher_request + * struct skcipher_request * context * padding * struct dm_crypt_request @@ -189,7 +190,7 @@ static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); /* * Use this to access cipher attributes that are the same for each CPU. */ -static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) +static struct crypto_skcipher *any_tfm(struct crypt_config *cc) { return cc->tfms[0]; } @@ -263,23 +264,25 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_essiv_init(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - struct hash_desc desc; + AHASH_REQUEST_ON_STACK(req, essiv->hash_tfm); struct scatterlist sg; struct crypto_cipher *essiv_tfm; int err; sg_init_one(&sg, cc->key, cc->key_size); - desc.tfm = essiv->hash_tfm; - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ahash_request_set_tfm(req, essiv->hash_tfm); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + ahash_request_set_crypt(req, &sg, essiv->salt, cc->key_size); - err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); + err = crypto_ahash_digest(req); + ahash_request_zero(req); if (err) return err; essiv_tfm = cc->iv_private; err = crypto_cipher_setkey(essiv_tfm, essiv->salt, - crypto_hash_digestsize(essiv->hash_tfm)); + crypto_ahash_digestsize(essiv->hash_tfm)); if (err) return err; @@ -290,7 +293,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) static int crypt_iv_essiv_wipe(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + unsigned salt_size = crypto_ahash_digestsize(essiv->hash_tfm); struct crypto_cipher *essiv_tfm; int r, err = 0; @@ -320,7 +323,7 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, } if (crypto_cipher_blocksize(essiv_tfm) != - crypto_ablkcipher_ivsize(any_tfm(cc))) { + crypto_skcipher_ivsize(any_tfm(cc))) { ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; crypto_free_cipher(essiv_tfm); @@ -342,7 +345,7 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc) struct crypto_cipher *essiv_tfm; struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - crypto_free_hash(essiv->hash_tfm); + crypto_free_ahash(essiv->hash_tfm); essiv->hash_tfm = NULL; kzfree(essiv->salt); @@ -360,7 +363,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { struct crypto_cipher *essiv_tfm = NULL; - struct crypto_hash *hash_tfm = NULL; + struct crypto_ahash *hash_tfm = NULL; u8 *salt = NULL; int err; @@ -370,14 +373,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, } /* Allocate hash algorithm */ - hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); + hash_tfm = crypto_alloc_ahash(opts, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(hash_tfm)) { ti->error = "Error initializing ESSIV hash"; err = PTR_ERR(hash_tfm); goto bad; } - salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); + salt = kzalloc(crypto_ahash_digestsize(hash_tfm), GFP_KERNEL); if (!salt) { ti->error = "Error kmallocing salt storage in ESSIV"; err = -ENOMEM; @@ -388,7 +391,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, cc->iv_gen_private.essiv.hash_tfm = hash_tfm; essiv_tfm = setup_essiv_cpu(cc, ti, salt, - crypto_hash_digestsize(hash_tfm)); + crypto_ahash_digestsize(hash_tfm)); if (IS_ERR(essiv_tfm)) { crypt_iv_essiv_dtr(cc); return PTR_ERR(essiv_tfm); @@ -399,7 +402,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, bad: if (hash_tfm && !IS_ERR(hash_tfm)) - crypto_free_hash(hash_tfm); + crypto_free_ahash(hash_tfm); kfree(salt); return err; } @@ -419,7 +422,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); + unsigned bs = crypto_skcipher_blocksize(any_tfm(cc)); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -816,27 +819,27 @@ static void crypt_convert_init(struct crypt_config *cc, } static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, - struct ablkcipher_request *req) + struct skcipher_request *req) { return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); } -static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, +static struct skcipher_request *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { - return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); + return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start); } static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { return (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); + crypto_skcipher_alignmask(any_tfm(cc)) + 1); } static int crypt_convert_block(struct crypt_config *cc, struct convert_context *ctx, - struct ablkcipher_request *req) + struct skcipher_request *req) { struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); @@ -866,13 +869,13 @@ static int crypt_convert_block(struct crypt_config *cc, return r; } - ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, - 1 << SECTOR_SHIFT, iv); + skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, + 1 << SECTOR_SHIFT, iv); if (bio_data_dir(ctx->bio_in) == WRITE) - r = crypto_ablkcipher_encrypt(req); + r = crypto_skcipher_encrypt(req); else - r = crypto_ablkcipher_decrypt(req); + r = crypto_skcipher_decrypt(req); if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) r = cc->iv_gen_ops->post(cc, iv, dmreq); @@ -891,23 +894,23 @@ static void crypt_alloc_req(struct crypt_config *cc, if (!ctx->req) ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); + skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); /* * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs * requests if driver request queue is full. */ - ablkcipher_request_set_callback(ctx->req, + skcipher_request_set_callback(ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, kcryptd_async_done, dmreq_of_req(cc, ctx->req)); } static void crypt_free_req(struct crypt_config *cc, - struct ablkcipher_request *req, struct bio *base_bio) + struct skcipher_request *req, struct bio *base_bio) { struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); - if ((struct ablkcipher_request *)(io + 1) != req) + if ((struct skcipher_request *)(io + 1) != req) mempool_free(req, cc->req_pool); } @@ -1437,7 +1440,7 @@ static void crypt_free_tfms(struct crypt_config *cc) for (i = 0; i < cc->tfms_count; i++) if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) { - crypto_free_ablkcipher(cc->tfms[i]); + crypto_free_skcipher(cc->tfms[i]); cc->tfms[i] = NULL; } @@ -1450,13 +1453,13 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) unsigned i; int err; - cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *), + cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_skcipher *), GFP_KERNEL); if (!cc->tfms) return -ENOMEM; for (i = 0; i < cc->tfms_count; i++) { - cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); + cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0); if (IS_ERR(cc->tfms[i])) { err = PTR_ERR(cc->tfms[i]); crypt_free_tfms(cc); @@ -1476,9 +1479,9 @@ static int crypt_setkey_allcpus(struct crypt_config *cc) subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); for (i = 0; i < cc->tfms_count; i++) { - r = crypto_ablkcipher_setkey(cc->tfms[i], - cc->key + (i * subkey_size), - subkey_size); + r = crypto_skcipher_setkey(cc->tfms[i], + cc->key + (i * subkey_size), + subkey_size); if (r) err = r; } @@ -1645,7 +1648,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Initialize IV */ - cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); + cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1763,21 +1766,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret < 0) goto bad; - cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); + cc->dmreq_start = sizeof(struct skcipher_request); + cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); - if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { + if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { /* Allocate the padding exactly */ iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) - & crypto_ablkcipher_alignmask(any_tfm(cc)); + & crypto_skcipher_alignmask(any_tfm(cc)); } else { /* * If the cipher requires greater alignment than kmalloc * alignment, we don't know the exact position of the * initialization vector. We must assume worst case. */ - iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc)); + iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc)); } ret = -ENOMEM; @@ -1788,7 +1791,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->per_bio_data_size = ti->per_bio_data_size = + cc->per_bio_data_size = ti->per_io_data_size = ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size, ARCH_KMALLOC_MINALIGN); @@ -1922,7 +1925,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); - io->ctx.req = (struct ablkcipher_request *)(io + 1); + io->ctx.req = (struct skcipher_request *)(io + 1); if (bio_data_dir(io->base_bio) == READ) { if (kcryptd_io_read(io, GFP_NOWAIT)) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index b4c356a21..cc70871a6 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -204,7 +204,7 @@ out: ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->per_bio_data_size = sizeof(struct dm_delay_info); + ti->per_io_data_size = sizeof(struct dm_delay_info); ti->private = dc; return 0; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 09e2afcaf..b7341de87 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -220,7 +220,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->per_bio_data_size = sizeof(struct per_bio_data); + ti->per_io_data_size = sizeof(struct per_bio_data); ti->private = fc; return 0; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 80a439543..2adf81d81 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1291,7 +1291,8 @@ static int table_load(struct dm_ioctl *param, size_t param_size) immutable_target_type = dm_get_immutable_target_type(md); if (immutable_target_type && - (immutable_target_type != dm_table_get_immutable_target_type(t))) { + (immutable_target_type != dm_table_get_immutable_target_type(t)) && + !dm_table_get_wildcard_target(t)) { DMWARN("can't replace immutable target type %s", immutable_target_type->name); r = -EINVAL; @@ -1303,7 +1304,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) dm_set_md_type(md, dm_table_get_type(t)); /* setup md->queue to reflect md's type (may block) */ - r = dm_setup_md_queue(md); + r = dm_setup_md_queue(md, t); if (r) { DMWARN("unable to set up device queue for new table."); goto err_unlock_md_type; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 624589d51..608302e22 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -475,7 +475,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->flush_supported = true; ti->num_discard_bios = 1; ti->discards_supported = true; - ti->per_bio_data_size = sizeof(struct per_bio_data); + ti->per_io_data_size = sizeof(struct per_bio_data); ti->private = lc; return 0; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index cfa29f574..677ba223e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -23,6 +23,7 @@ #include <linux/delay.h> #include <scsi/scsi_dh.h> #include <linux/atomic.h> +#include <linux/blk-mq.h> #define DM_MSG_PREFIX "multipath" #define DM_PG_INIT_DELAY_MSECS 2000 @@ -33,11 +34,12 @@ struct pgpath { struct list_head list; struct priority_group *pg; /* Owning PG */ - unsigned is_active; /* Path status */ unsigned fail_count; /* Cumulative failure count */ struct dm_path path; struct delayed_work activate_path; + + bool is_active:1; /* Path status */ }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -53,10 +55,10 @@ struct priority_group { struct path_selector ps; unsigned pg_num; /* Reference number */ - unsigned bypassed; /* Temporarily bypass this PG? */ - unsigned nr_pgpaths; /* Number of paths in PG */ struct list_head pgpaths; + + bool bypassed:1; /* Temporarily bypass this PG? */ }; /* Multipath context */ @@ -74,21 +76,20 @@ struct multipath { wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ - unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - unsigned pg_init_delay_retry; /* Delay pg_init retry? */ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ - unsigned repeat_count; /* I/Os left before calling PS again */ - unsigned queue_io:1; /* Must we queue all I/O? */ - unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ - unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ - unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ - unsigned pg_init_disabled:1; /* pg_init is not currently allowed */ + bool queue_io:1; /* Must we queue all I/O? */ + bool queue_if_no_path:1; /* Queue I/O if last path fails? */ + bool saved_queue_if_no_path:1; /* Saved state during suspension */ + bool retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ + bool pg_init_disabled:1; /* pg_init is not currently allowed */ + bool pg_init_required:1; /* pg_init needs calling? */ + bool pg_init_delay_retry:1; /* Delay pg_init retry? */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ @@ -120,7 +121,6 @@ static struct kmem_cache *_mpio_cache; static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); -static int __pgpath_busy(struct pgpath *pgpath); /*----------------------------------------------- @@ -132,7 +132,7 @@ static struct pgpath *alloc_pgpath(void) struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); if (pgpath) { - pgpath->is_active = 1; + pgpath->is_active = true; INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); } @@ -181,25 +181,31 @@ static void free_priority_group(struct priority_group *pg, kfree(pg); } -static struct multipath *alloc_multipath(struct dm_target *ti) +static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) { struct multipath *m; - unsigned min_ios = dm_get_reserved_rq_based_ios(); m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); - m->queue_io = 1; + m->queue_io = true; m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); - m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); - if (!m->mpio_pool) { - kfree(m); - return NULL; + + m->mpio_pool = NULL; + if (!use_blk_mq) { + unsigned min_ios = dm_get_reserved_rq_based_ios(); + + m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); + if (!m->mpio_pool) { + kfree(m); + return NULL; + } } + m->ti = ti; ti->private = m; } @@ -222,26 +228,41 @@ static void free_multipath(struct multipath *m) kfree(m); } -static int set_mapinfo(struct multipath *m, union map_info *info) +static struct dm_mpath_io *get_mpio(union map_info *info) +{ + return info->ptr; +} + +static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info) { struct dm_mpath_io *mpio; + if (!m->mpio_pool) { + /* Use blk-mq pdu memory requested via per_io_data_size */ + mpio = get_mpio(info); + memset(mpio, 0, sizeof(*mpio)); + return mpio; + } + mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); if (!mpio) - return -ENOMEM; + return NULL; memset(mpio, 0, sizeof(*mpio)); info->ptr = mpio; - return 0; + return mpio; } -static void clear_mapinfo(struct multipath *m, union map_info *info) +static void clear_request_fn_mpio(struct multipath *m, union map_info *info) { - struct dm_mpath_io *mpio = info->ptr; + /* Only needed for non blk-mq (.request_fn) multipath */ + if (m->mpio_pool) { + struct dm_mpath_io *mpio = info->ptr; - info->ptr = NULL; - mempool_free(mpio, m->mpio_pool); + info->ptr = NULL; + mempool_free(mpio, m->mpio_pool); + } } /*----------------------------------------------- @@ -257,7 +278,7 @@ static int __pg_init_all_paths(struct multipath *m) return 0; m->pg_init_count++; - m->pg_init_required = 0; + m->pg_init_required = false; /* Check here to reset pg_init_required */ if (!m->current_pg) @@ -283,11 +304,11 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) /* Must we initialise the PG first, and queue I/O till it's ready? */ if (m->hw_handler_name) { - m->pg_init_required = 1; - m->queue_io = 1; + m->pg_init_required = true; + m->queue_io = true; } else { - m->pg_init_required = 0; - m->queue_io = 0; + m->pg_init_required = false; + m->queue_io = false; } m->pg_init_count = 0; @@ -298,7 +319,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, { struct dm_path *path; - path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); + path = pg->ps.type->select_path(&pg->ps, nr_bytes); if (!path) return -ENXIO; @@ -313,10 +334,10 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, static void __choose_pgpath(struct multipath *m, size_t nr_bytes) { struct priority_group *pg; - unsigned bypassed = 1; + bool bypassed = true; if (!m->nr_valid_paths) { - m->queue_io = 0; + m->queue_io = false; goto failed; } @@ -344,7 +365,7 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) continue; if (!__choose_path_in_pg(m, pg, nr_bytes)) { if (!bypassed) - m->pg_init_delay_retry = 1; + m->pg_init_delay_retry = true; return; } } @@ -380,7 +401,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, union map_info *map_context, struct request *rq, struct request **__clone) { - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; int r = DM_MAPIO_REQUEUE; size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); struct pgpath *pgpath; @@ -390,8 +411,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, spin_lock_irq(&m->lock); /* Do we need to select a new pgpath? */ - if (!m->current_pgpath || - (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) + if (!m->current_pgpath || !m->queue_io) __choose_pgpath(m, nr_bytes); pgpath = m->current_pgpath; @@ -405,11 +425,11 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, goto out_unlock; } - if (set_mapinfo(m, map_context) < 0) + mpio = set_mpio(m, map_context); + if (!mpio) /* ENOMEM, requeue */ goto out_unlock; - mpio = map_context->ptr; mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; @@ -418,17 +438,24 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, spin_unlock_irq(&m->lock); if (clone) { - /* Old request-based interface: allocated clone is passed in */ + /* + * Old request-based interface: allocated clone is passed in. + * Used by: .request_fn stacked on .request_fn path(s). + */ clone->q = bdev_get_queue(bdev); clone->rq_disk = bdev->bd_disk; clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; } else { - /* blk-mq request-based interface */ - *__clone = blk_get_request(bdev_get_queue(bdev), - rq_data_dir(rq), GFP_ATOMIC); + /* + * blk-mq request-based interface; used by both: + * .request_fn stacked on blk-mq path(s) and + * blk-mq stacked on blk-mq path(s). + */ + *__clone = blk_mq_alloc_request(bdev_get_queue(bdev), + rq_data_dir(rq), BLK_MQ_REQ_NOWAIT); if (IS_ERR(*__clone)) { /* ENOMEM, requeue */ - clear_mapinfo(m, map_context); + clear_request_fn_mpio(m, map_context); return r; } (*__clone)->bio = (*__clone)->biotail = NULL; @@ -463,14 +490,14 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, static void multipath_release_clone(struct request *clone) { - blk_put_request(clone); + blk_mq_free_request(clone); } /* * If we run out of usable paths, should we queue I/O or error it? */ -static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, - unsigned save_old_value) +static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, + bool save_old_value) { unsigned long flags; @@ -776,12 +803,12 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) argc--; if (!strcasecmp(arg_name, "queue_if_no_path")) { - r = queue_if_no_path(m, 1, 0); + r = queue_if_no_path(m, true, false); continue; } if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { - m->retain_attached_hw_handler = 1; + m->retain_attached_hw_handler = true; continue; } @@ -820,11 +847,12 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; + bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table)); as.argc = argc; as.argv = argv; - m = alloc_multipath(ti); + m = alloc_multipath(ti, use_blk_mq); if (!m) { ti->error = "can't allocate multipath"; return -EINVAL; @@ -880,6 +908,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; + if (use_blk_mq) + ti->per_io_data_size = sizeof(struct dm_mpath_io); return 0; @@ -917,7 +947,7 @@ static void flush_multipath_work(struct multipath *m) unsigned long flags; spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = 1; + m->pg_init_disabled = true; spin_unlock_irqrestore(&m->lock, flags); flush_workqueue(kmpath_handlerd); @@ -926,7 +956,7 @@ static void flush_multipath_work(struct multipath *m) flush_work(&m->trigger_event); spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = 0; + m->pg_init_disabled = false; spin_unlock_irqrestore(&m->lock, flags); } @@ -954,7 +984,7 @@ static int fail_path(struct pgpath *pgpath) DMWARN("Failing path %s.", pgpath->path.dev->name); pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); - pgpath->is_active = 0; + pgpath->is_active = false; pgpath->fail_count++; m->nr_valid_paths--; @@ -987,18 +1017,13 @@ static int reinstate_path(struct pgpath *pgpath) if (pgpath->is_active) goto out; - if (!pgpath->pg->ps.type->reinstate_path) { - DMWARN("Reinstate path not supported by path selector %s", - pgpath->pg->ps.type->name); - r = -EINVAL; - goto out; - } + DMWARN("Reinstating path %s.", pgpath->path.dev->name); r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); if (r) goto out; - pgpath->is_active = 1; + pgpath->is_active = true; if (!m->nr_valid_paths++) { m->current_pgpath = NULL; @@ -1045,7 +1070,7 @@ static int action_dev(struct multipath *m, struct dm_dev *dev, * Temporarily try to avoid having to use the specified PG */ static void bypass_pg(struct multipath *m, struct priority_group *pg, - int bypassed) + bool bypassed) { unsigned long flags; @@ -1078,7 +1103,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) spin_lock_irqsave(&m->lock, flags); list_for_each_entry(pg, &m->priority_groups, list) { - pg->bypassed = 0; + pg->bypassed = false; if (--pgnum) continue; @@ -1096,7 +1121,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) * Set/clear bypassed status of a PG. * PGs are numbered upwards from 1 in the order they were declared. */ -static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) +static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) { struct priority_group *pg; unsigned pgnum; @@ -1120,17 +1145,17 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) /* * Should we retry pg_init immediately? */ -static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) +static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) { unsigned long flags; - int limit_reached = 0; + bool limit_reached = false; spin_lock_irqsave(&m->lock, flags); if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) - m->pg_init_required = 1; + m->pg_init_required = true; else - limit_reached = 1; + limit_reached = true; spin_unlock_irqrestore(&m->lock, flags); @@ -1143,7 +1168,7 @@ static void pg_init_done(void *data, int errors) struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; - unsigned delay_retry = 0; + bool delay_retry = false; /* device or driver problems */ switch (errors) { @@ -1166,7 +1191,7 @@ static void pg_init_done(void *data, int errors) * Probably doing something like FW upgrade on the * controller so try the other pg. */ - bypass_pg(m, pg, 1); + bypass_pg(m, pg, true); break; case SCSI_DH_RETRY: /* Wait before retrying. */ @@ -1177,6 +1202,7 @@ static void pg_init_done(void *data, int errors) fail_path(pgpath); errors = 0; break; + case SCSI_DH_DEV_OFFLINED: default: /* * We probably do not want to fail the path for a device @@ -1194,7 +1220,7 @@ static void pg_init_done(void *data, int errors) m->current_pg = NULL; } } else if (!m->pg_init_required) - pg->bypassed = 0; + pg->bypassed = false; if (--m->pg_init_in_progress) /* Activations of other paths are still on going */ @@ -1205,7 +1231,7 @@ static void pg_init_done(void *data, int errors) if (__pg_init_all_paths(m)) goto out; } - m->queue_io = 0; + m->queue_io = false; /* * Wake up any thread waiting to suspend. @@ -1291,21 +1317,21 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, int error, union map_info *map_context) { struct multipath *m = ti->private; - struct dm_mpath_io *mpio = map_context->ptr; + struct dm_mpath_io *mpio = get_mpio(map_context); struct pgpath *pgpath; struct path_selector *ps; int r; BUG_ON(!mpio); - r = do_end_io(m, clone, error, mpio); + r = do_end_io(m, clone, error, mpio); pgpath = mpio->pgpath; if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - clear_mapinfo(m, map_context); + clear_request_fn_mpio(m, map_context); return r; } @@ -1318,9 +1344,9 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, */ static void multipath_presuspend(struct dm_target *ti) { - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; - queue_if_no_path(m, 0, 1); + queue_if_no_path(m, false, true); } static void multipath_postsuspend(struct dm_target *ti) @@ -1337,7 +1363,7 @@ static void multipath_postsuspend(struct dm_target *ti) */ static void multipath_resume(struct dm_target *ti) { - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; unsigned long flags; spin_lock_irqsave(&m->lock, flags); @@ -1366,7 +1392,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type, { int sz = 0; unsigned long flags; - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *p; unsigned pg_num; @@ -1474,7 +1500,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) { int r = -EINVAL; struct dm_dev *dev; - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; action_fn action; mutex_lock(&m->work_mutex); @@ -1486,10 +1512,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { - r = queue_if_no_path(m, 1, 0); + r = queue_if_no_path(m, true, false); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { - r = queue_if_no_path(m, 0, 0); + r = queue_if_no_path(m, false, false); goto out; } } @@ -1500,10 +1526,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) } if (!strcasecmp(argv[0], "disable_group")) { - r = bypass_pg_num(m, argv[1], 1); + r = bypass_pg_num(m, argv[1], true); goto out; } else if (!strcasecmp(argv[0], "enable_group")) { - r = bypass_pg_num(m, argv[1], 0); + r = bypass_pg_num(m, argv[1], false); goto out; } else if (!strcasecmp(argv[0], "switch_group")) { r = switch_pg_num(m, argv[1]); @@ -1604,7 +1630,7 @@ out: return ret; } -static int __pgpath_busy(struct pgpath *pgpath) +static int pgpath_busy(struct pgpath *pgpath) { struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); @@ -1621,7 +1647,7 @@ static int __pgpath_busy(struct pgpath *pgpath) */ static int multipath_busy(struct dm_target *ti) { - int busy = 0, has_active = 0; + bool busy = false, has_active = false; struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *pgpath; @@ -1632,7 +1658,7 @@ static int multipath_busy(struct dm_target *ti) /* pg_init in progress or no paths available */ if (m->pg_init_in_progress || (!m->nr_valid_paths && m->queue_if_no_path)) { - busy = 1; + busy = true; goto out; } /* Guess which priority_group will be used at next mapping time */ @@ -1654,13 +1680,12 @@ static int multipath_busy(struct dm_target *ti) * If there is one non-busy active path at least, the path selector * will be able to select it. So we consider such a pg as not busy. */ - busy = 1; + busy = true; list_for_each_entry(pgpath, &pg->pgpaths, list) if (pgpath->is_active) { - has_active = 1; - - if (!__pgpath_busy(pgpath)) { - busy = 0; + has_active = true; + if (!pgpath_busy(pgpath)) { + busy = false; break; } } @@ -1671,7 +1696,7 @@ static int multipath_busy(struct dm_target *ti) * the current_pg will be changed at next mapping time. * We need to try mapping to determine it. */ - busy = 0; + busy = false; out: spin_unlock_irqrestore(&m->lock, flags); @@ -1684,7 +1709,8 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 10, 0}, + .version = {1, 11, 0}, + .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index e7d1fa8b0..b6eb5365b 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -50,13 +50,8 @@ struct path_selector_type { /* * Chooses a path for this io, if no paths are available then * NULL will be returned. - * - * repeat_count is the number of times to use the path before - * calling the function again. 0 means don't call it again unless - * the path fails. */ struct dm_path *(*select_path) (struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes); /* diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 3941fae0d..23f178641 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -23,12 +23,13 @@ #include <linux/atomic.h> #define DM_MSG_PREFIX "multipath queue-length" -#define QL_MIN_IO 128 -#define QL_VERSION "0.1.0" +#define QL_MIN_IO 1 +#define QL_VERSION "0.2.0" struct selector { struct list_head valid_paths; struct list_head failed_paths; + spinlock_t lock; }; struct path_info { @@ -45,6 +46,7 @@ static struct selector *alloc_selector(void) if (s) { INIT_LIST_HEAD(&s->valid_paths); INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); } return s; @@ -113,6 +115,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, struct path_info *pi; unsigned repeat_count = QL_MIN_IO; char dummy; + unsigned long flags; /* * Arguments: [<repeat_count>] @@ -129,6 +132,11 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + /* Allocate the path information structure */ pi = kmalloc(sizeof(*pi), GFP_KERNEL); if (!pi) { @@ -142,7 +150,9 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, path->pscontext = pi; + spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -151,16 +161,22 @@ static void ql_fail_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); } static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -168,14 +184,16 @@ static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) /* * Select a path having the minimum number of in-flight I/Os */ -static struct dm_path *ql_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) +static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) { struct selector *s = ps->context; struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); if (list_empty(&s->valid_paths)) - return NULL; + goto out; /* Change preferred (first in list) path to evenly balance. */ list_move_tail(s->valid_paths.next, &s->valid_paths); @@ -190,11 +208,12 @@ static struct dm_path *ql_select_path(struct path_selector *ps, } if (!best) - return NULL; - - *repeat_count = best->repeat_count; + goto out; - return best->path; + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; } static int ql_start_io(struct path_selector *ps, struct dm_path *path, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index f2a363a89..b3ccf1e0d 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1121,7 +1121,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record); + ti->per_io_data_size = sizeof(struct dm_raid1_bio_record); ti->discard_zeroes_data_unsupported = true; ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0); diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 6ab1192cd..4ace1da17 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -17,6 +17,8 @@ #include <linux/module.h> #define DM_MSG_PREFIX "multipath round-robin" +#define RR_MIN_IO 1000 +#define RR_VERSION "1.1.0" /*----------------------------------------------------------------- * Path-handling code, paths are held in lists @@ -41,23 +43,48 @@ static void free_paths(struct list_head *paths) * Round-robin selector *---------------------------------------------------------------*/ -#define RR_MIN_IO 1000 - struct selector { struct list_head valid_paths; struct list_head invalid_paths; + spinlock_t lock; + struct dm_path * __percpu *current_path; + struct percpu_counter repeat_count; }; +static void set_percpu_current_path(struct selector *s, struct dm_path *path) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(s->current_path, cpu) = path; +} + static struct selector *alloc_selector(void) { struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->invalid_paths); - } + if (!s) + return NULL; + + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + spin_lock_init(&s->lock); + + s->current_path = alloc_percpu(struct dm_path *); + if (!s->current_path) + goto out_current_path; + set_percpu_current_path(s, NULL); + + if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL)) + goto out_repeat_count; return s; + +out_repeat_count: + free_percpu(s->current_path); +out_current_path: + kfree(s); + return NULL;; } static int rr_create(struct path_selector *ps, unsigned argc, char **argv) @@ -74,10 +101,12 @@ static int rr_create(struct path_selector *ps, unsigned argc, char **argv) static void rr_destroy(struct path_selector *ps) { - struct selector *s = (struct selector *) ps->context; + struct selector *s = ps->context; free_paths(&s->valid_paths); free_paths(&s->invalid_paths); + free_percpu(s->current_path); + percpu_counter_destroy(&s->repeat_count); kfree(s); ps->context = NULL; } @@ -111,10 +140,11 @@ static int rr_status(struct path_selector *ps, struct dm_path *path, static int rr_add_path(struct path_selector *ps, struct dm_path *path, int argc, char **argv, char **error) { - struct selector *s = (struct selector *) ps->context; + struct selector *s = ps->context; struct path_info *pi; unsigned repeat_count = RR_MIN_IO; char dummy; + unsigned long flags; if (argc > 1) { *error = "round-robin ps: incorrect number of arguments"; @@ -139,42 +169,65 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, path->pscontext = pi; + spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } static void rr_fail_path(struct path_selector *ps, struct dm_path *p) { - struct selector *s = (struct selector *) ps->context; + unsigned long flags; + struct selector *s = ps->context; struct path_info *pi = p->pscontext; + spin_lock_irqsave(&s->lock, flags); + if (p == *this_cpu_ptr(s->current_path)) + set_percpu_current_path(s, NULL); + list_move(&pi->list, &s->invalid_paths); + spin_unlock_irqrestore(&s->lock, flags); } static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) { - struct selector *s = (struct selector *) ps->context; + unsigned long flags; + struct selector *s = ps->context; struct path_info *pi = p->pscontext; + spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } -static struct dm_path *rr_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) +static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) { - struct selector *s = (struct selector *) ps->context; + unsigned long flags; + struct selector *s = ps->context; struct path_info *pi = NULL; + struct dm_path *current_path = NULL; + + current_path = *this_cpu_ptr(s->current_path); + if (current_path) { + percpu_counter_dec(&s->repeat_count); + if (percpu_counter_read_positive(&s->repeat_count) > 0) + return current_path; + } + spin_lock_irqsave(&s->lock, flags); if (!list_empty(&s->valid_paths)) { pi = list_entry(s->valid_paths.next, struct path_info, list); list_move_tail(&pi->list, &s->valid_paths); - *repeat_count = pi->repeat_count; + percpu_counter_set(&s->repeat_count, pi->repeat_count); + set_percpu_current_path(s, pi->path); + current_path = pi->path; } + spin_unlock_irqrestore(&s->lock, flags); - return pi ? pi->path : NULL; + return current_path; } static struct path_selector_type rr_ps = { @@ -198,7 +251,7 @@ static int __init dm_rr_init(void) if (r < 0) DMERR("register failed %d", r); - DMINFO("version 1.0.0 loaded"); + DMINFO("version " RR_VERSION " loaded"); return r; } diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 9df8f6bd6..7b8642045 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -19,11 +19,12 @@ #define ST_MAX_RELATIVE_THROUGHPUT 100 #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) -#define ST_VERSION "0.2.0" +#define ST_VERSION "0.3.0" struct selector { struct list_head valid_paths; struct list_head failed_paths; + spinlock_t lock; }; struct path_info { @@ -41,6 +42,7 @@ static struct selector *alloc_selector(void) if (s) { INIT_LIST_HEAD(&s->valid_paths); INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); } return s; @@ -111,6 +113,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, unsigned repeat_count = ST_MIN_IO; unsigned relative_throughput = 1; char dummy; + unsigned long flags; /* * Arguments: [<repeat_count> [<relative_throughput>]] @@ -134,6 +137,11 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + if ((argc == 2) && (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { @@ -155,7 +163,9 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, path->pscontext = pi; + spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -164,16 +174,22 @@ static void st_fail_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); } static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -255,14 +271,16 @@ static int st_compare_load(struct path_info *pi1, struct path_info *pi2, return pi2->relative_throughput - pi1->relative_throughput; } -static struct dm_path *st_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) +static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) { struct selector *s = ps->context; struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); if (list_empty(&s->valid_paths)) - return NULL; + goto out; /* Change preferred (first in list) path to evenly balance. */ list_move_tail(s->valid_paths.next, &s->valid_paths); @@ -272,11 +290,12 @@ static struct dm_path *st_select_path(struct path_selector *ps, best = pi; if (!best) - return NULL; - - *repeat_count = best->repeat_count; + goto out; - return best->path; + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; } static int st_start_io(struct path_selector *ps, struct dm_path *path, diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index e4d1bafe7..70bb0e8b6 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1210,7 +1210,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = s; ti->num_flush_bios = num_flush_bios; - ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk); + ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk); /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cb5d0daf5..f9e8f0bef 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -932,6 +932,30 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) return t->immutable_target_type; } +struct dm_target *dm_table_get_immutable_target(struct dm_table *t) +{ + /* Immutable target is implicitly a singleton */ + if (t->num_targets > 1 || + !dm_target_is_immutable(t->targets[0].type)) + return NULL; + + return t->targets; +} + +struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) +{ + struct dm_target *uninitialized_var(ti); + unsigned i = 0; + + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + if (dm_target_is_wildcard(ti->type)) + return ti; + } + + return NULL; +} + bool dm_table_request_based(struct dm_table *t) { return __table_type_request_based(dm_table_get_type(t)); @@ -945,7 +969,7 @@ bool dm_table_mq_request_based(struct dm_table *t) static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { unsigned type = dm_table_get_type(t); - unsigned per_bio_data_size = 0; + unsigned per_io_data_size = 0; struct dm_target *tgt; unsigned i; @@ -957,10 +981,10 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * if (type == DM_TYPE_BIO_BASED) for (i = 0; i < t->num_targets; i++) { tgt = t->targets + i; - per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size); + per_io_data_size = max(per_io_data_size, tgt->per_io_data_size); } - t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size); + t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_io_data_size); if (!t->mempools) return -ENOMEM; diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 925ec1b15..a317dd884 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -150,7 +150,8 @@ static void io_err_release_clone_rq(struct request *clone) static struct target_type error_target = { .name = "error", - .version = {1, 3, 0}, + .version = {1, 4, 0}, + .features = DM_TARGET_WILDCARD, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 185010d9c..43824d733 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -344,7 +344,7 @@ static void subtree_dec(void *context, const void *value) memcpy(&root_le, value, sizeof(root_le)); root = le64_to_cpu(root_le); if (dm_btree_del(info, root)) - DMERR("btree delete failed\n"); + DMERR("btree delete failed"); } static int subtree_equal(void *context, const void *value1_le, const void *value2_le) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 72d91f477..92237b6fa 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -235,6 +235,7 @@ struct pool { struct pool_features pf; bool low_water_triggered:1; /* A dm event has been sent */ bool suspended:1; + bool out_of_data_space:1; struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; @@ -461,9 +462,16 @@ static void cell_error_with_code(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } +static int get_pool_io_error_code(struct pool *pool) +{ + return pool->out_of_data_space ? -ENOSPC : -EIO; +} + static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) { - cell_error_with_code(pool, cell, -EIO); + int error = get_pool_io_error_code(pool); + + cell_error_with_code(pool, cell, error); } static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) @@ -622,7 +630,9 @@ static void error_retry_list_with_code(struct pool *pool, int error) static void error_retry_list(struct pool *pool) { - return error_retry_list_with_code(pool, -EIO); + int error = get_pool_io_error_code(pool); + + return error_retry_list_with_code(pool, error); } /* @@ -2419,6 +2429,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) */ if (old_mode != new_mode) notify_of_pool_mode_change_to_oods(pool); + pool->out_of_data_space = true; pool->process_bio = process_bio_read_only; pool->process_discard = process_discard_bio; pool->process_cell = process_cell_read_only; @@ -2432,6 +2443,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) case PM_WRITE: if (old_mode != new_mode) notify_of_pool_mode_change(pool, "write"); + pool->out_of_data_space = false; pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space; dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; @@ -2832,6 +2844,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_LIST_HEAD(&pool->active_thins); pool->low_water_triggered = false; pool->suspended = true; + pool->out_of_data_space = false; pool->shared_read_ds = dm_deferred_set_create(); if (!pool->shared_read_ds) { @@ -3886,7 +3899,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 17, 0}, + .version = {1, 18, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4037,7 +4050,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_flush_bios = 1; ti->flush_supported = true; - ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); + ti->per_io_data_size = sizeof(struct dm_thin_endio_hook); /* In case the pool supports discards, pass them on. */ ti->discard_zeroes_data_unsupported = true; @@ -4260,7 +4273,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 17, 0}, + .version = {1, 18, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 1cc10c4de..459a9f890 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -812,7 +812,7 @@ int verity_fec_ctr(struct dm_verity *v) } /* Reserve space for our per-bio data */ - ti->per_bio_data_size += sizeof(struct dm_verity_fec_io); + ti->per_io_data_size += sizeof(struct dm_verity_fec_io); return 0; } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5c5d30cb6..0aba34a7b 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -354,7 +354,7 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, size_t len)) { unsigned todo = 1 << v->data_dev_block_bits; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); do { int r; @@ -460,7 +460,7 @@ static int verity_verify_io(struct dm_verity_io *io) static void verity_finish_io(struct dm_verity_io *io, int error) { struct dm_verity *v = io->v; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); bio->bi_end_io = io->orig_bi_end_io; bio->bi_error = error; @@ -574,7 +574,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == WRITE) return -EIO; - io = dm_per_bio_data(bio, ti->per_bio_data_size); + io = dm_per_bio_data(bio, ti->per_io_data_size); io->v = v; io->orig_bi_end_io = bio->bi_end_io; io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); @@ -1036,15 +1036,15 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ti->per_bio_data_size = sizeof(struct dm_verity_io) + + ti->per_io_data_size = sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2; r = verity_fec_ctr(v); if (r) goto bad; - ti->per_bio_data_size = roundup(ti->per_bio_data_size, - __alignof__(struct dm_verity_io)); + ti->per_io_data_size = roundup(ti->per_io_data_size, + __alignof__(struct dm_verity_io)); return 0; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c338aebb4..3d3ac1328 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -106,14 +106,6 @@ struct dm_rq_clone_bio_info { struct bio clone; }; -union map_info *dm_get_rq_mapinfo(struct request *rq) -{ - if (rq && rq->end_io_data) - return &((struct dm_rq_target_io *)rq->end_io_data)->info; - return NULL; -} -EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); - #define MINOR_ALLOCED ((void *)-1) /* @@ -129,28 +121,18 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_SUSPENDED_INTERNALLY 7 /* - * A dummy definition to make RCU happy. - * struct dm_table should never be dereferenced in this file. - */ -struct dm_table { - int undefined__; -}; - -/* * Work processed by per-device workqueue. */ struct mapped_device { struct srcu_struct io_barrier; struct mutex suspend_lock; - atomic_t holders; - atomic_t open_count; /* - * The current mapping. + * The current mapping (struct dm_table *). * Use dm_get_live_table{_fast} or take suspend_lock for * dereference. */ - struct dm_table __rcu *map; + void __rcu *map; struct list_head table_devices; struct mutex table_devices_lock; @@ -158,10 +140,16 @@ struct mapped_device { unsigned long flags; struct request_queue *queue; + int numa_node_id; + unsigned type; /* Protect queue and type against concurrent access. */ struct mutex type_lock; + atomic_t holders; + atomic_t open_count; + + struct dm_target *immutable_target; struct target_type *immutable_target_type; struct gendisk *disk; @@ -175,8 +163,20 @@ struct mapped_device { atomic_t pending[2]; wait_queue_head_t wait; struct work_struct work; - struct bio_list deferred; spinlock_t deferred_lock; + struct bio_list deferred; + + /* + * Event handling. + */ + wait_queue_head_t eventq; + atomic_t event_nr; + atomic_t uevent_seq; + struct list_head uevent_list; + spinlock_t uevent_lock; /* Protect access to uevent_list */ + + /* the number of internal suspends */ + unsigned internal_suspend_count; /* * Processing queue (flush) @@ -192,32 +192,21 @@ struct mapped_device { struct bio_set *bs; /* - * Event handling. - */ - atomic_t event_nr; - wait_queue_head_t eventq; - atomic_t uevent_seq; - struct list_head uevent_list; - spinlock_t uevent_lock; /* Protect access to uevent_list */ - - /* * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; - struct block_device *bdev; /* forced geometry settings */ struct hd_geometry geometry; + struct block_device *bdev; + /* kobject and completion */ struct dm_kobject_holder kobj_holder; /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; - /* the number of internal suspends */ - unsigned internal_suspend_count; - struct dm_stats stats; struct kthread_worker kworker; @@ -230,8 +219,9 @@ struct mapped_device { ktime_t last_rq_start_time; /* for blk-mq request-based DM support */ - struct blk_mq_tag_set tag_set; - bool use_blk_mq; + struct blk_mq_tag_set *tag_set; + bool use_blk_mq:1; + bool init_tio_pdu:1; }; #ifdef CONFIG_DM_MQ_DEFAULT @@ -240,10 +230,19 @@ static bool use_blk_mq = true; static bool use_blk_mq = false; #endif +#define DM_MQ_NR_HW_QUEUES 1 +#define DM_MQ_QUEUE_DEPTH 2048 +#define DM_NUMA_NODE NUMA_NO_NODE + +static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES; +static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; +static int dm_numa_node = DM_NUMA_NODE; + bool dm_use_blk_mq(struct mapped_device *md) { return md->use_blk_mq; } +EXPORT_SYMBOL_GPL(dm_use_blk_mq); /* * For mempools pre-allocation at the table loading time. @@ -277,6 +276,27 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; */ static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; +static int __dm_get_module_param_int(int *module_param, int min, int max) +{ + int param = ACCESS_ONCE(*module_param); + int modified_param = 0; + bool modified = true; + + if (param < min) + modified_param = min; + else if (param > max) + modified_param = max; + else + modified = false; + + if (modified) { + (void)cmpxchg(module_param, param, modified_param); + param = modified_param; + } + + return param; +} + static unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max) { @@ -310,6 +330,23 @@ unsigned dm_get_reserved_rq_based_ios(void) } EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); +static unsigned dm_get_blk_mq_nr_hw_queues(void) +{ + return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32); +} + +static unsigned dm_get_blk_mq_queue_depth(void) +{ + return __dm_get_module_param(&dm_mq_queue_depth, + DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH); +} + +static unsigned dm_get_numa_node(void) +{ + return __dm_get_module_param_int(&dm_numa_node, + DM_NUMA_NODE, num_online_nodes() - 1); +} + static int __init local_init(void) { int r = -ENOMEM; @@ -323,7 +360,7 @@ static int __init local_init(void) if (!_rq_tio_cache) goto out_free_io_cache; - _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), + _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request), __alignof__(struct request), 0, NULL); if (!_rq_cache) goto out_free_rq_tio_cache; @@ -556,16 +593,17 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -static int dm_get_live_table_for_ioctl(struct mapped_device *md, - struct dm_target **tgt, struct block_device **bdev, - fmode_t *mode, int *srcu_idx) +static int dm_grab_bdev_for_ioctl(struct mapped_device *md, + struct block_device **bdev, + fmode_t *mode) { + struct dm_target *tgt; struct dm_table *map; - int r; + int srcu_idx, r; retry: r = -ENOTTY; - map = dm_get_live_table(md, srcu_idx); + map = dm_get_live_table(md, &srcu_idx); if (!map || !dm_table_get_size(map)) goto out; @@ -573,9 +611,8 @@ retry: if (dm_table_get_num_targets(map) != 1) goto out; - *tgt = dm_table_get_target(map, 0); - - if (!(*tgt)->type->prepare_ioctl) + tgt = dm_table_get_target(map, 0); + if (!tgt->type->prepare_ioctl) goto out; if (dm_suspended_md(md)) { @@ -583,14 +620,16 @@ retry: goto out; } - r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode); + r = tgt->type->prepare_ioctl(tgt, bdev, mode); if (r < 0) goto out; + bdgrab(*bdev); + dm_put_live_table(md, srcu_idx); return r; out: - dm_put_live_table(md, *srcu_idx); + dm_put_live_table(md, srcu_idx); if (r == -ENOTCONN && !fatal_signal_pending(current)) { msleep(10); goto retry; @@ -602,11 +641,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; - struct dm_target *tgt; - struct block_device *tgt_bdev = NULL; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -621,9 +658,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, goto out; } - r = __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg); + r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); out: - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -642,24 +679,24 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) bio_put(&tio->clone); } -static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, - gfp_t gfp_mask) +static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, + gfp_t gfp_mask) { return mempool_alloc(md->io_pool, gfp_mask); } -static void free_rq_tio(struct dm_rq_target_io *tio) +static void free_old_rq_tio(struct dm_rq_target_io *tio) { mempool_free(tio, tio->md->io_pool); } -static struct request *alloc_clone_request(struct mapped_device *md, - gfp_t gfp_mask) +static struct request *alloc_old_clone_request(struct mapped_device *md, + gfp_t gfp_mask) { return mempool_alloc(md->rq_pool, gfp_mask); } -static void free_clone_request(struct mapped_device *md, struct request *rq) +static void free_old_clone_request(struct mapped_device *md, struct request *rq) { mempool_free(rq, md->rq_pool); } @@ -827,7 +864,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, mutex_lock(&md->table_devices_lock); td = find_table_device(&md->table_devices, dev, mode); if (!td) { - td = kmalloc(sizeof(*td), GFP_KERNEL); + td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); if (!td) { mutex_unlock(&md->table_devices_lock); return -ENOMEM; @@ -1130,15 +1167,10 @@ static void free_rq_clone(struct request *clone) tio->ti->type->release_clone_rq(clone); else if (!md->queue->mq_ops) /* request_fn queue stacked on request_fn queue(s) */ - free_clone_request(md, clone); - /* - * NOTE: for the blk-mq queue stacked on request_fn queue(s) case: - * no need to call free_clone_request() because we leverage blk-mq by - * allocating the clone at the end of the blk-mq pdu (see: clone_rq) - */ + free_old_clone_request(md, clone); if (!md->queue->mq_ops) - free_rq_tio(tio); + free_old_rq_tio(tio); } /* @@ -1188,13 +1220,13 @@ static void dm_unprep_request(struct request *rq) if (clone) free_rq_clone(clone); else if (!tio->md->queue->mq_ops) - free_rq_tio(tio); + free_old_rq_tio(tio); } /* * Requeue the original request of a clone. */ -static void old_requeue_request(struct request *rq) +static void dm_old_requeue_request(struct request *rq) { struct request_queue *q = rq->q; unsigned long flags; @@ -1205,6 +1237,18 @@ static void old_requeue_request(struct request *rq) spin_unlock_irqrestore(q->queue_lock, flags); } +static void dm_mq_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + blk_mq_requeue_request(rq); + spin_lock_irqsave(q->queue_lock, flags); + if (!blk_queue_stopped(q)) + blk_mq_kick_requeue_list(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + static void dm_requeue_original_request(struct mapped_device *md, struct request *rq) { @@ -1214,36 +1258,36 @@ static void dm_requeue_original_request(struct mapped_device *md, dm_unprep_request(rq); if (!rq->q->mq_ops) - old_requeue_request(rq); - else { - blk_mq_requeue_request(rq); - blk_mq_kick_requeue_list(rq->q); - } + dm_old_requeue_request(rq); + else + dm_mq_requeue_request(rq); rq_completed(md, rw, false); } -static void old_stop_queue(struct request_queue *q) +static void dm_old_stop_queue(struct request_queue *q) { unsigned long flags; - if (blk_queue_stopped(q)) + spin_lock_irqsave(q->queue_lock, flags); + if (blk_queue_stopped(q)) { + spin_unlock_irqrestore(q->queue_lock, flags); return; + } - spin_lock_irqsave(q->queue_lock, flags); blk_stop_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } -static void stop_queue(struct request_queue *q) +static void dm_stop_queue(struct request_queue *q) { if (!q->mq_ops) - old_stop_queue(q); + dm_old_stop_queue(q); else blk_mq_stop_hw_queues(q); } -static void old_start_queue(struct request_queue *q) +static void dm_old_start_queue(struct request_queue *q) { unsigned long flags; @@ -1253,12 +1297,14 @@ static void old_start_queue(struct request_queue *q) spin_unlock_irqrestore(q->queue_lock, flags); } -static void start_queue(struct request_queue *q) +static void dm_start_queue(struct request_queue *q) { if (!q->mq_ops) - old_start_queue(q); - else + dm_old_start_queue(q); + else { blk_mq_start_stopped_hw_queues(q, true); + blk_mq_kick_requeue_list(q); + } } static void dm_done(struct request *clone, int error, bool mapped) @@ -1309,7 +1355,7 @@ static void dm_softirq_done(struct request *rq) if (!rq->q->mq_ops) { blk_end_request_all(rq, tio->error); rq_completed(tio->md, rw, false); - free_rq_tio(tio); + free_old_rq_tio(tio); } else { blk_mq_end_request(rq, tio->error); rq_completed(tio->md, rw, false); @@ -1351,7 +1397,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error) } /* - * Called with the clone's queue lock held (for non-blk-mq) + * Called with the clone's queue lock held (in the case of .request_fn) */ static void end_clone_request(struct request *clone, int error) { @@ -1521,21 +1567,26 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) /* * Creates a bio that consists of range of complete bvecs. */ -static void clone_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned len) +static int clone_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned len) { struct bio *clone = &tio->clone; __bio_clone_fast(clone, bio); - if (bio_integrity(bio)) - bio_integrity_clone(clone, bio, GFP_NOIO); + if (bio_integrity(bio)) { + int r = bio_integrity_clone(clone, bio, GFP_NOIO); + if (r < 0) + return r; + } bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); clone->bi_iter.bi_size = to_bytes(len); if (bio_integrity(bio)) bio_integrity_trim(clone, 0, len); + + return 0; } static struct dm_target_io *alloc_tio(struct clone_info *ci, @@ -1592,13 +1643,14 @@ static int __send_empty_flush(struct clone_info *ci) return 0; } -static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, +static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; unsigned target_bio_nr; unsigned num_target_bios = 1; + int r = 0; /* * Does the target want to receive duplicate copies of the bio? @@ -1609,9 +1661,15 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { tio = alloc_tio(ci, ti, target_bio_nr); tio->len_ptr = len; - clone_bio(tio, bio, sector, *len); + r = clone_bio(tio, bio, sector, *len); + if (r < 0) { + free_tio(ci->md, tio); + break; + } __map_bio(tio); } + + return r; } typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); @@ -1688,6 +1746,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) struct bio *bio = ci->bio; struct dm_target *ti; unsigned len; + int r; if (unlikely(bio->bi_rw & REQ_DISCARD)) return __send_discard(ci); @@ -1700,7 +1759,9 @@ static int __split_and_process_non_flush(struct clone_info *ci) len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - __clone_and_map_data_bio(ci, ti, ci->sector, &len); + r = __clone_and_map_data_bio(ci, ti, ci->sector, &len); + if (r < 0) + return r; ci->sector += len; ci->sector_count -= len; @@ -1838,28 +1899,22 @@ static int setup_clone(struct request *clone, struct request *rq, return 0; } -static struct request *clone_rq(struct request *rq, struct mapped_device *md, - struct dm_rq_target_io *tio, gfp_t gfp_mask) +static struct request *clone_old_rq(struct request *rq, struct mapped_device *md, + struct dm_rq_target_io *tio, gfp_t gfp_mask) { /* - * Do not allocate a clone if tio->clone was already set - * (see: dm_mq_queue_rq). + * Create clone for use with .request_fn request_queue */ - bool alloc_clone = !tio->clone; struct request *clone; - if (alloc_clone) { - clone = alloc_clone_request(md, gfp_mask); - if (!clone) - return NULL; - } else - clone = tio->clone; + clone = alloc_old_clone_request(md, gfp_mask); + if (!clone) + return NULL; blk_rq_init(NULL, clone); if (setup_clone(clone, rq, tio, gfp_mask)) { /* -ENOMEM */ - if (alloc_clone) - free_clone_request(md, clone); + free_old_clone_request(md, clone); return NULL; } @@ -1876,29 +1931,40 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, tio->clone = NULL; tio->orig = rq; tio->error = 0; - memset(&tio->info, 0, sizeof(tio->info)); + /* + * Avoid initializing info for blk-mq; it passes + * target-specific data through info.ptr + * (see: dm_mq_init_request) + */ + if (!md->init_tio_pdu) + memset(&tio->info, 0, sizeof(tio->info)); if (md->kworker_task) init_kthread_work(&tio->work, map_tio_request); } -static struct dm_rq_target_io *prep_tio(struct request *rq, - struct mapped_device *md, gfp_t gfp_mask) +static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, + struct mapped_device *md, + gfp_t gfp_mask) { struct dm_rq_target_io *tio; int srcu_idx; struct dm_table *table; - tio = alloc_rq_tio(md, gfp_mask); + tio = alloc_old_rq_tio(md, gfp_mask); if (!tio) return NULL; init_tio(tio, rq, md); table = dm_get_live_table(md, &srcu_idx); + /* + * Must clone a request if this .request_fn DM device + * is stacked on .request_fn device(s). + */ if (!dm_table_mq_request_based(table)) { - if (!clone_rq(rq, md, tio, gfp_mask)) { + if (!clone_old_rq(rq, md, tio, gfp_mask)) { dm_put_live_table(md, srcu_idx); - free_rq_tio(tio); + free_old_rq_tio(tio); return NULL; } } @@ -1910,7 +1976,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq, /* * Called with the queue lock held. */ -static int dm_prep_fn(struct request_queue *q, struct request *rq) +static int dm_old_prep_fn(struct request_queue *q, struct request *rq) { struct mapped_device *md = q->queuedata; struct dm_rq_target_io *tio; @@ -1920,7 +1986,7 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_KILL; } - tio = prep_tio(rq, md, GFP_ATOMIC); + tio = dm_old_prep_tio(rq, md, GFP_ATOMIC); if (!tio) return BLKPREP_DEFER; @@ -2078,12 +2144,18 @@ static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) static void dm_request_fn(struct request_queue *q) { struct mapped_device *md = q->queuedata; - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - struct dm_target *ti; + struct dm_target *ti = md->immutable_target; struct request *rq; struct dm_rq_target_io *tio; - sector_t pos; + sector_t pos = 0; + + if (unlikely(!ti)) { + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); + + ti = dm_table_find_target(map, pos); + dm_put_live_table(md, srcu_idx); + } /* * For suspend, check blk_queue_stopped() and increment @@ -2094,33 +2166,21 @@ static void dm_request_fn(struct request_queue *q) while (!blk_queue_stopped(q)) { rq = blk_peek_request(q); if (!rq) - goto out; + return; /* always use block 0 to find the target for flushes for now */ pos = 0; if (!(rq->cmd_flags & REQ_FLUSH)) pos = blk_rq_pos(rq); - ti = dm_table_find_target(map, pos); - if (!dm_target_is_valid(ti)) { - /* - * Must perform setup, that rq_completed() requires, - * before calling dm_kill_unmapped_request - */ - DMERR_LIMIT("request attempted access beyond the end of device"); - dm_start_request(md, rq); - dm_kill_unmapped_request(rq, -EIO); - continue; + if ((dm_request_peeked_before_merge_deadline(md) && + md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && + md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || + (ti->type->busy && ti->type->busy(ti))) { + blk_delay_queue(q, HZ / 100); + return; } - if (dm_request_peeked_before_merge_deadline(md) && - md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && - md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) - goto delay_and_out; - - if (ti->type->busy && ti->type->busy(ti)) - goto delay_and_out; - dm_start_request(md, rq); tio = tio_from_request(rq); @@ -2129,13 +2189,6 @@ static void dm_request_fn(struct request_queue *q) queue_kthread_work(&md->kworker, &tio->work); BUG_ON(!irqs_disabled()); } - - goto out; - -delay_and_out: - blk_delay_queue(q, HZ / 100); -out: - dm_put_live_table(md, srcu_idx); } static int dm_any_congested(void *congested_data, int bdi_bits) @@ -2145,19 +2198,18 @@ static int dm_any_congested(void *congested_data, int bdi_bits) struct dm_table *map; if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - map = dm_get_live_table_fast(md); - if (map) { + if (dm_request_based(md)) { /* - * Request-based dm cares about only own queue for - * the query about congestion status of request_queue + * With request-based DM we only need to check the + * top-level queue for congestion. */ - if (dm_request_based(md)) - r = md->queue->backing_dev_info.wb.state & - bdi_bits; - else + r = md->queue->backing_dev_info.wb.state & bdi_bits; + } else { + map = dm_get_live_table_fast(md); + if (map) r = dm_table_any_congested(map, bdi_bits); + dm_put_live_table_fast(md); } - dm_put_live_table_fast(md); } return r; @@ -2237,7 +2289,7 @@ static void dm_init_md_queue(struct mapped_device *md) md->queue->backing_dev_info.congested_data = md; } -static void dm_init_old_md_queue(struct mapped_device *md) +static void dm_init_normal_md_queue(struct mapped_device *md) { md->use_blk_mq = false; dm_init_md_queue(md); @@ -2284,10 +2336,11 @@ static void cleanup_mapped_device(struct mapped_device *md) */ static struct mapped_device *alloc_dev(int minor) { - int r; - struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); + int r, numa_node_id = dm_get_numa_node(); + struct mapped_device *md; void *old_md; + md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); if (!md) { DMWARN("unable to allocate device, out of memory."); return NULL; @@ -2308,7 +2361,9 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_io_barrier; + md->numa_node_id = numa_node_id; md->use_blk_mq = use_blk_mq; + md->init_tio_pdu = false; md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); @@ -2322,13 +2377,13 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue(GFP_KERNEL); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; dm_init_md_queue(md); - md->disk = alloc_disk(1); + md->disk = alloc_disk_node(1, numa_node_id); if (!md->disk) goto bad; @@ -2392,8 +2447,10 @@ static void free_dev(struct mapped_device *md) unlock_fs(md); cleanup_mapped_device(md); - if (md->use_blk_mq) - blk_mq_free_tag_set(&md->tag_set); + if (md->tag_set) { + blk_mq_free_tag_set(md->tag_set); + kfree(md->tag_set); + } free_table_devices(&md->table_devices); dm_stats_cleanup(&md->stats); @@ -2501,13 +2558,20 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * This must be done before setting the queue restrictions, * because request-based dm may be run just after the setting. */ - if (dm_table_request_based(t)) - stop_queue(q); + if (dm_table_request_based(t)) { + dm_stop_queue(q); + /* + * Leverage the fact that request-based DM targets are + * immutable singletons and establish md->immutable_target + * - used to optimize both dm_request_fn and dm_mq_queue_rq + */ + md->immutable_target = dm_table_get_immutable_target(t); + } __bind_mempools(md, t); old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); - rcu_assign_pointer(md->map, t); + rcu_assign_pointer(md->map, (void *)t); md->immutable_target_type = dm_table_get_immutable_target_type(t); dm_table_set_restrictions(t, q, limits); @@ -2573,7 +2637,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type) unsigned dm_get_md_type(struct mapped_device *md) { - BUG_ON(!mutex_is_locked(&md->type_lock)); return md->type; } @@ -2593,7 +2656,7 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_get_queue_limits); -static void init_rq_based_worker_thread(struct mapped_device *md) +static void dm_old_init_rq_based_worker_thread(struct mapped_device *md) { /* Initialize the request-based DM worker thread */ init_kthread_worker(&md->kworker); @@ -2602,26 +2665,22 @@ static void init_rq_based_worker_thread(struct mapped_device *md) } /* - * Fully initialize a request-based queue (->elevator, ->request_fn, etc). + * Fully initialize a .request_fn request-based queue. */ -static int dm_init_request_based_queue(struct mapped_device *md) +static int dm_old_init_request_queue(struct mapped_device *md) { - struct request_queue *q = NULL; - /* Fully initialize the queue */ - q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); - if (!q) + if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL)) return -EINVAL; /* disable dm_request_fn's merge heuristic by default */ md->seq_rq_merge_deadline_usecs = 0; - md->queue = q; - dm_init_old_md_queue(md); + dm_init_normal_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_prep_rq(md->queue, dm_old_prep_fn); - init_rq_based_worker_thread(md); + dm_old_init_rq_based_worker_thread(md); elv_register_queue(md->queue); @@ -2641,6 +2700,11 @@ static int dm_mq_init_request(void *data, struct request *rq, */ tio->md = md; + if (md->init_tio_pdu) { + /* target-specific per-io data is immediately after the tio */ + tio->info.ptr = tio + 1; + } + return 0; } @@ -2650,28 +2714,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); struct mapped_device *md = tio->md; - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - struct dm_target *ti; - sector_t pos; + struct dm_target *ti = md->immutable_target; - /* always use block 0 to find the target for flushes for now */ - pos = 0; - if (!(rq->cmd_flags & REQ_FLUSH)) - pos = blk_rq_pos(rq); + if (unlikely(!ti)) { + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); - ti = dm_table_find_target(map, pos); - if (!dm_target_is_valid(ti)) { + ti = dm_table_find_target(map, 0); dm_put_live_table(md, srcu_idx); - DMERR_LIMIT("request attempted access beyond the end of device"); - /* - * Must perform setup, that rq_completed() requires, - * before returning BLK_MQ_RQ_QUEUE_ERROR - */ - dm_start_request(md, rq); - return BLK_MQ_RQ_QUEUE_ERROR; } - dm_put_live_table(md, srcu_idx); if (ti->type->busy && ti->type->busy(ti)) return BLK_MQ_RQ_QUEUE_BUSY; @@ -2687,20 +2738,12 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, */ tio->ti = ti; - /* Clone the request if underlying devices aren't blk-mq */ - if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) { - /* clone request is allocated at the end of the pdu */ - tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); - (void) clone_rq(rq, md, tio, GFP_ATOMIC); - queue_kthread_work(&md->kworker, &tio->work); - } else { - /* Direct call is fine since .queue_rq allows allocations */ - if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { - /* Undo dm_start_request() before requeuing */ - rq_end_stats(md, rq); - rq_completed(md, rq_data_dir(rq), false); - return BLK_MQ_RQ_QUEUE_BUSY; - } + /* Direct call is fine since .queue_rq allows allocations */ + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { + /* Undo dm_start_request() before requeuing */ + rq_end_stats(md, rq); + rq_completed(md, rq_data_dir(rq), false); + return BLK_MQ_RQ_QUEUE_BUSY; } return BLK_MQ_RQ_QUEUE_OK; @@ -2713,47 +2756,56 @@ static struct blk_mq_ops dm_mq_ops = { .init_request = dm_mq_init_request, }; -static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) +static int dm_mq_init_request_queue(struct mapped_device *md, + struct dm_target *immutable_tgt) { - unsigned md_type = dm_get_md_type(md); struct request_queue *q; int err; - memset(&md->tag_set, 0, sizeof(md->tag_set)); - md->tag_set.ops = &dm_mq_ops; - md->tag_set.queue_depth = BLKDEV_MAX_RQ; - md->tag_set.numa_node = NUMA_NO_NODE; - md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; - md->tag_set.nr_hw_queues = 1; - if (md_type == DM_TYPE_REQUEST_BASED) { - /* make the memory for non-blk-mq clone part of the pdu */ - md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request); - } else - md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); - md->tag_set.driver_data = md; - - err = blk_mq_alloc_tag_set(&md->tag_set); + if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) { + DMERR("request-based dm-mq may only be stacked on blk-mq device(s)"); + return -EINVAL; + } + + md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id); + if (!md->tag_set) + return -ENOMEM; + + md->tag_set->ops = &dm_mq_ops; + md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); + md->tag_set->numa_node = md->numa_node_id; + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); + md->tag_set->driver_data = md; + + md->tag_set->cmd_size = sizeof(struct dm_rq_target_io); + if (immutable_tgt && immutable_tgt->per_io_data_size) { + /* any target-specific per-io data is immediately after the tio */ + md->tag_set->cmd_size += immutable_tgt->per_io_data_size; + md->init_tio_pdu = true; + } + + err = blk_mq_alloc_tag_set(md->tag_set); if (err) - return err; + goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(&md->tag_set, md->queue); + q = blk_mq_init_allocated_queue(md->tag_set, md->queue); if (IS_ERR(q)) { err = PTR_ERR(q); goto out_tag_set; } - md->queue = q; dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ blk_mq_register_disk(md->disk); - if (md_type == DM_TYPE_REQUEST_BASED) - init_rq_based_worker_thread(md); - return 0; out_tag_set: - blk_mq_free_tag_set(&md->tag_set); + blk_mq_free_tag_set(md->tag_set); +out_kfree_tag_set: + kfree(md->tag_set); + return err; } @@ -2768,28 +2820,28 @@ static unsigned filter_md_type(unsigned type, struct mapped_device *md) /* * Setup the DM device's queue based on md's type */ -int dm_setup_md_queue(struct mapped_device *md) +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; unsigned md_type = filter_md_type(dm_get_md_type(md), md); switch (md_type) { case DM_TYPE_REQUEST_BASED: - r = dm_init_request_based_queue(md); + r = dm_old_init_request_queue(md); if (r) { - DMWARN("Cannot initialize queue for request-based mapped device"); + DMERR("Cannot initialize queue for request-based mapped device"); return r; } break; case DM_TYPE_MQ_REQUEST_BASED: - r = dm_init_request_based_blk_mq_queue(md); + r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t)); if (r) { - DMWARN("Cannot initialize queue for request-based blk-mq mapped device"); + DMERR("Cannot initialize queue for request-based dm-mq mapped device"); return r; } break; case DM_TYPE_BIO_BASED: - dm_init_old_md_queue(md); + dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); /* * DM handles splitting bios as needed. Free the bio_split bioset @@ -3132,7 +3184,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * dm defers requests to md->wq from md->queue. */ if (dm_request_based(md)) { - stop_queue(md->queue); + dm_stop_queue(md->queue); if (md->kworker_task) flush_kthread_worker(&md->kworker); } @@ -3156,7 +3208,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, dm_queue_flush(md); if (dm_request_based(md)) - start_queue(md->queue); + dm_start_queue(md->queue); unlock_fs(md); dm_table_presuspend_undo_targets(map); @@ -3235,7 +3287,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map) * Request-based dm is queueing the deferred I/Os in its request_queue. */ if (dm_request_based(md)) - start_queue(md->queue); + dm_start_queue(md->queue); unlock_fs(md); @@ -3481,9 +3533,9 @@ int dm_noflush_suspending(struct dm_target *ti) EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, - unsigned integrity, unsigned per_bio_data_size) + unsigned integrity, unsigned per_io_data_size) { - struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); + struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); struct kmem_cache *cachep = NULL; unsigned int pool_size = 0; unsigned int front_pad; @@ -3497,7 +3549,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t case DM_TYPE_BIO_BASED: cachep = _io_cache; pool_size = dm_get_reserved_bio_based_ios(); - front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); + front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); break; case DM_TYPE_REQUEST_BASED: cachep = _rq_tio_cache; @@ -3510,8 +3562,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t if (!pool_size) pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); - /* per_bio_data_size is not used. See __bind_mempools(). */ - WARN_ON(per_bio_data_size != 0); + /* per_io_data_size is used for blk-mq pdu at queue allocation */ break; default: BUG(); @@ -3553,15 +3604,14 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) } static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, - u32 flags) + u32 flags) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3571,20 +3621,19 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, - u32 flags) + u32 flags) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3594,7 +3643,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3602,11 +3651,10 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3616,20 +3664,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, - enum pr_type type, bool abort) + enum pr_type type, bool abort) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3639,7 +3686,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3647,11 +3694,10 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3661,7 +3707,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3700,6 +3746,15 @@ MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools" module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); +module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices"); + +module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices"); + +module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 7edcf97df..13a758ec0 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -73,6 +73,8 @@ int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); +struct dm_target *dm_table_get_immutable_target(struct dm_table *t); +struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); bool dm_table_mq_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); @@ -84,7 +86,7 @@ void dm_set_md_type(struct mapped_device *md, unsigned type); unsigned dm_get_md_type(struct mapped_device *md); struct target_type *dm_get_immutable_target_type(struct mapped_device *md); -int dm_setup_md_queue(struct mapped_device *md); +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* * To check the return value from dm_table_find_target(). diff --git a/drivers/md/md.c b/drivers/md/md.c index 755127803..85b16aadd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -307,6 +307,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) */ void mddev_suspend(struct mddev *mddev) { + WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); if (mddev->suspended++) return; synchronize_rcu(); @@ -719,6 +720,7 @@ static void super_written(struct bio *bio) if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); + rdev_dec_pending(rdev, mddev); bio_put(bio); } @@ -733,6 +735,8 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, */ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); + atomic_inc(&rdev->nr_pending); + bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); @@ -5673,7 +5677,6 @@ static int do_md_stop(struct mddev *mddev, int mode, export_array(mddev); md_clean(mddev); - kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; } @@ -6885,7 +6888,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, case ADD_NEW_DISK: /* We can support ADD_NEW_DISK on read-only arrays - * on if we are re-adding a preexisting device. + * only if we are re-adding a preexisting device. * So require mddev->pers and MD_DISK_SYNC. */ if (mddev->pers) { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 2ea12c6bf..34783a3c8 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -70,7 +70,6 @@ static void dump_zones(struct mddev *mddev) (unsigned long long)zone_size>>1); zone_start = conf->strip_zone[j].zone_end; } - printk(KERN_INFO "\n"); } static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) @@ -85,6 +84,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); unsigned short blksize = 512; + *private_conf = ERR_PTR(-ENOMEM); if (!conf) return -ENOMEM; rdev_for_each(rdev1, mddev) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bb5bce059..a7f2b9c9f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -570,7 +570,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect if (best_dist_disk < 0) { if (is_badblock(rdev, this_sector, sectors, &first_bad, &bad_sectors)) { - if (first_bad < this_sector) + if (first_bad <= this_sector) /* Cannot use this */ continue; best_good_sectors = first_bad - this_sector; @@ -2698,7 +2698,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; - BUG_ON(sync_blocks < (PAGE_SIZE>>9)); if ((len >> 9) > sync_blocks) len = sync_blocks<<9; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 32d52878f..e48c262ce 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3502,8 +3502,6 @@ returnbi: dev = &sh->dev[i]; } else if (test_bit(R5_Discard, &dev->flags)) discard_pending = 1; - WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); - WARN_ON(dev->page != dev->orig_page); } r5l_stripe_write_finished(sh); @@ -4236,7 +4234,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, list_del_init(&sh->batch_list); - WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | + WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | (1 << STRIPE_SYNCING) | (1 << STRIPE_REPLACED) | (1 << STRIPE_DELAYED) | @@ -4248,9 +4246,11 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, (1 << STRIPE_DISCARD) | (1 << STRIPE_BATCH_READY) | (1 << STRIPE_BATCH_ERR) | - (1 << STRIPE_BITMAP_PENDING))); - WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | - (1 << STRIPE_REPLACED))); + (1 << STRIPE_BITMAP_PENDING)), + "stripe state: %lx\n", sh->state); + WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | + (1 << STRIPE_REPLACED)), + "head stripe state: %lx\n", head_sh->state); set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | @@ -6379,6 +6379,8 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, break; case CPU_DEAD: case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); break; default: |