summaryrefslogtreecommitdiff
path: root/drivers/md/dm-thin.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/dm-thin.c')
-rw-r--r--drivers/md/dm-thin.c651
1 files changed, 475 insertions, 176 deletions
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e22e6c892..d2bbe8cc1 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -112,22 +112,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
/*
* Key building.
*/
-static void build_data_key(struct dm_thin_device *td,
- dm_block_t b, struct dm_cell_key *key)
+enum lock_space {
+ VIRTUAL,
+ PHYSICAL
+};
+
+static void build_key(struct dm_thin_device *td, enum lock_space ls,
+ dm_block_t b, dm_block_t e, struct dm_cell_key *key)
{
- key->virtual = 0;
+ key->virtual = (ls == VIRTUAL);
key->dev = dm_thin_dev_id(td);
key->block_begin = b;
- key->block_end = b + 1ULL;
+ key->block_end = e;
+}
+
+static void build_data_key(struct dm_thin_device *td, dm_block_t b,
+ struct dm_cell_key *key)
+{
+ build_key(td, PHYSICAL, b, b + 1llu, key);
}
static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
struct dm_cell_key *key)
{
- key->virtual = 1;
- key->dev = dm_thin_dev_id(td);
- key->block_begin = b;
- key->block_end = b + 1ULL;
+ build_key(td, VIRTUAL, b, b + 1llu, key);
}
/*----------------------------------------------------------------*/
@@ -313,6 +321,138 @@ struct thin_c {
/*----------------------------------------------------------------*/
+/**
+ * __blkdev_issue_discard_async - queue a discard with async completion
+ * @bdev: blockdev to issue discard for
+ * @sector: start sector
+ * @nr_sects: number of sectors to discard
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: BLKDEV_IFL_* flags to control behaviour
+ * @parent_bio: parent discard bio that all sub discards get chained to
+ *
+ * Description:
+ * Asynchronously issue a discard request for the sectors in question.
+ * NOTE: this variant of blk-core's blkdev_issue_discard() is a stop-gap
+ * that is being kept local to DM thinp until the block changes to allow
+ * late bio splitting land upstream.
+ */
+static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
+ struct bio *parent_bio)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ int type = REQ_WRITE | REQ_DISCARD;
+ unsigned int max_discard_sectors, granularity;
+ int alignment;
+ struct bio *bio;
+ int ret = 0;
+ struct blk_plug plug;
+
+ if (!q)
+ return -ENXIO;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ /* Zero-sector (unknown) and one-sector granularities are the same. */
+ granularity = max(q->limits.discard_granularity >> 9, 1U);
+ alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+ /*
+ * Ensure that max_discard_sectors is of the proper
+ * granularity, so that requests stay aligned after a split.
+ */
+ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+ max_discard_sectors -= max_discard_sectors % granularity;
+ if (unlikely(!max_discard_sectors)) {
+ /* Avoid infinite loop below. Being cautious never hurts. */
+ return -EOPNOTSUPP;
+ }
+
+ if (flags & BLKDEV_DISCARD_SECURE) {
+ if (!blk_queue_secdiscard(q))
+ return -EOPNOTSUPP;
+ type |= REQ_SECURE;
+ }
+
+ blk_start_plug(&plug);
+ while (nr_sects) {
+ unsigned int req_sects;
+ sector_t end_sect, tmp;
+
+ /*
+ * Required bio_put occurs in bio_endio thanks to bio_chain below
+ */
+ bio = bio_alloc(gfp_mask, 1);
+ if (!bio) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+ /*
+ * If splitting a request, and the next starting sector would be
+ * misaligned, stop the discard at the previous aligned sector.
+ */
+ end_sect = sector + req_sects;
+ tmp = end_sect;
+ if (req_sects < nr_sects &&
+ sector_div(tmp, granularity) != alignment) {
+ end_sect = end_sect - alignment;
+ sector_div(end_sect, granularity);
+ end_sect = end_sect * granularity + alignment;
+ req_sects = end_sect - sector;
+ }
+
+ bio_chain(bio, parent_bio);
+
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+
+ bio->bi_iter.bi_size = req_sects << 9;
+ nr_sects -= req_sects;
+ sector = end_sect;
+
+ submit_bio(type, bio);
+
+ /*
+ * We can loop for a long time in here, if someone does
+ * full device discards (like mkfs). Be nice and allow
+ * us to schedule out to avoid softlocking if preempt
+ * is disabled.
+ */
+ cond_resched();
+ }
+ blk_finish_plug(&plug);
+
+ return ret;
+}
+
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+ return pool->sectors_per_block_shift >= 0;
+}
+
+static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
+{
+ return block_size_is_power_of_two(pool) ?
+ (b << pool->sectors_per_block_shift) :
+ (b * pool->sectors_per_block);
+}
+
+static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e,
+ struct bio *parent_bio)
+{
+ sector_t s = block_to_sectors(tc->pool, data_b);
+ sector_t len = block_to_sectors(tc->pool, data_e - data_b);
+
+ return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len,
+ GFP_NOWAIT, 0, parent_bio);
+}
+
+/*----------------------------------------------------------------*/
+
/*
* wake_worker() is used when new work is queued and when pool_resume is
* ready to continue deferred IO processing.
@@ -462,6 +602,7 @@ struct dm_thin_endio_hook {
struct dm_deferred_entry *all_io_entry;
struct dm_thin_new_mapping *overwrite_mapping;
struct rb_node rb_node;
+ struct dm_bio_prison_cell *cell;
};
static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
@@ -525,16 +666,21 @@ static void requeue_io(struct thin_c *tc)
requeue_deferred_cells(tc);
}
-static void error_retry_list(struct pool *pool)
+static void error_retry_list_with_code(struct pool *pool, int error)
{
struct thin_c *tc;
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list)
- error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
+ error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
rcu_read_unlock();
}
+static void error_retry_list(struct pool *pool)
+{
+ return error_retry_list_with_code(pool, -EIO);
+}
+
/*
* This section of code contains the logic for processing a thin device's IO.
* Much of the code depends on pool object resources (lists, workqueues, etc)
@@ -542,11 +688,6 @@ static void error_retry_list(struct pool *pool)
* target.
*/
-static bool block_size_is_power_of_two(struct pool *pool)
-{
- return pool->sectors_per_block_shift >= 0;
-}
-
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
@@ -560,6 +701,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
return block_nr;
}
+/*
+ * Returns the _complete_ blocks that this bio covers.
+ */
+static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
+ dm_block_t *begin, dm_block_t *end)
+{
+ struct pool *pool = tc->pool;
+ sector_t b = bio->bi_iter.bi_sector;
+ sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+
+ b += pool->sectors_per_block - 1ull; /* so we round up */
+
+ if (block_size_is_power_of_two(pool)) {
+ b >>= pool->sectors_per_block_shift;
+ e >>= pool->sectors_per_block_shift;
+ } else {
+ (void) sector_div(b, pool->sectors_per_block);
+ (void) sector_div(e, pool->sectors_per_block);
+ }
+
+ if (e < b)
+ /* Can happen if the bio is within a single block. */
+ e = b;
+
+ *begin = b;
+ *end = e;
+}
+
static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
struct pool *pool = tc->pool;
@@ -648,7 +817,7 @@ struct dm_thin_new_mapping {
struct list_head list;
bool pass_discard:1;
- bool definitely_not_shared:1;
+ bool maybe_shared:1;
/*
* Track quiescing, copying and zeroing preparation actions. When this
@@ -659,9 +828,9 @@ struct dm_thin_new_mapping {
int err;
struct thin_c *tc;
- dm_block_t virt_block;
+ dm_block_t virt_begin, virt_end;
dm_block_t data_block;
- struct dm_bio_prison_cell *cell, *cell2;
+ struct dm_bio_prison_cell *cell;
/*
* If the bio covers the whole area of a block then we can avoid
@@ -706,6 +875,8 @@ static void overwrite_endio(struct bio *bio, int err)
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct dm_thin_new_mapping *m = h->overwrite_mapping;
+ bio->bi_end_io = m->saved_bi_end_io;
+
m->err = err;
complete_mapping_preparation(m);
}
@@ -794,10 +965,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
- if (m->bio) {
- m->bio->bi_end_io = m->saved_bi_end_io;
- atomic_inc(&m->bio->bi_remaining);
- }
cell_error(m->tc->pool, m->cell);
list_del(&m->list);
mempool_free(m, m->tc->pool->mapping_pool);
@@ -807,15 +974,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
struct pool *pool = tc->pool;
- struct bio *bio;
+ struct bio *bio = m->bio;
int r;
- bio = m->bio;
- if (bio) {
- bio->bi_end_io = m->saved_bi_end_io;
- atomic_inc(&bio->bi_remaining);
- }
-
if (m->err) {
cell_error(pool, m->cell);
goto out;
@@ -826,7 +987,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
* Any I/O for this block arriving after this point will get
* remapped to it directly.
*/
- r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+ r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
if (r) {
metadata_operation_failed(pool, "dm_thin_insert_block", r);
cell_error(pool, m->cell);
@@ -853,50 +1014,112 @@ out:
mempool_free(m, pool->mapping_pool);
}
-static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+/*----------------------------------------------------------------*/
+
+static void free_discard_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
+ if (m->cell)
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+{
bio_io_error(m->bio);
+ free_discard_mapping(m);
+}
+
+static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
+{
+ bio_endio(m->bio, 0);
+ free_discard_mapping(m);
+}
+
+static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
+ if (r) {
+ metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
+ bio_io_error(m->bio);
+ } else
+ bio_endio(m->bio, 0);
+
cell_defer_no_holder(tc, m->cell);
- cell_defer_no_holder(tc, m->cell2);
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
{
+ /*
+ * We've already unmapped this range of blocks, but before we
+ * passdown we have to check that these blocks are now unused.
+ */
+ int r;
+ bool used = true;
struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
+ dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
- inc_all_io_entry(tc->pool, m->bio);
- cell_defer_no_holder(tc, m->cell);
- cell_defer_no_holder(tc, m->cell2);
+ while (b != end) {
+ /* find start of unmapped run */
+ for (; b < end; b++) {
+ r = dm_pool_block_is_used(pool->pmd, b, &used);
+ if (r)
+ return r;
- if (m->pass_discard)
- if (m->definitely_not_shared)
- remap_and_issue(tc, m->bio, m->data_block);
- else {
- bool used = false;
- if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
- bio_endio(m->bio, 0);
- else
- remap_and_issue(tc, m->bio, m->data_block);
+ if (!used)
+ break;
}
- else
- bio_endio(m->bio, 0);
- mempool_free(m, tc->pool->mapping_pool);
+ if (b == end)
+ break;
+
+ /* find end of run */
+ for (e = b + 1; e != end; e++) {
+ r = dm_pool_block_is_used(pool->pmd, e, &used);
+ if (r)
+ return r;
+
+ if (used)
+ break;
+ }
+
+ r = issue_discard(tc, b, e, m->bio);
+ if (r)
+ return r;
+
+ b = e;
+ }
+
+ return 0;
}
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
int r;
struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
- r = dm_thin_remove_block(tc->td, m->virt_block);
+ r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
if (r)
- DMERR_LIMIT("dm_thin_remove_block() failed");
+ metadata_operation_failed(pool, "dm_thin_remove_range", r);
- process_prepared_discard_passdown(m);
+ else if (m->maybe_shared)
+ r = passdown_double_checking_shared_status(m);
+ else
+ r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio);
+
+ /*
+ * Even if r is set, there could be sub discards in flight that we
+ * need to wait for.
+ */
+ bio_endio(m->bio, r);
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, pool->mapping_pool);
}
static void process_prepared(struct pool *pool, struct list_head *head,
@@ -980,7 +1203,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
}
static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
- dm_block_t data_block,
+ dm_block_t data_begin,
struct dm_thin_new_mapping *m)
{
struct pool *pool = tc->pool;
@@ -990,7 +1213,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
inc_all_io_entry(pool, bio);
- remap_and_issue(tc, bio, data_block);
+ remap_and_issue(tc, bio, data_begin);
}
/*
@@ -1007,7 +1230,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_thin_new_mapping *m = get_next_mapping(pool);
m->tc = tc;
- m->virt_block = virt_block;
+ m->virt_begin = virt_block;
+ m->virt_end = virt_block + 1u;
m->data_block = data_dest;
m->cell = cell;
@@ -1086,7 +1310,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
m->tc = tc;
- m->virt_block = virt_block;
+ m->virt_begin = virt_block;
+ m->virt_end = virt_block + 1u;
m->data_block = data_block;
m->cell = cell;
@@ -1095,16 +1320,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
* zeroing pre-existing data, we can issue the bio immediately.
* Otherwise we use kcopyd to zero the data first.
*/
- if (!pool->pf.zero_new_blocks)
+ if (pool->pf.zero_new_blocks) {
+ if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_block, m);
+ else
+ ll_zero(tc, m, data_block * pool->sectors_per_block,
+ (data_block + 1) * pool->sectors_per_block);
+ } else
process_prepared_mapping(m);
-
- else if (io_overwrites_block(pool, bio))
- remap_and_issue_overwrite(tc, bio, data_block, m);
-
- else
- ll_zero(tc, m,
- data_block * pool->sectors_per_block,
- (data_block + 1) * pool->sectors_per_block);
}
static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -1295,99 +1518,149 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
retry_on_resume(bio);
}
-static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void process_discard_cell_no_passdown(struct thin_c *tc,
+ struct dm_bio_prison_cell *virt_cell)
{
- int r;
- struct bio *bio = cell->holder;
struct pool *pool = tc->pool;
- struct dm_bio_prison_cell *cell2;
- struct dm_cell_key key2;
- dm_block_t block = get_bio_block(tc, bio);
- struct dm_thin_lookup_result lookup_result;
- struct dm_thin_new_mapping *m;
+ struct dm_thin_new_mapping *m = get_next_mapping(pool);
- if (tc->requeue_mode) {
- cell_requeue(pool, cell);
- return;
- }
+ /*
+ * We don't need to lock the data blocks, since there's no
+ * passdown. We only lock data blocks for allocation and breaking sharing.
+ */
+ m->tc = tc;
+ m->virt_begin = virt_cell->key.block_begin;
+ m->virt_end = virt_cell->key.block_end;
+ m->cell = virt_cell;
+ m->bio = virt_cell->holder;
- r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
- switch (r) {
- case 0:
- /*
- * Check nobody is fiddling with this pool block. This can
- * happen if someone's in the process of breaking sharing
- * on this block.
- */
- build_data_key(tc->td, lookup_result.block, &key2);
- if (bio_detain(tc->pool, &key2, bio, &cell2)) {
- cell_defer_no_holder(tc, cell);
- break;
- }
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+}
- if (io_overlaps_block(pool, bio)) {
- /*
- * IO may still be going to the destination block. We must
- * quiesce before we can do the removal.
- */
- m = get_next_mapping(pool);
- m->tc = tc;
- m->pass_discard = pool->pf.discard_passdown;
- m->definitely_not_shared = !lookup_result.shared;
- m->virt_block = block;
- m->data_block = lookup_result.block;
- m->cell = cell;
- m->cell2 = cell2;
- m->bio = bio;
-
- if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
- pool->process_prepared_discard(m);
+/*
+ * FIXME: DM local hack to defer parent bios's end_io until we
+ * _know_ all chained sub range discard bios have completed.
+ * Will go away once late bio splitting lands upstream!
+ */
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+ bio->bi_flags |= (1 << BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
- } else {
- inc_all_io_entry(pool, bio);
- cell_defer_no_holder(tc, cell);
- cell_defer_no_holder(tc, cell2);
+static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
+ struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+
+ int r;
+ bool maybe_shared;
+ struct dm_cell_key data_key;
+ struct dm_bio_prison_cell *data_cell;
+ struct dm_thin_new_mapping *m;
+ dm_block_t virt_begin, virt_end, data_begin;
+
+ while (begin != end) {
+ r = ensure_next_mapping(pool);
+ if (r)
+ /* we did our best */
+ return;
+ r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
+ &data_begin, &maybe_shared);
+ if (r)
/*
- * The DM core makes sure that the discard doesn't span
- * a block boundary. So we submit the discard of a
- * partial block appropriately.
+ * Silently fail, letting any mappings we've
+ * created complete.
*/
- if ((!lookup_result.shared) && pool->pf.discard_passdown)
- remap_and_issue(tc, bio, lookup_result.block);
- else
- bio_endio(bio, 0);
+ break;
+
+ build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
+ if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+ /* contention, we'll give up with this range */
+ begin = virt_end;
+ continue;
}
- break;
- case -ENODATA:
/*
- * It isn't provisioned, just forget it.
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
*/
- cell_defer_no_holder(tc, cell);
- bio_endio(bio, 0);
- break;
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->maybe_shared = maybe_shared;
+ m->virt_begin = virt_begin;
+ m->virt_end = virt_end;
+ m->data_block = data_begin;
+ m->cell = data_cell;
+ m->bio = bio;
- default:
- DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
- __func__, r);
- cell_defer_no_holder(tc, cell);
- bio_io_error(bio);
- break;
+ /*
+ * The parent bio must not complete before sub discard bios are
+ * chained to it (see __blkdev_issue_discard_async's bio_chain)!
+ *
+ * This per-mapping bi_remaining increment is paired with
+ * the implicit decrement that occurs via bio_endio() in
+ * process_prepared_discard_{passdown,no_passdown}.
+ */
+ __bio_inc_remaining(bio);
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+
+ begin = virt_end;
}
}
+static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
+{
+ struct bio *bio = virt_cell->holder;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+ /*
+ * The virt_cell will only get freed once the origin bio completes.
+ * This means it will remain locked while all the individual
+ * passdown bios are in flight.
+ */
+ h->cell = virt_cell;
+ break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
+
+ /*
+ * We complete the bio now, knowing that the bi_remaining field
+ * will prevent completion until the sub range discards have
+ * completed.
+ */
+ bio_endio(bio, 0);
+}
+
static void process_discard_bio(struct thin_c *tc, struct bio *bio)
{
- struct dm_bio_prison_cell *cell;
- struct dm_cell_key key;
- dm_block_t block = get_bio_block(tc, bio);
+ dm_block_t begin, end;
+ struct dm_cell_key virt_key;
+ struct dm_bio_prison_cell *virt_cell;
- build_virtual_key(tc->td, block, &key);
- if (bio_detain(tc->pool, &key, bio, &cell))
+ get_bio_block_range(tc, bio, &begin, &end);
+ if (begin == end) {
+ /*
+ * The discard covers less than a block.
+ */
+ bio_endio(bio, 0);
return;
+ }
- process_discard_cell(tc, cell);
+ build_key(tc->td, VIRTUAL, begin, end, &virt_key);
+ if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
+ /*
+ * Potential starvation issue: We're relying on the
+ * fs/application being well behaved, and not trying to
+ * send IO to a region at the same time as discarding it.
+ * If they do this persistently then it's possible this
+ * cell will never be granted.
+ */
+ return;
+
+ tc->pool->process_discard_cell(tc, virt_cell);
}
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
@@ -2014,18 +2287,23 @@ static void do_waker(struct work_struct *ws)
queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool);
+
/*
* We're holding onto IO to allow userland time to react. After the
* timeout either the pool will have been resized (and thus back in
- * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
*/
static void do_no_space_timeout(struct work_struct *ws)
{
struct pool *pool = container_of(to_delayed_work(ws), struct pool,
no_space_timeout);
- if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
- set_pool_mode(pool, PM_READ_ONLY);
+ if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
+ pool->pf.error_if_no_space = true;
+ notify_of_pool_mode_change_to_oods(pool);
+ error_retry_list_with_code(pool, -ENOSPC);
+ }
}
/*----------------------------------------------------------------*/
@@ -2103,6 +2381,32 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
dm_device_name(pool->pool_md), new_mode);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool)
+{
+ if (!pool->pf.error_if_no_space)
+ notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)");
+ else
+ notify_of_pool_mode_change(pool, "out-of-data-space (error IO)");
+}
+
+static bool passdown_enabled(struct pool_c *pt)
+{
+ return pt->adjusted_pf.discard_passdown;
+}
+
+static void set_discard_callbacks(struct pool *pool)
+{
+ struct pool_c *pt = pool->ti->private;
+
+ if (passdown_enabled(pt)) {
+ pool->process_discard_cell = process_discard_cell_passdown;
+ pool->process_prepared_discard = process_prepared_discard_passdown;
+ } else {
+ pool->process_discard_cell = process_discard_cell_no_passdown;
+ pool->process_prepared_discard = process_prepared_discard_no_passdown;
+ }
+}
+
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
struct pool_c *pt = pool->ti->private;
@@ -2154,7 +2458,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_cell = process_cell_read_only;
pool->process_discard_cell = process_cell_success;
pool->process_prepared_mapping = process_prepared_mapping_fail;
- pool->process_prepared_discard = process_prepared_discard_passdown;
+ pool->process_prepared_discard = process_prepared_discard_success;
error_retry_list(pool);
break;
@@ -2169,13 +2473,12 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
* frequently seeing this mode.
*/
if (old_mode != new_mode)
- notify_of_pool_mode_change(pool, "out-of-data-space");
+ notify_of_pool_mode_change_to_oods(pool);
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell_read_only;
- pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
- pool->process_prepared_discard = process_prepared_discard;
+ set_discard_callbacks(pool);
if (!pool->pf.error_if_no_space && no_space_timeout)
queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -2188,9 +2491,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_bio = process_bio;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell;
- pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
- pool->process_prepared_discard = process_prepared_discard;
+ set_discard_callbacks(pool);
break;
}
@@ -2279,6 +2581,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
h->shared_read_entry = NULL;
h->all_io_entry = NULL;
h->overwrite_mapping = NULL;
+ h->cell = NULL;
}
/*
@@ -2426,7 +2729,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
struct pool *pool = pt->pool;
struct block_device *data_bdev = pt->data_dev->bdev;
struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
- sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
const char *reason = NULL;
char buf[BDEVNAME_SIZE];
@@ -2439,12 +2741,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
else if (data_limits->max_discard_sectors < pool->sectors_per_block)
reason = "max discard sectors smaller than a block";
- else if (data_limits->discard_granularity > block_size)
- reason = "discard granularity larger than a block";
-
- else if (!is_factor(block_size, data_limits->discard_granularity))
- reason = "discard granularity not a factor of block size";
-
if (reason) {
DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
pt->adjusted_pf.discard_passdown = false;
@@ -3389,7 +3685,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
if (get_pool_mode(pool) >= PM_READ_ONLY) {
DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
dm_device_name(pool->pool_md));
- return -EINVAL;
+ return -EOPNOTSUPP;
}
if (!strcasecmp(argv[0], "create_thin"))
@@ -3447,6 +3743,7 @@ static void emit_flags(struct pool_features *pf, char *result,
* Status line is:
* <transaction id> <used metadata sectors>/<total metadata sectors>
* <used data sectors>/<total data sectors> <held metadata root>
+ * <pool mode> <discard config> <no space config> <needs_check>
*/
static void pool_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@@ -3548,6 +3845,11 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("queue_if_no_space ");
+ if (dm_pool_metadata_needs_check(pool->pmd))
+ DMEMIT("needs_check ");
+ else
+ DMEMIT("- ");
+
break;
case STATUSTYPE_TABLE:
@@ -3587,24 +3889,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
-static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
-{
- struct pool *pool = pt->pool;
- struct queue_limits *data_limits;
-
- limits->max_discard_sectors = pool->sectors_per_block;
-
- /*
- * discard_granularity is just a hint, and not enforced.
- */
- if (pt->adjusted_pf.discard_passdown) {
- data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
- limits->discard_granularity = max(data_limits->discard_granularity,
- pool->sectors_per_block << SECTOR_SHIFT);
- } else
- limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-}
-
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -3659,14 +3943,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
disable_passdown_if_not_supported(pt);
- set_discard_limits(pt, limits);
+ /*
+ * The pool uses the same discard limits as the underlying data
+ * device. DM core has already set this up.
+ */
}
static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 14, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -3825,8 +4112,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
- /* Discard bios must be split on a block boundary */
- ti->split_discard_bios = true;
+ ti->split_discard_bios = false;
}
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3913,6 +4199,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
}
}
+ if (h->cell)
+ cell_defer_no_holder(h->tc, h->cell);
+
return 0;
}
@@ -4040,9 +4329,18 @@ static int thin_iterate_devices(struct dm_target *ti,
return 0;
}
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
+
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+ limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+}
+
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 14, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
@@ -4054,6 +4352,7 @@ static struct target_type thin_target = {
.status = thin_status,
.merge = thin_merge,
.iterate_devices = thin_iterate_devices,
+ .io_hints = thin_io_hints,
};
/*----------------------------------------------------------------*/