summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig1
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/bcache/closure.c4
-rw-r--r--drivers/md/bcache/request.c11
-rw-r--r--drivers/md/bitmap.c14
-rw-r--r--drivers/md/bitmap.h4
-rw-r--r--drivers/md/dm-bufio.c18
-rw-r--r--drivers/md/dm-cache-metadata.c8
-rw-r--r--drivers/md/dm-cache-policy-cleaner.c2
-rw-r--r--drivers/md/dm-cache-policy-mq.c2
-rw-r--r--drivers/md/dm-cache-policy-smq.c2
-rw-r--r--drivers/md/dm-cache-target.c3
-rw-r--r--drivers/md/dm-crypt.c34
-rw-r--r--drivers/md/dm-delay.c7
-rw-r--r--drivers/md/dm-era-target.c15
-rw-r--r--drivers/md/dm-exception-store.c2
-rw-r--r--drivers/md/dm-flakey.c16
-rw-r--r--drivers/md/dm-io.c3
-rw-r--r--drivers/md/dm-kcopyd.c2
-rw-r--r--drivers/md/dm-linear.c20
-rw-r--r--drivers/md/dm-log-userspace-base.c3
-rw-r--r--drivers/md/dm-log-writes.c13
-rw-r--r--drivers/md/dm-mpath.c58
-rw-r--r--drivers/md/dm-region-hash.c6
-rw-r--r--drivers/md/dm-snap-persistent.c2
-rw-r--r--drivers/md/dm-switch.c25
-rw-r--r--drivers/md/dm-table.c88
-rw-r--r--drivers/md/dm-thin-metadata.c50
-rw-r--r--drivers/md/dm-thin.c6
-rw-r--r--drivers/md/dm-verity.c15
-rw-r--r--drivers/md/dm.c220
-rw-r--r--drivers/md/md-cluster.c228
-rw-r--r--drivers/md/md-cluster.h12
-rw-r--r--drivers/md/md.c521
-rw-r--r--drivers/md/md.h25
-rw-r--r--drivers/md/multipath.c2
-rw-r--r--drivers/md/persistent-data/dm-array.c4
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c4
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h2
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h2
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c36
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c20
-rw-r--r--drivers/md/persistent-data/dm-btree.c105
-rw-r--r--drivers/md/persistent-data/dm-btree.h14
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c32
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c32
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c4
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h2
-rw-r--r--drivers/md/raid1.c43
-rw-r--r--drivers/md/raid1.h7
-rw-r--r--drivers/md/raid10.c8
-rw-r--r--drivers/md/raid5-cache.c1191
-rw-r--r--drivers/md/raid5.c189
-rw-r--r--drivers/md/raid5.h24
54 files changed, 2533 insertions, 630 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3e01e6fb3..7913fdcfc 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -123,6 +123,7 @@ config MD_RAID456
tristate "RAID-4/RAID-5/RAID-6 mode"
depends on BLK_DEV_MD
select RAID6_PQ
+ select LIBCRC32C
select ASYNC_MEMCPY
select ASYNC_XOR
select ASYNC_PQ
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 462f443a4..f34979cd1 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
md-mod-y += md.o bitmap.o
-raid456-y += raid5.o
+raid456-y += raid5.o raid5-cache.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 7a228de95..9eaf1d6e8 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -167,8 +167,6 @@ EXPORT_SYMBOL(closure_debug_destroy);
static struct dentry *debug;
-#define work_data_bits(work) ((unsigned long *)(&(work)->data))
-
static int debug_seq_show(struct seq_file *f, void *data)
{
struct closure *cl;
@@ -182,7 +180,7 @@ static int debug_seq_show(struct seq_file *f, void *data)
r & CLOSURE_REMAINING_MASK);
seq_printf(f, "%s%s%s%s\n",
- test_bit(WORK_STRUCT_PENDING,
+ test_bit(WORK_STRUCT_PENDING_BIT,
work_data_bits(&cl->work)) ? "Q" : "",
r & CLOSURE_RUNNING ? "R" : "",
r & CLOSURE_STACK ? "S" : "",
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 8e9877b04..25fa8445b 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -958,7 +958,8 @@ static void cached_dev_nodata(struct closure *cl)
/* Cached devices - read & write stuff */
-static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t cached_dev_make_request(struct request_queue *q,
+ struct bio *bio)
{
struct search *s;
struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
@@ -997,6 +998,8 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
else
generic_make_request(bio);
}
+
+ return BLK_QC_T_NONE;
}
static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
@@ -1070,7 +1073,8 @@ static void flash_dev_nodata(struct closure *cl)
continue_at(cl, search_free, NULL);
}
-static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t flash_dev_make_request(struct request_queue *q,
+ struct bio *bio)
{
struct search *s;
struct closure *cl;
@@ -1093,7 +1097,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
continue_at_nobarrier(&s->cl,
flash_dev_nodata,
bcache_wq);
- return;
+ return BLK_QC_T_NONE;
} else if (rw) {
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
&KEY(d->id, bio->bi_iter.bi_sector, 0),
@@ -1109,6 +1113,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
}
continue_at(cl, search_free, NULL);
+ return BLK_QC_T_NONE;
}
static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 48b5890c2..4f22e9197 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -613,12 +613,10 @@ re_read:
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
- /* XXX: This is a hack to ensure that we don't use clustering
- * in case:
- * - dm-raid is in use and
- * - the nodes written in bitmap_sb is erroneous.
+ /* Setup nodes/clustername only if bitmap version is
+ * cluster-compatible
*/
- if (!bitmap->mddev->sync_super) {
+ if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
nodes = le32_to_cpu(sb->nodes);
strlcpy(bitmap->mddev->bitmap_info.cluster_name,
sb->cluster_name, 64);
@@ -628,7 +626,7 @@ re_read:
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
reason = "bad magic";
else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
- le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
+ le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
reason = "unrecognized superblock version";
else if (chunksize < 512)
reason = "bitmap chunksize too small";
@@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
}
EXPORT_SYMBOL(bitmap_close_sync);
-void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
{
sector_t s = 0;
sector_t blocks;
@@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->last_end_sync = jiffies;
return;
}
- if (time_before(jiffies, (bitmap->last_end_sync
+ if (!force && time_before(jiffies, (bitmap->last_end_sync
+ bitmap->mddev->bitmap_info.daemon_sleep)))
return;
wait_event(bitmap->mddev->recovery_wait,
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index f1f4dd010..7d5c3a610 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -9,8 +9,10 @@
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
+ * Version 5 is currently set only for clustered devices
*/
#define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3
/*
@@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
void bitmap_close_sync(struct bitmap *bitmap);
-void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(struct mddev *mddev);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 83cc52eaf..2dd33085b 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1598,11 +1598,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
c->bdev = bdev;
c->block_size = block_size;
- c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
- c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
- ffs(block_size) - 1 - PAGE_SHIFT : 0;
- c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
- PAGE_SHIFT - (ffs(block_size) - 1) : 0);
+ c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
+ c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ?
+ __ffs(block_size) - PAGE_SHIFT : 0;
+ c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ?
+ PAGE_SHIFT - __ffs(block_size) : 0);
c->aux_size = aux_size;
c->alloc_callback = alloc_callback;
@@ -1861,12 +1861,8 @@ static void __exit dm_bufio_exit(void)
cancel_delayed_work_sync(&dm_bufio_work);
destroy_workqueue(dm_bufio_wq);
- for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
- struct kmem_cache *kc = dm_bufio_caches[i];
-
- if (kc)
- kmem_cache_destroy(kc);
- }
+ for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++)
+ kmem_cache_destroy(dm_bufio_caches[i]);
for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
kfree(dm_bufio_cache_names[i]);
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 0a17d1b91..f6543f3a9 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -260,7 +260,9 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
}
}
- return dm_bm_unlock(b);
+ dm_bm_unlock(b);
+
+ return 0;
}
static void __setup_mapping_info(struct dm_cache_metadata *cmd)
@@ -465,7 +467,9 @@ static int __open_metadata(struct dm_cache_metadata *cmd)
dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
sb_flags = le32_to_cpu(disk_super->flags);
cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
- return dm_bm_unlock(sblock);
+ dm_bm_unlock(sblock);
+
+ return 0;
bad:
dm_bm_unlock(sblock);
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index 8a0964565..14aaaf059 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -83,7 +83,7 @@ static struct list_head *list_pop(struct list_head *q)
static int alloc_hash(struct hash *hash, unsigned elts)
{
hash->nr_buckets = next_power(elts >> 4, 16);
- hash->hash_bits = ffs(hash->nr_buckets) - 1;
+ hash->hash_bits = __ffs(hash->nr_buckets);
hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
return hash->table ? 0 : -ENOMEM;
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index aa1b41ca4..ddb26980c 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1410,7 +1410,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
- mq->hash_bits = ffs(mq->nr_buckets) - 1;
+ mq->hash_bits = __ffs(mq->nr_buckets);
mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
if (!mq->table)
goto bad_alloc_table;
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 1ffbeb1b3..28d458674 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -566,7 +566,7 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent
ht->es = es;
nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
- ht->hash_bits = ffs(nr_buckets) - 1;
+ ht->hash_bits = __ffs(nr_buckets);
ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
if (!ht->buckets)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index dd90d1236..2fd4c8296 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2309,8 +2309,7 @@ static void destroy(struct cache *cache)
{
unsigned i;
- if (cache->migration_pool)
- mempool_destroy(cache->migration_pool);
+ mempool_destroy(cache->migration_pool);
if (cache->all_io_ds)
dm_deferred_set_destroy(cache->all_io_ds);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4b3b6f8af..3147c8d09 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -112,7 +112,8 @@ struct iv_tcw_private {
* and encrypts / decrypts at the same time.
*/
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
- DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
+ DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD,
+ DM_CRYPT_EXIT_THREAD};
/*
* The fields in here must be read only after initialization.
@@ -994,7 +995,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
struct bio_vec *bvec;
retry:
- if (unlikely(gfp_mask & __GFP_WAIT))
+ if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_lock(&cc->bio_alloc_lock);
clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
@@ -1010,7 +1011,7 @@ retry:
if (!page) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
- gfp_mask |= __GFP_WAIT;
+ gfp_mask |= __GFP_DIRECT_RECLAIM;
goto retry;
}
@@ -1027,7 +1028,7 @@ retry:
}
return_clone:
- if (unlikely(gfp_mask & __GFP_WAIT))
+ if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
mutex_unlock(&cc->bio_alloc_lock);
return clone;
@@ -1203,20 +1204,18 @@ continue_locked:
if (!RB_EMPTY_ROOT(&cc->write_tree))
goto pop_from_list;
+ if (unlikely(test_bit(DM_CRYPT_EXIT_THREAD, &cc->flags))) {
+ spin_unlock_irq(&cc->write_thread_wait.lock);
+ break;
+ }
+
__set_current_state(TASK_INTERRUPTIBLE);
__add_wait_queue(&cc->write_thread_wait, &wait);
spin_unlock_irq(&cc->write_thread_wait.lock);
- if (unlikely(kthread_should_stop())) {
- set_task_state(current, TASK_RUNNING);
- remove_wait_queue(&cc->write_thread_wait, &wait);
- break;
- }
-
schedule();
- set_task_state(current, TASK_RUNNING);
spin_lock_irq(&cc->write_thread_wait.lock);
__remove_wait_queue(&cc->write_thread_wait, &wait);
goto continue_locked;
@@ -1531,8 +1530,13 @@ static void crypt_dtr(struct dm_target *ti)
if (!cc)
return;
- if (cc->write_thread)
+ if (cc->write_thread) {
+ spin_lock_irq(&cc->write_thread_wait.lock);
+ set_bit(DM_CRYPT_EXIT_THREAD, &cc->flags);
+ wake_up_locked(&cc->write_thread_wait);
+ spin_unlock_irq(&cc->write_thread_wait.lock);
kthread_stop(cc->write_thread);
+ }
if (cc->io_queue)
destroy_workqueue(cc->io_queue);
@@ -1544,10 +1548,8 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->bs)
bioset_free(cc->bs);
- if (cc->page_pool)
- mempool_destroy(cc->page_pool);
- if (cc->req_pool)
- mempool_destroy(cc->req_pool);
+ mempool_destroy(cc->page_pool);
+ mempool_destroy(cc->req_pool);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc);
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index b34f6e272..b4c356a21 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -122,6 +122,7 @@ static void flush_expired_bios(struct work_struct *work)
* <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
*
* With separate write parameters, the first set is only used for reads.
+ * Offsets are specified in sectors.
* Delays are specified in milliseconds.
*/
static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
@@ -132,7 +133,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
int ret;
if (argc != 3 && argc != 6) {
- ti->error = "requires exactly 3 or 6 arguments";
+ ti->error = "Requires exactly 3 or 6 arguments";
return -EINVAL;
}
@@ -237,7 +238,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
unsigned long expires = 0;
if (!delay || !atomic_read(&dc->may_delay))
- return 1;
+ return DM_MAPIO_REMAPPED;
delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
@@ -257,7 +258,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
queue_timeout(dc, expires);
- return 0;
+ return DM_MAPIO_SUBMITTED;
}
static void delay_presuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 0119ebfb3..665bf3285 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -343,7 +343,9 @@ static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
}
}
- return dm_bm_unlock(b);
+ dm_bm_unlock(b);
+
+ return 0;
}
/*----------------------------------------------------------------*/
@@ -582,7 +584,9 @@ static int open_metadata(struct era_metadata *md)
md->metadata_snap = le64_to_cpu(disk->metadata_snap);
md->archived_writesets = true;
- return dm_bm_unlock(sblock);
+ dm_bm_unlock(sblock);
+
+ return 0;
bad:
dm_bm_unlock(sblock);
@@ -1046,12 +1050,7 @@ static int metadata_take_snap(struct era_metadata *md)
md->metadata_snap = dm_block_location(clone);
- r = dm_tm_unlock(md->tm, clone);
- if (r) {
- DMERR("%s: couldn't unlock clone", __func__);
- md->metadata_snap = SUPERBLOCK_LOCATION;
- return r;
- }
+ dm_tm_unlock(md->tm, clone);
return 0;
}
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 192bb8bee..3997f34cf 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -183,7 +183,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
store->chunk_size = chunk_size;
store->chunk_mask = chunk_size - 1;
- store->chunk_shift = ffs(chunk_size) - 1;
+ store->chunk_shift = __ffs(chunk_size);
return 0;
}
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 645e8b4f8..09e2afcaf 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -373,20 +373,20 @@ static void flakey_status(struct dm_target *ti, status_type_t type,
}
}
-static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
+static int flakey_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct flakey_c *fc = ti->private;
- struct dm_dev *dev = fc->dev;
- int r = 0;
+
+ *bdev = fc->dev->bdev;
/*
* Only pass ioctls through if the device sizes match exactly.
*/
if (fc->start ||
- ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+ ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+ return 1;
+ return 0;
}
static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
@@ -405,7 +405,7 @@ static struct target_type flakey_target = {
.map = flakey_map,
.end_io = flakey_end_io,
.status = flakey_status,
- .ioctl = flakey_ioctl,
+ .prepare_ioctl = flakey_prepare_ioctl,
.iterate_devices = flakey_iterate_devices,
};
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 6f8e83b2a..81c5e1a1f 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -65,8 +65,7 @@ struct dm_io_client *dm_io_client_create(void)
return client;
bad:
- if (client->pool)
- mempool_destroy(client->pool);
+ mempool_destroy(client->pool);
kfree(client);
return ERR_PTR(-ENOMEM);
}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3a7cade5e..1452ed9aa 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -244,7 +244,7 @@ static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
*pages = NULL;
do {
- pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY);
+ pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
if (unlikely(!pl)) {
/* Use reserved pages */
pl = kc->pages;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 436f5c9b6..05c35aacb 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -39,20 +39,20 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
lc = kmalloc(sizeof(*lc), GFP_KERNEL);
if (lc == NULL) {
- ti->error = "dm-linear: Cannot allocate linear context";
+ ti->error = "Cannot allocate linear context";
return -ENOMEM;
}
ret = -EINVAL;
if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
- ti->error = "dm-linear: Invalid device sector";
+ ti->error = "Invalid device sector";
goto bad;
}
lc->start = tmp;
ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev);
if (ret) {
- ti->error = "dm-linear: Device lookup failed";
+ ti->error = "Device lookup failed";
goto bad;
}
@@ -116,21 +116,21 @@ static void linear_status(struct dm_target *ti, status_type_t type,
}
}
-static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
- unsigned long arg)
+static int linear_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct linear_c *lc = (struct linear_c *) ti->private;
struct dm_dev *dev = lc->dev;
- int r = 0;
+
+ *bdev = dev->bdev;
/*
* Only pass ioctls through if the device sizes match exactly.
*/
if (lc->start ||
ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+ return 1;
+ return 0;
}
static int linear_iterate_devices(struct dm_target *ti,
@@ -149,7 +149,7 @@ static struct target_type linear_target = {
.dtr = linear_dtr,
.map = linear_map,
.status = linear_status,
- .ioctl = linear_ioctl,
+ .prepare_ioctl = linear_prepare_ioctl,
.iterate_devices = linear_iterate_devices,
};
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 058256d2e..53b7b06d0 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -313,8 +313,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
out:
kfree(devices_rdata);
if (r) {
- if (lc->flush_entry_pool)
- mempool_destroy(lc->flush_entry_pool);
+ mempool_destroy(lc->flush_entry_pool);
kfree(lc);
kfree(ctr_str);
} else {
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index b2912dbac..624589d51 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -714,20 +714,19 @@ static void log_writes_status(struct dm_target *ti, status_type_t type,
}
}
-static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
- unsigned long arg)
+static int log_writes_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct log_writes_c *lc = ti->private;
struct dm_dev *dev = lc->dev;
- int r = 0;
+ *bdev = dev->bdev;
/*
* Only pass ioctls through if the device sizes match exactly.
*/
if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+ return 1;
+ return 0;
}
static int log_writes_iterate_devices(struct dm_target *ti,
@@ -782,7 +781,7 @@ static struct target_type log_writes_target = {
.map = log_writes_map,
.end_io = normal_end_io,
.status = log_writes_status,
- .ioctl = log_writes_ioctl,
+ .prepare_ioctl = log_writes_prepare_ioctl,
.message = log_writes_message,
.iterate_devices = log_writes_iterate_devices,
.io_hints = log_writes_io_hints,
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5a67671a3..cfa29f574 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1533,49 +1533,38 @@ out:
return r;
}
-static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
- unsigned long arg)
+static int multipath_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct multipath *m = ti->private;
- struct pgpath *pgpath;
- struct block_device *bdev;
- fmode_t mode;
unsigned long flags;
int r;
- bdev = NULL;
- mode = 0;
- r = 0;
-
spin_lock_irqsave(&m->lock, flags);
if (!m->current_pgpath)
__choose_pgpath(m, 0);
- pgpath = m->current_pgpath;
-
- if (pgpath) {
- bdev = pgpath->path.dev->bdev;
- mode = pgpath->path.dev->mode;
+ if (m->current_pgpath) {
+ if (!m->queue_io) {
+ *bdev = m->current_pgpath->path.dev->bdev;
+ *mode = m->current_pgpath->path.dev->mode;
+ r = 0;
+ } else {
+ /* pg_init has not started or completed */
+ r = -ENOTCONN;
+ }
+ } else {
+ /* No path is available */
+ if (m->queue_if_no_path)
+ r = -ENOTCONN;
+ else
+ r = -EIO;
}
- if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
- r = -ENOTCONN;
- else if (!bdev)
- r = -EIO;
-
spin_unlock_irqrestore(&m->lock, flags);
- /*
- * Only pass ioctls through if the device sizes match exactly.
- */
- if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
- int err = scsi_verify_blk_ioctl(NULL, cmd);
- if (err)
- r = err;
- }
-
- if (r == -ENOTCONN && !fatal_signal_pending(current)) {
+ if (r == -ENOTCONN) {
spin_lock_irqsave(&m->lock, flags);
if (!m->current_pg) {
/* Path status changed, redo selection */
@@ -1587,7 +1576,12 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
dm_table_run_md_queue_async(m->ti->table);
}
- return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+ return 1;
+ return r;
}
static int multipath_iterate_devices(struct dm_target *ti,
@@ -1690,7 +1684,7 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 9, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
@@ -1703,7 +1697,7 @@ static struct target_type multipath_target = {
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,
- .ioctl = multipath_ioctl,
+ .prepare_ioctl = multipath_prepare_ioctl,
.iterate_devices = multipath_iterate_devices,
.busy = multipath_busy,
};
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index b929fd5f4..74cb7b991 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -193,7 +193,7 @@ struct dm_region_hash *dm_region_hash_create(
rh->max_recovery = max_recovery;
rh->log = log;
rh->region_size = region_size;
- rh->region_shift = ffs(region_size) - 1;
+ rh->region_shift = __ffs(region_size);
rwlock_init(&rh->hash_lock);
rh->mask = nr_buckets - 1;
rh->nr_buckets = nr_buckets;
@@ -249,9 +249,7 @@ void dm_region_hash_destroy(struct dm_region_hash *rh)
if (rh->log)
dm_dirty_log_destroy(rh->log);
- if (rh->region_pool)
- mempool_destroy(rh->region_pool);
-
+ mempool_destroy(rh->region_pool);
vfree(rh->buckets);
kfree(rh);
}
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 117a05e40..3164b8bce 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -322,7 +322,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
bdev) >> 9);
ps->store->chunk_mask = ps->store->chunk_size - 1;
- ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
+ ps->store->chunk_shift = __ffs(ps->store->chunk_size);
chunk_size_supplied = 0;
}
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 50fca469c..871c18fe0 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -99,11 +99,11 @@ static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
if (sector_div(nr_regions, sctx->region_size))
nr_regions++;
- sctx->nr_regions = nr_regions;
- if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
+ if (nr_regions >= ULONG_MAX) {
ti->error = "Region table too large";
return -EINVAL;
}
+ sctx->nr_regions = nr_regions;
nr_slots = nr_regions;
if (sector_div(nr_slots, sctx->region_entries_per_slot))
@@ -511,27 +511,24 @@ static void switch_status(struct dm_target *ti, status_type_t type,
*
* Passthrough all ioctls to the path for sector 0
*/
-static int switch_ioctl(struct dm_target *ti, unsigned cmd,
- unsigned long arg)
+static int switch_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct switch_ctx *sctx = ti->private;
- struct block_device *bdev;
- fmode_t mode;
unsigned path_nr;
- int r = 0;
path_nr = switch_get_path_nr(sctx, 0);
- bdev = sctx->path_list[path_nr].dmdev->bdev;
- mode = sctx->path_list[path_nr].dmdev->mode;
+ *bdev = sctx->path_list[path_nr].dmdev->bdev;
+ *mode = sctx->path_list[path_nr].dmdev->mode;
/*
* Only pass ioctls through if the device sizes match exactly.
*/
- if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+ if (ti->len + sctx->path_list[path_nr].start !=
+ i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
+ return 1;
+ return 0;
}
static int switch_iterate_devices(struct dm_target *ti,
@@ -560,7 +557,7 @@ static struct target_type switch_target = {
.map = switch_map,
.message = switch_message,
.status = switch_status,
- .ioctl = switch_ioctl,
+ .prepare_ioctl = switch_prepare_ioctl,
.iterate_devices = switch_iterate_devices,
};
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e76ed0037..061152a43 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1014,15 +1014,16 @@ static int dm_table_build_index(struct dm_table *t)
return r;
}
+static bool integrity_profile_exists(struct gendisk *disk)
+{
+ return !!blk_get_integrity(disk);
+}
+
/*
* Get a disk whose integrity profile reflects the table's profile.
- * If %match_all is true, all devices' profiles must match.
- * If %match_all is false, all devices must at least have an
- * allocated integrity profile; but uninitialized is ok.
* Returns NULL if integrity support was inconsistent or unavailable.
*/
-static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
- bool match_all)
+static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t)
{
struct list_head *devices = dm_table_get_devices(t);
struct dm_dev_internal *dd = NULL;
@@ -1030,10 +1031,8 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
list_for_each_entry(dd, devices, list) {
template_disk = dd->dm_dev->bdev->bd_disk;
- if (!blk_get_integrity(template_disk))
+ if (!integrity_profile_exists(template_disk))
goto no_integrity;
- if (!match_all && !blk_integrity_is_initialized(template_disk))
- continue; /* skip uninitialized profiles */
else if (prev_disk &&
blk_integrity_compare(prev_disk, template_disk) < 0)
goto no_integrity;
@@ -1052,34 +1051,40 @@ no_integrity:
}
/*
- * Register the mapped device for blk_integrity support if
- * the underlying devices have an integrity profile. But all devices
- * may not have matching profiles (checking all devices isn't reliable
+ * Register the mapped device for blk_integrity support if the
+ * underlying devices have an integrity profile. But all devices may
+ * not have matching profiles (checking all devices isn't reliable
* during table load because this table may use other DM device(s) which
- * must be resumed before they will have an initialized integity profile).
- * Stacked DM devices force a 2 stage integrity profile validation:
- * 1 - during load, validate all initialized integrity profiles match
- * 2 - during resume, validate all integrity profiles match
+ * must be resumed before they will have an initialized integity
+ * profile). Consequently, stacked DM devices force a 2 stage integrity
+ * profile validation: First pass during table load, final pass during
+ * resume.
*/
-static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
+static int dm_table_register_integrity(struct dm_table *t)
{
+ struct mapped_device *md = t->md;
struct gendisk *template_disk = NULL;
- template_disk = dm_table_get_integrity_disk(t, false);
+ template_disk = dm_table_get_integrity_disk(t);
if (!template_disk)
return 0;
- if (!blk_integrity_is_initialized(dm_disk(md))) {
+ if (!integrity_profile_exists(dm_disk(md))) {
t->integrity_supported = 1;
- return blk_integrity_register(dm_disk(md), NULL);
+ /*
+ * Register integrity profile during table load; we can do
+ * this because the final profile must match during resume.
+ */
+ blk_integrity_register(dm_disk(md),
+ blk_get_integrity(template_disk));
+ return 0;
}
/*
- * If DM device already has an initalized integrity
+ * If DM device already has an initialized integrity
* profile the new profile should not conflict.
*/
- if (blk_integrity_is_initialized(template_disk) &&
- blk_integrity_compare(dm_disk(md), template_disk) < 0) {
+ if (blk_integrity_compare(dm_disk(md), template_disk) < 0) {
DMWARN("%s: conflict with existing integrity profile: "
"%s profile mismatch",
dm_device_name(t->md),
@@ -1087,7 +1092,7 @@ static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device
return 1;
}
- /* Preserve existing initialized integrity profile */
+ /* Preserve existing integrity profile */
t->integrity_supported = 1;
return 0;
}
@@ -1112,7 +1117,7 @@ int dm_table_complete(struct dm_table *t)
return r;
}
- r = dm_table_prealloc_integrity(t, t->md);
+ r = dm_table_register_integrity(t);
if (r) {
DMERR("could not register integrity profile.");
return r;
@@ -1278,29 +1283,30 @@ combine_limits:
}
/*
- * Set the integrity profile for this device if all devices used have
- * matching profiles. We're quite deep in the resume path but still
- * don't know if all devices (particularly DM devices this device
- * may be stacked on) have matching profiles. Even if the profiles
- * don't match we have no way to fail (to resume) at this point.
+ * Verify that all devices have an integrity profile that matches the
+ * DM device's registered integrity profile. If the profiles don't
+ * match then unregister the DM device's integrity profile.
*/
-static void dm_table_set_integrity(struct dm_table *t)
+static void dm_table_verify_integrity(struct dm_table *t)
{
struct gendisk *template_disk = NULL;
- if (!blk_get_integrity(dm_disk(t->md)))
- return;
+ if (t->integrity_supported) {
+ /*
+ * Verify that the original integrity profile
+ * matches all the devices in this table.
+ */
+ template_disk = dm_table_get_integrity_disk(t);
+ if (template_disk &&
+ blk_integrity_compare(dm_disk(t->md), template_disk) >= 0)
+ return;
+ }
- template_disk = dm_table_get_integrity_disk(t, true);
- if (template_disk)
- blk_integrity_register(dm_disk(t->md),
- blk_get_integrity(template_disk));
- else if (blk_integrity_is_initialized(dm_disk(t->md)))
- DMWARN("%s: device no longer has a valid integrity profile",
- dm_device_name(t->md));
- else
+ if (integrity_profile_exists(dm_disk(t->md))) {
DMWARN("%s: unable to establish an integrity profile",
dm_device_name(t->md));
+ blk_integrity_unregister(dm_disk(t->md));
+ }
}
static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
@@ -1500,7 +1506,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
- dm_table_set_integrity(t);
+ dm_table_verify_integrity(t);
/*
* Determine whether or not this queue's I/O timings contribute
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 6ba47cfb1..c219a053c 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -396,7 +396,9 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
}
}
- return dm_bm_unlock(b);
+ dm_bm_unlock(b);
+
+ return 0;
}
static void __setup_btree_details(struct dm_pool_metadata *pmd)
@@ -650,7 +652,9 @@ static int __open_metadata(struct dm_pool_metadata *pmd)
}
__setup_btree_details(pmd);
- return dm_bm_unlock(sblock);
+ dm_bm_unlock(sblock);
+
+ return 0;
bad_cleanup_data_sm:
dm_sm_destroy(pmd->data_sm);
@@ -1203,6 +1207,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
dm_block_t held_root;
/*
+ * We commit to ensure the btree roots which we increment in a
+ * moment are up to date.
+ */
+ __commit_transaction(pmd);
+
+ /*
* Copy the superblock.
*/
dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
@@ -1297,7 +1307,9 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
dm_sm_dec_block(pmd->metadata_sm, held_root);
- return dm_tm_unlock(pmd->tm, copy);
+ dm_tm_unlock(pmd->tm, copy);
+
+ return 0;
}
int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
@@ -1327,7 +1339,9 @@ static int __get_metadata_snap(struct dm_pool_metadata *pmd,
disk_super = dm_block_data(sblock);
*result = le64_to_cpu(disk_super->held_root);
- return dm_bm_unlock(sblock);
+ dm_bm_unlock(sblock);
+
+ return 0;
}
int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
@@ -1530,7 +1544,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
{
int r;
- unsigned count;
+ unsigned count, total_count = 0;
struct dm_pool_metadata *pmd = td->pmd;
dm_block_t keys[1] = { td->id };
__le64 value;
@@ -1553,11 +1567,29 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_
if (r)
return r;
- r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
- if (r)
- return r;
+ /*
+ * Remove leaves stops at the first unmapped entry, so we have to
+ * loop round finding mapped ranges.
+ */
+ while (begin < end) {
+ r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
+ if (r == -ENODATA)
+ break;
+
+ if (r)
+ return r;
+
+ if (begin >= end)
+ break;
+
+ r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
+ if (r)
+ return r;
+
+ total_count += count;
+ }
- td->mapped_blocks -= count;
+ td->mapped_blocks -= total_count;
td->changed = 1;
/*
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 3897b90bd..63903a5a5 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2432,6 +2432,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
case PM_WRITE:
if (old_mode != new_mode)
notify_of_pool_mode_change(pool, "write");
+ pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
pool->process_discard = process_discard_bio;
@@ -4249,10 +4250,9 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct thin_c *tc = ti->private;
struct pool *pool = tc->pool;
- struct queue_limits *pool_limits = dm_get_queue_limits(pool->pool_md);
- if (!pool_limits->discard_granularity)
- return; /* pool's discard support is disabled */
+ if (!pool->pf.discard_enabled)
+ return;
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index edc624bcc..ccf41886e 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -631,18 +631,17 @@ static void verity_status(struct dm_target *ti, status_type_t type,
}
}
-static int verity_ioctl(struct dm_target *ti, unsigned cmd,
- unsigned long arg)
+static int verity_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode)
{
struct dm_verity *v = ti->private;
- int r = 0;
+
+ *bdev = v->data_dev->bdev;
if (v->data_start ||
ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
-
- return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
- cmd, arg);
+ return 1;
+ return 0;
}
static int verity_iterate_devices(struct dm_target *ti,
@@ -965,7 +964,7 @@ static struct target_type verity_target = {
.dtr = verity_dtr,
.map = verity_map,
.status = verity_status,
- .ioctl = verity_ioctl,
+ .prepare_ioctl = verity_prepare_ioctl,
.iterate_devices = verity_iterate_devices,
.io_hints = verity_io_hints,
};
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1b5c6047e..5df404802 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,7 @@
#include <linux/ktime.h>
#include <linux/elevator.h> /* for rq_end_sector() */
#include <linux/blk-mq.h>
+#include <linux/pr.h>
#include <trace/events/block.h>
@@ -555,18 +556,16 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
-static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+static int dm_get_live_table_for_ioctl(struct mapped_device *md,
+ struct dm_target **tgt, struct block_device **bdev,
+ fmode_t *mode, int *srcu_idx)
{
- struct mapped_device *md = bdev->bd_disk->private_data;
- int srcu_idx;
struct dm_table *map;
- struct dm_target *tgt;
- int r = -ENOTTY;
+ int r;
retry:
- map = dm_get_live_table(md, &srcu_idx);
-
+ r = -ENOTTY;
+ map = dm_get_live_table(md, srcu_idx);
if (!map || !dm_table_get_size(map))
goto out;
@@ -574,8 +573,9 @@ retry:
if (dm_table_get_num_targets(map) != 1)
goto out;
- tgt = dm_table_get_target(map, 0);
- if (!tgt->type->ioctl)
+ *tgt = dm_table_get_target(map, 0);
+
+ if (!(*tgt)->type->prepare_ioctl)
goto out;
if (dm_suspended_md(md)) {
@@ -583,16 +583,47 @@ retry:
goto out;
}
- r = tgt->type->ioctl(tgt, cmd, arg);
+ r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode);
+ if (r < 0)
+ goto out;
-out:
- dm_put_live_table(md, srcu_idx);
+ return r;
- if (r == -ENOTCONN) {
+out:
+ dm_put_live_table(md, *srcu_idx);
+ if (r == -ENOTCONN && !fatal_signal_pending(current)) {
msleep(10);
goto retry;
}
+ return r;
+}
+
+static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ struct dm_target *tgt;
+ struct block_device *tgt_bdev = NULL;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ /*
+ * Target determined this ioctl is being issued against
+ * a logical partition of the parent bdev; so extra
+ * validation is needed.
+ */
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+ if (r)
+ goto out;
+ }
+
+ r = __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg);
+out:
+ dm_put_live_table(md, srcu_idx);
return r;
}
@@ -1725,7 +1756,7 @@ static void __split_and_process_bio(struct mapped_device *md,
* The request function that just remaps the bio built up by
* dm_merge_bvec.
*/
-static void dm_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
{
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
@@ -1734,8 +1765,6 @@ static void dm_make_request(struct request_queue *q, struct bio *bio)
map = dm_get_live_table(md, &srcu_idx);
- blk_queue_split(q, &bio, q->bio_split);
-
generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
/* if we're suspended, we have to queue this io for later */
@@ -1746,12 +1775,12 @@ static void dm_make_request(struct request_queue *q, struct bio *bio)
queue_io(md, bio);
else
bio_io_error(bio);
- return;
+ return BLK_QC_T_NONE;
}
__split_and_process_bio(md, map, bio);
dm_put_live_table(md, srcu_idx);
- return;
+ return BLK_QC_T_NONE;
}
int dm_request_based(struct mapped_device *md)
@@ -2198,6 +2227,13 @@ static void dm_init_md_queue(struct mapped_device *md)
* This queue is new, so no concurrency on the queue_flags.
*/
queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+
+ /*
+ * Initialize data that will only be used by a non-blk-mq DM queue
+ * - must do so here (in alloc_dev callchain) before queue is used
+ */
+ md->queue->queuedata = md;
+ md->queue->backing_dev_info.congested_data = md;
}
static void dm_init_old_md_queue(struct mapped_device *md)
@@ -2208,10 +2244,7 @@ static void dm_init_old_md_queue(struct mapped_device *md)
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
- md->queue->queuedata = md;
md->queue->backing_dev_info.congested_fn = dm_any_congested;
- md->queue->backing_dev_info.congested_data = md;
-
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
}
@@ -2221,10 +2254,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
destroy_workqueue(md->wq);
if (md->kworker_task)
kthread_stop(md->kworker_task);
- if (md->io_pool)
- mempool_destroy(md->io_pool);
- if (md->rq_pool)
- mempool_destroy(md->rq_pool);
+ mempool_destroy(md->io_pool);
+ mempool_destroy(md->rq_pool);
if (md->bs)
bioset_free(md->bs);
@@ -2234,8 +2265,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
spin_unlock(&_minor_lock);
- if (blk_get_integrity(md->disk))
- blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
put_disk(md->disk);
}
@@ -2761,6 +2790,12 @@ int dm_setup_md_queue(struct mapped_device *md)
case DM_TYPE_BIO_BASED:
dm_init_old_md_queue(md);
blk_queue_make_request(md->queue, dm_make_request);
+ /*
+ * DM handles splitting bios as needed. Free the bio_split bioset
+ * since it won't be used (saves 1 process per bio-based DM device).
+ */
+ bioset_free(md->queue->bio_split);
+ md->queue->bio_split = NULL;
break;
}
@@ -3507,11 +3542,8 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
if (!pools)
return;
- if (pools->io_pool)
- mempool_destroy(pools->io_pool);
-
- if (pools->rq_pool)
- mempool_destroy(pools->rq_pool);
+ mempool_destroy(pools->io_pool);
+ mempool_destroy(pools->rq_pool);
if (pools->bs)
bioset_free(pools->bs);
@@ -3519,11 +3551,133 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
kfree(pools);
}
+static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
+ u32 flags)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_register)
+ r = ops->pr_register(bdev, old_key, new_key, flags);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
+ u32 flags)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_reserve)
+ r = ops->pr_reserve(bdev, key, type, flags);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_release)
+ r = ops->pr_release(bdev, key, type);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
+ enum pr_type type, bool abort)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_preempt)
+ r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static int dm_pr_clear(struct block_device *bdev, u64 key)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ const struct pr_ops *ops;
+ struct dm_target *tgt;
+ fmode_t mode;
+ int srcu_idx, r;
+
+ r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ if (r < 0)
+ return r;
+
+ ops = bdev->bd_disk->fops->pr_ops;
+ if (ops && ops->pr_clear)
+ r = ops->pr_clear(bdev, key);
+ else
+ r = -EOPNOTSUPP;
+
+ dm_put_live_table(md, srcu_idx);
+ return r;
+}
+
+static const struct pr_ops dm_pr_ops = {
+ .pr_register = dm_pr_register,
+ .pr_reserve = dm_pr_reserve,
+ .pr_release = dm_pr_release,
+ .pr_preempt = dm_pr_preempt,
+ .pr_clear = dm_pr_clear,
+};
+
static const struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
.getgeo = dm_blk_getgeo,
+ .pr_ops = &dm_pr_ops,
.owner = THIS_MODULE
};
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 11e3bc9d2..d6a1126d8 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -28,6 +28,7 @@ struct dlm_lock_resource {
struct completion completion; /* completion for synchronized locking */
void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
struct mddev *mddev; /* pointing back to mddev. */
+ int mode;
};
struct suspend_info {
@@ -53,8 +54,8 @@ struct md_cluster_info {
dlm_lockspace_t *lockspace;
int slot_number;
struct completion completion;
- struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres;
+ struct dlm_lock_resource *resync_lockres;
struct list_head suspend_list;
spinlock_t suspend_lock;
struct md_thread *recovery_thread;
@@ -79,20 +80,20 @@ enum msg_type {
};
struct cluster_msg {
- int type;
- int slot;
+ __le32 type;
+ __le32 slot;
/* TODO: Unionize this for smaller footprint */
- sector_t low;
- sector_t high;
+ __le64 low;
+ __le64 high;
char uuid[16];
- int raid_slot;
+ __le32 raid_slot;
};
static void sync_ast(void *arg)
{
struct dlm_lock_resource *res;
- res = (struct dlm_lock_resource *) arg;
+ res = arg;
complete(&res->completion);
}
@@ -106,6 +107,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
if (ret)
return ret;
wait_for_completion(&res->completion);
+ if (res->lksb.sb_status == 0)
+ res->mode = mode;
return res->lksb.sb_status;
}
@@ -127,6 +130,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
init_completion(&res->completion);
res->ls = cinfo->lockspace;
res->mddev = mddev;
+ res->mode = DLM_LOCK_IV;
namelen = strlen(name);
res->name = kzalloc(namelen + 1, GFP_KERNEL);
if (!res->name) {
@@ -191,8 +195,8 @@ retry:
kfree(res);
}
-static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
- sector_t lo, sector_t hi)
+static void add_resync_info(struct dlm_lock_resource *lockres,
+ sector_t lo, sector_t hi)
{
struct resync_info *ri;
@@ -210,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc
dlm_lock_sync(lockres, DLM_LOCK_CR);
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
hi = le64_to_cpu(ri.hi);
- if (ri.hi > 0) {
+ if (hi > 0) {
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
if (!s)
goto out;
@@ -345,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = {
*/
static void ack_bast(void *arg, int mode)
{
- struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
+ struct dlm_lock_resource *res = arg;
struct md_cluster_info *cinfo = res->mddev->cluster_info;
if (mode == DLM_LOCK_EX)
@@ -358,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
if (slot == s->slot) {
- pr_info("%s:%d Deleting suspend_info: %d\n",
- __func__, __LINE__, slot);
list_del(&s->list);
kfree(s);
break;
}
}
-static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
+static void remove_suspend_info(struct mddev *mddev, int slot)
{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
spin_lock_irq(&cinfo->suspend_lock);
__remove_suspend_info(cinfo, slot);
spin_unlock_irq(&cinfo->suspend_lock);
+ mddev->pers->quiesce(mddev, 2);
}
-static void process_suspend_info(struct md_cluster_info *cinfo,
+static void process_suspend_info(struct mddev *mddev,
int slot, sector_t lo, sector_t hi)
{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
struct suspend_info *s;
if (!hi) {
- remove_suspend_info(cinfo, slot);
+ remove_suspend_info(mddev, slot);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
return;
}
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
@@ -389,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
s->slot = slot;
s->lo = lo;
s->hi = hi;
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
spin_lock_irq(&cinfo->suspend_lock);
/* Remove existing entry (if exists) before adding */
__remove_suspend_info(cinfo, slot);
list_add(&s->list, &cinfo->suspend_list);
spin_unlock_irq(&cinfo->suspend_lock);
+ mddev->pers->quiesce(mddev, 2);
}
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
@@ -407,7 +417,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
sprintf(disk_uuid + len, "%pU", cmsg->uuid);
- snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
+ snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion);
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
@@ -421,64 +431,59 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
-
- md_reload_sb(mddev);
+ md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
}
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
- struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+ struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
+ le32_to_cpu(msg->raid_slot));
if (rdev)
md_kick_rdev_from_array(rdev);
else
- pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
+ pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
+ __func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
- struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+ struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
+ le32_to_cpu(msg->raid_slot));
if (rdev && test_bit(Faulty, &rdev->flags))
clear_bit(Faulty, &rdev->flags);
else
- pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
+ pr_warn("%s: %d Could not find disk(%d) which is faulty",
+ __func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
- switch (msg->type) {
+ if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
+ "node %d received it's own msg\n", le32_to_cpu(msg->slot)))
+ return;
+ switch (le32_to_cpu(msg->type)) {
case METADATA_UPDATED:
- pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
- __func__, __LINE__, msg->slot);
process_metadata_update(mddev, msg);
break;
case RESYNCING:
- pr_info("%s: %d Received message: RESYNCING from %d\n",
- __func__, __LINE__, msg->slot);
- process_suspend_info(mddev->cluster_info, msg->slot,
- msg->low, msg->high);
+ process_suspend_info(mddev, le32_to_cpu(msg->slot),
+ le64_to_cpu(msg->low),
+ le64_to_cpu(msg->high));
break;
case NEWDISK:
- pr_info("%s: %d Received message: NEWDISK from %d\n",
- __func__, __LINE__, msg->slot);
process_add_new_disk(mddev, msg);
break;
case REMOVE:
- pr_info("%s: %d Received REMOVE from %d\n",
- __func__, __LINE__, msg->slot);
process_remove_disk(mddev, msg);
break;
case RE_ADD:
- pr_info("%s: %d Received RE_ADD from %d\n",
- __func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg);
break;
case BITMAP_NEEDS_SYNC:
- pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
- __func__, __LINE__, msg->slot);
- __recover_slot(mddev, msg->slot);
+ __recover_slot(mddev, le32_to_cpu(msg->slot));
break;
default:
pr_warn("%s:%d Received unknown message from %d\n",
@@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread)
/* lock_comm()
* Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway.
+ * If called again, and the TOKEN lock is alread in EX mode
+ * return success. However, care must be taken that unlock_comm()
+ * is called only once.
*/
static int lock_comm(struct md_cluster_info *cinfo)
{
int error;
+ if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+ return 0;
+
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
if (error)
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
@@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo)
static void unlock_comm(struct md_cluster_info *cinfo)
{
+ WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
dlm_unlock_sync(cinfo->token_lockres);
}
@@ -696,7 +708,6 @@ static int join(struct mddev *mddev, int nodes)
init_completion(&cinfo->completion);
set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
- mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo;
memset(str, 0, 64);
@@ -753,6 +764,10 @@ static int join(struct mddev *mddev, int nodes)
goto err;
}
+ cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
+ if (!cinfo->resync_lockres)
+ goto err;
+
ret = gather_all_resync_info(mddev, nodes);
if (ret)
goto err;
@@ -763,6 +778,7 @@ err:
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
+ lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres);
if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2);
@@ -771,12 +787,32 @@ err:
return ret;
}
+static void resync_bitmap(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg = {0};
+ int err;
+
+ cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
+ err = sendmsg(cinfo, &cmsg);
+ if (err)
+ pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
+ __func__, __LINE__, err);
+}
+
static int leave(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
if (!cinfo)
return 0;
+
+ /* BITMAP_NEEDS_SYNC message should be sent when node
+ * is leaving the cluster with dirty bitmap, also we
+ * can only deliver it when dlm connection is available */
+ if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
+ resync_bitmap(mddev);
+
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
@@ -799,15 +835,6 @@ static int slot_number(struct mddev *mddev)
return cinfo->slot_number - 1;
}
-static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
-{
- struct md_cluster_info *cinfo = mddev->cluster_info;
-
- add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
- /* Re-acquire the lock to refresh LVB */
- dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
-}
-
static int metadata_update_start(struct mddev *mddev)
{
return lock_comm(mddev->cluster_info);
@@ -817,59 +844,62 @@ static int metadata_update_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
- int ret;
+ struct md_rdev *rdev;
+ int ret = 0;
+ int raid_slot = -1;
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(METADATA_UPDATED);
- ret = __sendmsg(cinfo, &cmsg);
+ /* Pick up a good active device number to send.
+ */
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
+ raid_slot = rdev->desc_nr;
+ break;
+ }
+ if (raid_slot >= 0) {
+ cmsg.raid_slot = cpu_to_le32(raid_slot);
+ ret = __sendmsg(cinfo, &cmsg);
+ } else
+ pr_warn("md-cluster: No good device id found to send\n");
unlock_comm(cinfo);
return ret;
}
-static int metadata_update_cancel(struct mddev *mddev)
+static void metadata_update_cancel(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
+ unlock_comm(cinfo);
+}
- return dlm_unlock_sync(cinfo->token_lockres);
+static int resync_start(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
+ return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
}
-static int resync_send(struct mddev *mddev, enum msg_type type,
- sector_t lo, sector_t hi)
+static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
- struct cluster_msg cmsg;
- int slot = cinfo->slot_number - 1;
+ struct cluster_msg cmsg = {0};
- pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
- (unsigned long long)lo,
- (unsigned long long)hi);
- resync_info_update(mddev, lo, hi);
- cmsg.type = cpu_to_le32(type);
- cmsg.slot = cpu_to_le32(slot);
+ add_resync_info(cinfo->bitmap_lockres, lo, hi);
+ /* Re-acquire the lock to refresh LVB */
+ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
+ cmsg.type = cpu_to_le32(RESYNCING);
cmsg.low = cpu_to_le64(lo);
cmsg.high = cpu_to_le64(hi);
- return sendmsg(cinfo, &cmsg);
-}
-static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
-{
- pr_info("%s:%d\n", __func__, __LINE__);
- return resync_send(mddev, RESYNCING, lo, hi);
+ return sendmsg(cinfo, &cmsg);
}
-static void resync_finish(struct mddev *mddev)
+static int resync_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
- struct cluster_msg cmsg;
- int slot = cinfo->slot_number - 1;
-
- pr_info("%s:%d\n", __func__, __LINE__);
- resync_send(mddev, RESYNCING, 0, 0);
- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
- cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
- cmsg.slot = cpu_to_le32(slot);
- sendmsg(cinfo, &cmsg);
- }
+ cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
+ dlm_unlock_sync(cinfo->resync_lockres);
+ return resync_info_update(mddev, 0, 0);
}
static int area_resyncing(struct mddev *mddev, int direction,
@@ -896,7 +926,11 @@ out:
return ret;
}
-static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
+/* add_new_disk() - initiates a disk add
+ * However, if this fails before writing md_update_sb(),
+ * add_new_disk_cancel() must be called to release token lock
+ */
+static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
@@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16);
- cmsg.raid_slot = rdev->desc_nr;
+ cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
lock_comm(cinfo);
ret = __sendmsg(cinfo, &cmsg);
if (ret)
@@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
/* Some node does not "see" the device */
if (ret == -EAGAIN)
ret = -ENOENT;
+ if (ret)
+ unlock_comm(cinfo);
else
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
return ret;
}
-static int add_new_disk_finish(struct mddev *mddev)
+static void add_new_disk_cancel(struct mddev *mddev)
{
- struct cluster_msg cmsg;
struct md_cluster_info *cinfo = mddev->cluster_info;
- int ret;
- /* Write sb and inform others */
- md_update_sb(mddev, 1);
- cmsg.type = METADATA_UPDATED;
- ret = __sendmsg(cinfo, &cmsg);
unlock_comm(cinfo);
- return ret;
}
static int new_disk_ack(struct mddev *mddev, bool ack)
@@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
- struct cluster_msg cmsg;
+ struct cluster_msg cmsg = {0};
struct md_cluster_info *cinfo = mddev->cluster_info;
- cmsg.type = REMOVE;
- cmsg.raid_slot = rdev->desc_nr;
+ cmsg.type = cpu_to_le32(REMOVE);
+ cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
return __sendmsg(cinfo, &cmsg);
}
@@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev)
{
int sn, err;
sector_t lo, hi;
- struct cluster_msg cmsg;
+ struct cluster_msg cmsg = {0};
struct mddev *mddev = rdev->mddev;
struct md_cluster_info *cinfo = mddev->cluster_info;
- cmsg.type = RE_ADD;
- cmsg.raid_slot = rdev->desc_nr;
+ cmsg.type = cpu_to_le32(RE_ADD);
+ cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
err = sendmsg(cinfo, &cmsg);
if (err)
goto out;
@@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = {
.join = join,
.leave = leave,
.slot_number = slot_number,
- .resync_info_update = resync_info_update,
.resync_start = resync_start,
.resync_finish = resync_finish,
+ .resync_info_update = resync_info_update,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
.metadata_update_cancel = metadata_update_cancel,
.area_resyncing = area_resyncing,
- .add_new_disk_start = add_new_disk_start,
- .add_new_disk_finish = add_new_disk_finish,
+ .add_new_disk = add_new_disk,
+ .add_new_disk_cancel = add_new_disk_cancel,
.new_disk_ack = new_disk_ack,
.remove_disk = remove_disk,
.gather_bitmaps = gather_bitmaps,
@@ -1022,5 +1051,6 @@ static void cluster_exit(void)
module_init(cluster_init);
module_exit(cluster_exit);
+MODULE_AUTHOR("SUSE");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 00defe2ba..e75ea2613 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -12,15 +12,15 @@ struct md_cluster_operations {
int (*join)(struct mddev *mddev, int nodes);
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
- void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
- int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
- void (*resync_finish)(struct mddev *mddev);
+ int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
- int (*metadata_update_cancel)(struct mddev *mddev);
+ void (*metadata_update_cancel)(struct mddev *mddev);
+ int (*resync_start)(struct mddev *mddev);
+ int (*resync_finish)(struct mddev *mddev);
int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
- int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
- int (*add_new_disk_finish)(struct mddev *mddev);
+ int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev);
+ void (*add_new_disk_cancel)(struct mddev *mddev);
int (*new_disk_ack)(struct mddev *mddev, bool ack);
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
int (*gather_bitmaps)(struct md_rdev *rdev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3fe3d04a9..61aacab42 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -250,7 +250,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
* call has finished, the bio has been linked into some internal structure
* and so is visible to ->quiesce(), so we don't need the refcount any more.
*/
-static void md_make_request(struct request_queue *q, struct bio *bio)
+static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
const int rw = bio_data_dir(bio);
struct mddev *mddev = q->queuedata;
@@ -262,13 +262,13 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
if (mddev == NULL || mddev->pers == NULL
|| !mddev->ready) {
bio_io_error(bio);
- return;
+ return BLK_QC_T_NONE;
}
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
bio->bi_error = -EROFS;
bio_endio(bio);
- return;
+ return BLK_QC_T_NONE;
}
smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
@@ -302,6 +302,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
wake_up(&mddev->sb_wait);
+
+ return BLK_QC_T_NONE;
}
/* mddev_suspend makes sure no new requests are submitted
@@ -312,8 +314,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
*/
void mddev_suspend(struct mddev *mddev)
{
- BUG_ON(mddev->suspended);
- mddev->suspended = 1;
+ if (mddev->suspended++)
+ return;
synchronize_rcu();
wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
mddev->pers->quiesce(mddev, 1);
@@ -324,7 +326,8 @@ EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev)
{
- mddev->suspended = 0;
+ if (--mddev->suspended)
+ return;
wake_up(&mddev->sb_wait);
mddev->pers->quiesce(mddev, 0);
@@ -1608,7 +1611,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
++ev1;
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
- le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
+ (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
+ le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
if (ev1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
@@ -1628,16 +1632,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
int role;
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
- role = 0xffff;
+ role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} else
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
- case 0xffff: /* spare */
+ case MD_DISK_ROLE_SPARE: /* spare */
break;
- case 0xfffe: /* faulty */
+ case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags);
break;
+ case MD_DISK_ROLE_JOURNAL: /* journal device */
+ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
+ /* journal device without journal feature */
+ printk(KERN_WARNING
+ "md: journal device provided without journal feature, ignoring the device\n");
+ return -EINVAL;
+ }
+ set_bit(Journal, &rdev->flags);
+ rdev->journal_tail = le64_to_cpu(sb->journal_tail);
+ if (mddev->recovery_cp == MaxSector)
+ set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
+ rdev->raid_disk = 0;
+ break;
default:
rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) &
@@ -1655,6 +1672,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
+ set_bit(MD_HAS_JOURNAL, &mddev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
@@ -1679,6 +1698,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync)
sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
+ sb->resync_offset = cpu_to_le64(MaxSector);
else
sb->resync_offset = cpu_to_le64(0);
@@ -1702,7 +1723,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
}
- if (rdev->raid_disk >= 0 &&
+ if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags)) {
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
@@ -1712,6 +1733,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
}
+ /* Note: recovery_offset and journal_tail share space */
+ if (test_bit(Journal, &rdev->flags))
+ sb->journal_tail = cpu_to_le64(rdev->journal_tail);
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);
@@ -1735,6 +1759,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
}
}
+ if (mddev_is_clustered(mddev))
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
+
if (rdev->badblocks.count == 0)
/* Nothing to do for bad blocks*/ ;
else if (sb->bblog_offset == 0)
@@ -1785,18 +1812,23 @@ retry:
max_dev = le32_to_cpu(sb->max_dev);
for (i=0; i<max_dev;i++)
- sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
+
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
- sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
else if (test_bit(In_sync, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else if (test_bit(Journal, &rdev2->flags))
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
else if (rdev2->raid_disk >= 0)
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else
- sb->dev_roles[i] = cpu_to_le16(0xffff);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
}
sb->sb_csum = calc_sb_1_csum(sb);
@@ -1912,13 +1944,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
struct md_rdev *rdev, *rdev2;
rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev1)
- rdev_for_each_rcu(rdev2, mddev2)
+ rdev_for_each_rcu(rdev, mddev1) {
+ if (test_bit(Faulty, &rdev->flags) ||
+ test_bit(Journal, &rdev->flags) ||
+ rdev->raid_disk == -1)
+ continue;
+ rdev_for_each_rcu(rdev2, mddev2) {
+ if (test_bit(Faulty, &rdev2->flags) ||
+ test_bit(Journal, &rdev2->flags) ||
+ rdev2->raid_disk == -1)
+ continue;
if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
rcu_read_unlock();
return 1;
}
+ }
+ }
rcu_read_unlock();
return 0;
}
@@ -1962,12 +2004,9 @@ int md_integrity_register(struct mddev *mddev)
* All component devices are integrity capable and have matching
* profiles, register the common profile for the md device.
*/
- if (blk_integrity_register(mddev->gendisk,
- bdev_get_integrity(reference->bdev)) != 0) {
- printk(KERN_ERR "md: failed to register integrity for %s\n",
- mdname(mddev));
- return -EINVAL;
- }
+ blk_integrity_register(mddev->gendisk,
+ bdev_get_integrity(reference->bdev));
+
printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
printk(KERN_ERR "md: failed to create integrity pool for %s\n",
@@ -1997,6 +2036,7 @@ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
if (bi_rdev && blk_integrity_compare(mddev->gendisk,
rdev->bdev->bd_disk) >= 0)
return;
+ WARN_ON_ONCE(!mddev->suspended);
printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
blk_integrity_unregister(mddev->gendisk);
}
@@ -2196,23 +2236,77 @@ static void sync_sbs(struct mddev *mddev, int nospares)
}
}
+static bool does_sb_need_changing(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ struct mdp_superblock_1 *sb;
+ int role;
+
+ /* Find a good rdev */
+ rdev_for_each(rdev, mddev)
+ if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
+ break;
+
+ /* No good device found. */
+ if (!rdev)
+ return false;
+
+ sb = page_address(rdev->sb_page);
+ /* Check if a device has become faulty or a spare become active */
+ rdev_for_each(rdev, mddev) {
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ /* Device activated? */
+ if (role == 0xffff && rdev->raid_disk >=0 &&
+ !test_bit(Faulty, &rdev->flags))
+ return true;
+ /* Device turned faulty? */
+ if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
+ return true;
+ }
+
+ /* Check if any mddev parameters have changed */
+ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
+ (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
+ (mddev->layout != le64_to_cpu(sb->layout)) ||
+ (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
+ (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
+ return true;
+
+ return false;
+}
+
void md_update_sb(struct mddev *mddev, int force_change)
{
struct md_rdev *rdev;
int sync_req;
int nospares = 0;
int any_badblocks_changed = 0;
+ int ret = -1;
if (mddev->ro) {
if (force_change)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return;
}
+
+ if (mddev_is_clustered(mddev)) {
+ if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+ force_change = 1;
+ ret = md_cluster_ops->metadata_update_start(mddev);
+ /* Has someone else has updated the sb */
+ if (!does_sb_need_changing(mddev)) {
+ if (ret == 0)
+ md_cluster_ops->metadata_update_cancel(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ return;
+ }
+ }
repeat:
/* First make sure individual recovery_offsets are correct */
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset)
rdev->recovery_offset = mddev->curr_resync_completed;
@@ -2356,6 +2450,9 @@ repeat:
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
+
+ if (mddev_is_clustered(mddev) && ret == 0)
+ md_cluster_ops->metadata_update_finish(mddev);
}
EXPORT_SYMBOL(md_update_sb);
@@ -2431,6 +2528,10 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%sin_sync",sep);
sep = ",";
}
+ if (test_bit(Journal, &flags)) {
+ len += sprintf(page+len, "%sjournal",sep);
+ sep = ",";
+ }
if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
@@ -2442,6 +2543,7 @@ state_show(struct md_rdev *rdev, char *page)
sep = ",";
}
if (!test_bit(Faulty, &flags) &&
+ !test_bit(Journal, &flags) &&
!test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sspare", sep);
sep = ",";
@@ -2490,17 +2592,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = -EBUSY;
else {
struct mddev *mddev = rdev->mddev;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->remove_disk(mddev, rdev);
- md_kick_rdev_from_array(rdev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
- if (mddev->pers)
- md_update_sb(mddev, 1);
- md_new_event(mddev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
err = 0;
+ if (mddev_is_clustered(mddev))
+ err = md_cluster_ops->remove_disk(mddev, rdev);
+
+ if (err == 0) {
+ md_kick_rdev_from_array(rdev);
+ if (mddev->pers)
+ md_update_sb(mddev, 1);
+ md_new_event(mddev);
+ }
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
@@ -2529,7 +2630,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags);
err = 0;
- } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
+ } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags)) {
if (rdev->mddev->pers == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->saved_raid_disk = rdev->raid_disk;
@@ -2548,6 +2650,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* check if recovery is needed.
*/
if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
@@ -2625,7 +2728,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
static ssize_t
slot_show(struct md_rdev *rdev, char *page)
{
- if (rdev->raid_disk < 0)
+ if (test_bit(Journal, &rdev->flags))
+ return sprintf(page, "journal\n");
+ else if (rdev->raid_disk < 0)
return sprintf(page, "none\n");
else
return sprintf(page, "%d\n", rdev->raid_disk);
@@ -2637,6 +2742,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
int slot;
int err;
+ if (test_bit(Journal, &rdev->flags))
+ return -EBUSY;
if (strncmp(buf, "none", 4)==0)
slot = -1;
else {
@@ -2667,6 +2774,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
*/
+ int err;
if (rdev->raid_disk != -1)
return -EBUSY;
@@ -2841,6 +2949,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
sector_t oldsectors = rdev->sectors;
sector_t sectors;
+ if (test_bit(Journal, &rdev->flags))
+ return -EBUSY;
if (strict_blocks_to_sectors(buf, &sectors) < 0)
return -EINVAL;
if (rdev->data_offset != rdev->new_data_offset)
@@ -3198,20 +3308,14 @@ static void analyze_sbs(struct mddev *mddev)
md_kick_rdev_from_array(rdev);
continue;
}
- /* No device should have a Candidate flag
- * when reading devices
- */
- if (test_bit(Candidate, &rdev->flags)) {
- pr_info("md: kicking Cluster Candidate %s from array!\n",
- bdevname(rdev->bdev, b));
- md_kick_rdev_from_array(rdev);
- }
}
if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++;
rdev->raid_disk = rdev->desc_nr;
set_bit(In_sync, &rdev->flags);
- } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
+ } else if (rdev->raid_disk >=
+ (mddev->raid_disks - min(0, mddev->delta_disks)) &&
+ !test_bit(Journal, &rdev->flags)) {
rdev->raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
}
@@ -3269,6 +3373,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
{
unsigned long msec;
+ if (mddev_is_clustered(mddev)) {
+ pr_info("md: Safemode is disabled for clustered mode\n");
+ return -EINVAL;
+ }
+
if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
return -EINVAL;
if (msec == 0)
@@ -3869,7 +3978,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case clean:
if (mddev->pers) {
- restart_array(mddev);
+ err = restart_array(mddev);
+ if (err)
+ break;
spin_lock(&mddev->lock);
if (atomic_read(&mddev->writes_pending) == 0) {
if (mddev->in_sync == 0) {
@@ -3887,7 +3998,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case active:
if (mddev->pers) {
- restart_array(mddev);
+ err = restart_array(mddev);
+ if (err)
+ break;
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
wake_up(&mddev->sb_wait);
err = 0;
@@ -4066,12 +4179,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
return err;
if (mddev->pers) {
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
} else {
if (mddev->dev_sectors == 0 ||
mddev->dev_sectors > sectors)
@@ -4217,8 +4326,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
}
mddev_unlock(mddev);
}
- } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4231,8 +4339,12 @@ action_store(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
err = mddev_lock(mddev);
if (!err) {
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- err = mddev->pers->start_reshape(mddev);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ err = -EBUSY;
+ else {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ err = mddev->pers->start_reshape(mddev);
+ }
mddev_unlock(mddev);
}
if (err)
@@ -5183,7 +5295,10 @@ int md_run(struct mddev *mddev)
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
- mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
+ if (mddev_is_clustered(mddev))
+ mddev->safemode_delay = 0;
+ else
+ mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
spin_lock(&mddev->lock);
@@ -5226,6 +5341,9 @@ static int do_md_run(struct mddev *mddev)
goto out;
}
+ if (mddev_is_clustered(mddev))
+ md_allow_write(mddev);
+
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
@@ -5248,6 +5366,25 @@ static int restart_array(struct mddev *mddev)
return -EINVAL;
if (!mddev->ro)
return -EBUSY;
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ struct md_rdev *rdev;
+ bool has_journal = false;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags)) {
+ has_journal = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ /* Don't restart rw with journal missing/faulty */
+ if (!has_journal)
+ return -EINVAL;
+ }
+
mddev->safemode = 0;
mddev->ro = 0;
set_disk_ro(disk, 0);
@@ -5309,8 +5446,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
@@ -5324,13 +5459,13 @@ static void __md_stop_writes(struct mddev *mddev)
md_super_wait(mddev);
if (mddev->ro == 0 &&
- (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
+ ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
+ (mddev->flags & MD_UPDATE_SB_FLAGS))) {
/* mark array as shutdown cleanly */
- mddev->in_sync = 1;
+ if (!mddev_is_clustered(mddev))
+ mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
}
void md_stop_writes(struct mddev *mddev)
@@ -5542,7 +5677,6 @@ static int do_md_stop(struct mddev *mddev, int mode,
if (mddev->hold_active == UNTIL_STOP)
mddev->hold_active = 0;
}
- blk_integrity_unregister(disk);
md_new_event(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
return 0;
@@ -5792,6 +5926,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
info.state |= (1<<MD_DISK_ACTIVE);
info.state |= (1<<MD_DISK_SYNC);
}
+ if (test_bit(Journal, &rdev->flags))
+ info.state |= (1<<MD_DISK_JOURNAL);
if (test_bit(WriteMostly, &rdev->flags))
info.state |= (1<<MD_DISK_WRITEMOSTLY);
} else {
@@ -5906,23 +6042,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
else
clear_bit(WriteMostly, &rdev->flags);
+ if (info->state & (1<<MD_DISK_JOURNAL))
+ set_bit(Journal, &rdev->flags);
/*
* check whether the device shows up in other nodes
*/
if (mddev_is_clustered(mddev)) {
- if (info->state & (1 << MD_DISK_CANDIDATE)) {
- /* Through --cluster-confirm */
+ if (info->state & (1 << MD_DISK_CANDIDATE))
set_bit(Candidate, &rdev->flags);
- err = md_cluster_ops->new_disk_ack(mddev, true);
- if (err) {
- export_rdev(rdev);
- return err;
- }
- } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
+ else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */
- err = md_cluster_ops->add_new_disk_start(mddev, rdev);
+ err = md_cluster_ops->add_new_disk(mddev, rdev);
if (err) {
- md_cluster_ops->add_new_disk_finish(mddev);
export_rdev(rdev);
return err;
}
@@ -5931,13 +6062,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
+
if (err)
export_rdev(rdev);
- else
+
+ if (mddev_is_clustered(mddev)) {
+ if (info->state & (1 << MD_DISK_CANDIDATE))
+ md_cluster_ops->new_disk_ack(mddev, (err == 0));
+ else {
+ if (err)
+ md_cluster_ops->add_new_disk_cancel(mddev);
+ else
+ err = add_bound_rdev(rdev);
+ }
+
+ } else if (!err)
err = add_bound_rdev(rdev);
- if (mddev_is_clustered(mddev) &&
- (info->state & (1 << MD_DISK_CLUSTER_ADD)))
- md_cluster_ops->add_new_disk_finish(mddev);
+
return err;
}
@@ -5993,13 +6134,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
+ int ret = -1;
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
+ ret = md_cluster_ops->metadata_update_start(mddev);
+
+ if (rdev->raid_disk < 0)
+ goto kick_rdev;
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(mddev, rdev);
@@ -6007,20 +6152,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
if (rdev->raid_disk >= 0)
goto busy;
- if (mddev_is_clustered(mddev))
+kick_rdev:
+ if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->remove_disk(mddev, rdev);
md_kick_rdev_from_array(rdev);
md_update_sb(mddev, 1);
md_new_event(mddev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
-
return 0;
busy:
- if (mddev_is_clustered(mddev))
+ if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_cancel(mddev);
+
printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
bdevname(rdev->bdev,b), mdname(mddev));
return -EBUSY;
@@ -6071,14 +6215,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
goto abort_export;
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
clear_bit(In_sync, &rdev->flags);
rdev->desc_nr = -1;
rdev->saved_raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
- goto abort_clustered;
+ goto abort_export;
/*
* The rest should better be atomic, we can have disk failures
@@ -6088,9 +6230,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
rdev->raid_disk = -1;
md_update_sb(mddev, 1);
-
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
/*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
@@ -6100,9 +6239,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
md_new_event(mddev);
return 0;
-abort_clustered:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_cancel(mddev);
abort_export:
export_rdev(rdev);
return err;
@@ -6420,8 +6556,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
return rv;
}
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2);
@@ -6479,12 +6613,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
}
}
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
return rv;
err:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_cancel(mddev);
return rv;
}
@@ -7285,6 +7415,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
bdevname(rdev->bdev,b), rdev->desc_nr);
if (test_bit(WriteMostly, &rdev->flags))
seq_printf(seq, "(W)");
+ if (test_bit(Journal, &rdev->flags))
+ seq_printf(seq, "(J)");
if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)");
continue;
@@ -7597,11 +7729,7 @@ int md_allow_write(struct mddev *mddev)
mddev->safemode == 0)
mddev->safemode = 1;
spin_unlock(&mddev->lock);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
spin_unlock(&mddev->lock);
@@ -7633,6 +7761,7 @@ void md_do_sync(struct md_thread *thread)
struct md_rdev *rdev;
char *desc, *action = NULL;
struct blk_plug plug;
+ bool cluster_resync_finished = false;
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7742,6 +7871,7 @@ void md_do_sync(struct md_thread *thread)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < j)
@@ -7802,9 +7932,6 @@ void md_do_sync(struct md_thread *thread)
md_new_event(mddev);
update_time = jiffies;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_start(mddev, j, max_sectors);
-
blk_start_plug(&plug);
while (j < max_sectors) {
sector_t sectors;
@@ -7868,8 +7995,6 @@ void md_do_sync(struct md_thread *thread)
j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_info_update(mddev, j, max_sectors);
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliest that rebuild will be
@@ -7940,7 +8065,11 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
- /* tell personality that we are finished */
+ /* tell personality and other nodes that we are finished */
+ if (mddev_is_clustered(mddev)) {
+ md_cluster_ops->resync_finish(mddev);
+ cluster_resync_finished = true;
+ }
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
@@ -7968,6 +8097,7 @@ void md_do_sync(struct md_thread *thread)
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
@@ -7976,11 +8106,13 @@ void md_do_sync(struct md_thread *thread)
}
}
skip:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_finish(mddev);
-
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (mddev_is_clustered(mddev) &&
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !cluster_resync_finished)
+ md_cluster_ops->resync_finish(mddev);
+
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
@@ -8011,7 +8143,8 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) ||
- ! test_bit(In_sync, &rdev->flags)) &&
+ (!test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags))) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
@@ -8023,18 +8156,25 @@ static int remove_and_add_spares(struct mddev *mddev,
if (removed && mddev->kobj.sd)
sysfs_notify(&mddev->kobj, NULL, "degraded");
- if (this)
+ if (this && removed)
goto no_add;
rdev_for_each(rdev, mddev) {
+ if (this && this != rdev)
+ continue;
+ if (test_bit(Candidate, &rdev->flags))
+ continue;
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
spares++;
if (rdev->raid_disk >= 0)
continue;
if (test_bit(Faulty, &rdev->flags))
continue;
+ if (test_bit(Journal, &rdev->flags))
+ continue;
if (mddev->ro &&
! (rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags)))
@@ -8059,14 +8199,25 @@ no_add:
static void md_start_sync(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
+ int ret = 0;
+
+ if (mddev_is_clustered(mddev)) {
+ ret = md_cluster_ops->resync_start(mddev);
+ if (ret) {
+ mddev->sync_thread = NULL;
+ goto out;
+ }
+ }
mddev->sync_thread = md_register_thread(md_do_sync,
mddev,
"resync");
+out:
if (!mddev->sync_thread) {
- printk(KERN_ERR "%s: could not start resync"
- " thread...\n",
- mdname(mddev));
+ if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
+ printk(KERN_ERR "%s: could not start resync"
+ " thread...\n",
+ mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -8185,13 +8336,8 @@ void md_check_recovery(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
- if (mddev->flags & MD_UPDATE_SB_FLAGS) {
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
+ if (mddev->flags & MD_UPDATE_SB_FLAGS)
md_update_sb(mddev, 0);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
- }
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -8289,8 +8435,6 @@ void md_reap_sync_thread(struct mddev *mddev)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape)
mddev->pers->finish_reshape(mddev);
@@ -8303,8 +8447,6 @@ void md_reap_sync_thread(struct mddev *mddev)
rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -8927,25 +9069,128 @@ err_wq:
return ret;
}
-void md_reload_sb(struct mddev *mddev)
+static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
- struct md_rdev *rdev, *tmp;
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+ struct md_rdev *rdev2;
+ int role, ret;
+ char b[BDEVNAME_SIZE];
- rdev_for_each_safe(rdev, tmp, mddev) {
- rdev->sb_loaded = 0;
- ClearPageUptodate(rdev->sb_page);
+ /* Check for change of roles in the active devices */
+ rdev_for_each(rdev2, mddev) {
+ if (test_bit(Faulty, &rdev2->flags))
+ continue;
+
+ /* Check if the roles changed */
+ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
+
+ if (test_bit(Candidate, &rdev2->flags)) {
+ if (role == 0xfffe) {
+ pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
+ md_kick_rdev_from_array(rdev2);
+ continue;
+ }
+ else
+ clear_bit(Candidate, &rdev2->flags);
+ }
+
+ if (role != rdev2->raid_disk) {
+ /* got activated */
+ if (rdev2->raid_disk == -1 && role != 0xffff) {
+ rdev2->saved_raid_disk = role;
+ ret = remove_and_add_spares(mddev, rdev2);
+ pr_info("Activated spare: %s\n",
+ bdevname(rdev2->bdev,b));
+ continue;
+ }
+ /* device faulty
+ * We just want to do the minimum to mark the disk
+ * as faulty. The recovery is performed by the
+ * one who initiated the error.
+ */
+ if ((role == 0xfffe) || (role == 0xfffd)) {
+ md_error(mddev, rdev2);
+ clear_bit(Blocked, &rdev2->flags);
+ }
+ }
}
- mddev->raid_disks = 0;
- analyze_sbs(mddev);
- rdev_for_each_safe(rdev, tmp, mddev) {
- struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
- /* since we don't write to faulty devices, we figure out if the
- * disk is faulty by comparing events
- */
- if (mddev->events > sb->events)
- set_bit(Faulty, &rdev->flags);
+
+ if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
+ update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+
+ /* Finally set the event to be up to date */
+ mddev->events = le64_to_cpu(sb->events);
+}
+
+static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
+{
+ int err;
+ struct page *swapout = rdev->sb_page;
+ struct mdp_superblock_1 *sb;
+
+ /* Store the sb page of the rdev in the swapout temporary
+ * variable in case we err in the future
+ */
+ rdev->sb_page = NULL;
+ alloc_disk_sb(rdev);
+ ClearPageUptodate(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
+
+ if (err < 0) {
+ pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
+ __func__, __LINE__, rdev->desc_nr, err);
+ put_page(rdev->sb_page);
+ rdev->sb_page = swapout;
+ rdev->sb_loaded = 1;
+ return err;
+ }
+
+ sb = page_address(rdev->sb_page);
+ /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
+ * is not set
+ */
+
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
+ rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+
+ /* The other node finished recovery, call spare_active to set
+ * device In_sync and mddev->degraded
+ */
+ if (rdev->recovery_offset == MaxSector &&
+ !test_bit(In_sync, &rdev->flags) &&
+ mddev->pers->spare_active(mddev))
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
+
+ put_page(swapout);
+ return 0;
+}
+
+void md_reload_sb(struct mddev *mddev, int nr)
+{
+ struct md_rdev *rdev;
+ int err;
+
+ /* Find the rdev */
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev->desc_nr == nr)
+ break;
+ }
+
+ if (!rdev || rdev->desc_nr != nr) {
+ pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
+ return;
}
+ err = read_rdev(mddev, rdev);
+ if (err < 0)
+ return;
+
+ check_sb_changes(mddev, rdev);
+
+ /* Read all rdev's to update recovery_offset */
+ rdev_for_each_rcu(rdev, mddev)
+ read_rdev(mddev, rdev);
}
EXPORT_SYMBOL(md_reload_sb);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ab339571e..ca0b643fe 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -87,10 +87,16 @@ struct md_rdev {
* array and could again if we did a partial
* resync from the bitmap
*/
- sector_t recovery_offset;/* If this device has been partially
+ union {
+ sector_t recovery_offset;/* If this device has been partially
* recovered, this is where we were
* up to.
*/
+ sector_t journal_tail; /* If this device is a journal device,
+ * this is the journal tail (journal
+ * recovery start point)
+ */
+ };
atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that
@@ -172,6 +178,11 @@ enum flag_bits {
* This device is seen locally but not
* by the whole cluster
*/
+ Journal, /* This device is used as journal for
+ * raid-5/6.
+ * Usually, this device should be faster
+ * than other devices in the array
+ */
};
#define BB_LEN_MASK (0x00000000000001FFULL)
@@ -221,6 +232,8 @@ struct mddev {
#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
* md_ioctl checked on it.
*/
+#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */
+#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */
int suspended;
atomic_t active_io;
@@ -553,7 +566,9 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
+ if (!test_bit(Replacement, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
+ mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
@@ -563,7 +578,9 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
+ if (!test_bit(Replacement, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
+ mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
@@ -658,7 +675,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev);
extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
-extern void md_reload_sb(struct mddev *mddev);
+extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index d132f06af..7331a80d8 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -264,7 +264,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
spin_unlock_irq(&conf->device_lock);
rcu_assign_pointer(p->rdev, rdev);
err = 0;
+ mddev_suspend(mddev);
md_integrity_add_rdev(rdev, mddev);
+ mddev_resume(mddev);
break;
}
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index e64b61ad0..431a03067 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -233,9 +233,9 @@ static int get_ablock(struct dm_array_info *info, dm_block_t b,
/*
* Unlocks an array block.
*/
-static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
+static void unlock_ablock(struct dm_array_info *info, struct dm_block *block)
{
- return dm_tm_unlock(info->btree_info.tm, block);
+ dm_tm_unlock(info->btree_info.tm, block);
}
/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 88dbe7b97..f2393ba83 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -578,7 +578,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
}
EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
-int dm_bm_unlock(struct dm_block *b)
+void dm_bm_unlock(struct dm_block *b)
{
struct buffer_aux *aux;
aux = dm_bufio_get_aux_data(to_buffer(b));
@@ -590,8 +590,6 @@ int dm_bm_unlock(struct dm_block *b)
bl_up_read(&aux->lock);
dm_bufio_release(to_buffer(b));
-
- return 0;
}
EXPORT_SYMBOL_GPL(dm_bm_unlock);
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 84330f598..3627d1b76 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -94,7 +94,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
struct dm_block_validator *v,
struct dm_block **result);
-int dm_bm_unlock(struct dm_block *b);
+void dm_bm_unlock(struct dm_block *b);
/*
* It's a common idiom to have a superblock that should be committed last.
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index 8731b6ea0..a240990a7 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -52,7 +52,7 @@ void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt);
int new_block(struct dm_btree_info *info, struct dm_block **result);
-int unlock_block(struct dm_btree_info *info, struct dm_block *b);
+void unlock_block(struct dm_btree_info *info, struct dm_block *b);
/*
* Spines keep track of the rolling locks. There are 2 variants, read-only
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 2e4c4cb79..21ea537bd 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -165,9 +165,9 @@ static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt
return 0;
}
-static int exit_child(struct dm_btree_info *info, struct child *c)
+static void exit_child(struct dm_btree_info *info, struct child *c)
{
- return dm_tm_unlock(info->tm, c->block);
+ dm_tm_unlock(info->tm, c->block);
}
static void shift(struct btree_node *left, struct btree_node *right, int count)
@@ -249,13 +249,10 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
__rebalance2(info, parent, &left, &right);
- r = exit_child(info, &left);
- if (r) {
- exit_child(info, &right);
- return r;
- }
+ exit_child(info, &left);
+ exit_child(info, &right);
- return exit_child(info, &right);
+ return 0;
}
/*
@@ -394,22 +391,9 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
__rebalance3(info, parent, &left, &center, &right);
- r = exit_child(info, &left);
- if (r) {
- exit_child(info, &center);
- exit_child(info, &right);
- return r;
- }
-
- r = exit_child(info, &center);
- if (r) {
- exit_child(info, &right);
- return r;
- }
-
- r = exit_child(info, &right);
- if (r)
- return r;
+ exit_child(info, &left);
+ exit_child(info, &center);
+ exit_child(info, &right);
return 0;
}
@@ -433,9 +417,7 @@ static int rebalance_children(struct shadow_spine *s,
memcpy(n, dm_block_data(child),
dm_bm_block_size(dm_tm_get_bm(info->tm)));
- r = dm_tm_unlock(info->tm, child);
- if (r)
- return r;
+ dm_tm_unlock(info->tm, child);
dm_tm_dec(info->tm, dm_block_location(child));
return 0;
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index 0dee514ba..b27b8091a 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -117,9 +117,9 @@ int new_block(struct dm_btree_info *info, struct dm_block **result)
return dm_tm_new_block(info->tm, &btree_node_validator, result);
}
-int unlock_block(struct dm_btree_info *info, struct dm_block *b)
+void unlock_block(struct dm_btree_info *info, struct dm_block *b)
{
- return dm_tm_unlock(info->tm, b);
+ dm_tm_unlock(info->tm, b);
}
/*----------------------------------------------------------------*/
@@ -137,9 +137,7 @@ int exit_ro_spine(struct ro_spine *s)
int r = 0, i;
for (i = 0; i < s->count; i++) {
- int r2 = unlock_block(s->info, s->nodes[i]);
- if (r2 < 0)
- r = r2;
+ unlock_block(s->info, s->nodes[i]);
}
return r;
@@ -150,9 +148,7 @@ int ro_step(struct ro_spine *s, dm_block_t new_child)
int r;
if (s->count == 2) {
- r = unlock_block(s->info, s->nodes[0]);
- if (r < 0)
- return r;
+ unlock_block(s->info, s->nodes[0]);
s->nodes[0] = s->nodes[1];
s->count--;
}
@@ -194,9 +190,7 @@ int exit_shadow_spine(struct shadow_spine *s)
int r = 0, i;
for (i = 0; i < s->count; i++) {
- int r2 = unlock_block(s->info, s->nodes[i]);
- if (r2 < 0)
- r = r2;
+ unlock_block(s->info, s->nodes[i]);
}
return r;
@@ -208,9 +202,7 @@ int shadow_step(struct shadow_spine *s, dm_block_t b,
int r;
if (s->count == 2) {
- r = unlock_block(s->info, s->nodes[0]);
- if (r < 0)
- return r;
+ unlock_block(s->info, s->nodes[0]);
s->nodes[0] = s->nodes[1];
s->count--;
}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 0e09aef43..b1ced58eb 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -63,6 +63,11 @@ int lower_bound(struct btree_node *n, uint64_t key)
return bsearch(n, key, 0);
}
+static int upper_bound(struct btree_node *n, uint64_t key)
+{
+ return bsearch(n, key, 1);
+}
+
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt)
{
@@ -141,7 +146,9 @@ int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
n->header.value_size = cpu_to_le32(info->value_type.size);
*root = dm_block_location(b);
- return unlock_block(info, b);
+ unlock_block(info, b);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
@@ -250,6 +257,16 @@ static void pop_frame(struct del_stack *s)
dm_tm_unlock(s->tm, f->b);
}
+static void unlock_all_frames(struct del_stack *s)
+{
+ struct frame *f;
+
+ while (unprocessed_frames(s)) {
+ f = s->spine + s->top--;
+ dm_tm_unlock(s->tm, f->b);
+ }
+}
+
int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
int r;
@@ -306,9 +323,13 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
pop_frame(s);
}
}
-
out:
+ if (r) {
+ /* cleanup all frames of del_stack */
+ unlock_all_frames(s);
+ }
kfree(s);
+
return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
@@ -390,6 +411,82 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
+static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
+ uint64_t key, uint64_t *rkey, void *value_le)
+{
+ int r, i;
+ uint32_t flags, nr_entries;
+ struct dm_block *node;
+ struct btree_node *n;
+
+ r = bn_read_lock(info, root, &node);
+ if (r)
+ return r;
+
+ n = dm_block_data(node);
+ flags = le32_to_cpu(n->header.flags);
+ nr_entries = le32_to_cpu(n->header.nr_entries);
+
+ if (flags & INTERNAL_NODE) {
+ i = lower_bound(n, key);
+ if (i < 0 || i >= nr_entries) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
+ if (r == -ENODATA && i < (nr_entries - 1)) {
+ i++;
+ r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
+ }
+
+ } else {
+ i = upper_bound(n, key);
+ if (i < 0 || i >= nr_entries) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ *rkey = le64_to_cpu(n->keys[i]);
+ memcpy(value_le, value_ptr(n, i), info->value_type.size);
+ }
+out:
+ dm_tm_unlock(info->tm, node);
+ return r;
+}
+
+int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t *rkey, void *value_le)
+{
+ unsigned level;
+ int r = -ENODATA;
+ __le64 internal_value_le;
+ struct ro_spine spine;
+
+ init_ro_spine(&spine, info);
+ for (level = 0; level < info->levels - 1u; level++) {
+ r = btree_lookup_raw(&spine, root, keys[level],
+ lower_bound, rkey,
+ &internal_value_le, sizeof(uint64_t));
+ if (r)
+ goto out;
+
+ if (*rkey != keys[level]) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ root = le64_to_cpu(internal_value_le);
+ }
+
+ r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
+out:
+ exit_ro_spine(&spine);
+ return r;
+}
+
+EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
+
/*
* Splits a node by creating a sibling node and shifting half the nodes
* contents across. Assumes there is a parent node, and it has room for
@@ -471,8 +568,10 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
r = insert_at(sizeof(__le64), pn, parent_index + 1,
le64_to_cpu(rn->keys[0]), &location);
- if (r)
+ if (r) {
+ unlock_block(s->info, right);
return r;
+ }
if (key < le64_to_cpu(rn->keys[0])) {
unlock_block(s->info, right);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index 11d8cf786..c74301fa5 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -110,6 +110,13 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, void *value_le);
/*
+ * Tries to find the first key where the bottom level key is >= to that
+ * given. Useful for skipping empty sections of the btree.
+ */
+int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t *rkey, void *value_le);
+
+/*
* Insertion (or overwrite an existing value). O(ln(n))
*/
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
@@ -135,9 +142,10 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root);
/*
- * Removes values between 'keys' and keys2, where keys2 is keys with the
- * final key replaced with 'end_key'. 'end_key' is the one-past-the-end
- * value. 'keys' may be altered.
+ * Removes a _contiguous_ run of values starting from 'keys' and not
+ * reaching keys2 (where keys2 is keys with the final key replaced with
+ * 'end_key'). 'end_key' is the one-past-the-end value. 'keys' may be
+ * altered.
*/
int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, uint64_t end_key,
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index aacbe70c2..306d2e450 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -259,9 +259,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
idx.blocknr = cpu_to_le64(dm_block_location(b));
- r = dm_tm_unlock(ll->tm, b);
- if (r < 0)
- return r;
+ dm_tm_unlock(ll->tm, b);
idx.nr_free = cpu_to_le32(ll->entries_per_block);
idx.none_free_before = 0;
@@ -293,7 +291,9 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
*result = sm_lookup_bitmap(dm_bitmap_data(blk), b);
- return dm_tm_unlock(ll->tm, blk);
+ dm_tm_unlock(ll->tm, blk);
+
+ return 0;
}
static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b,
@@ -373,9 +373,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
return r;
}
- r = dm_tm_unlock(ll->tm, blk);
- if (r < 0)
- return r;
+ dm_tm_unlock(ll->tm, blk);
*result = i * ll->entries_per_block + (dm_block_t) position;
return 0;
@@ -429,9 +427,7 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
if (ref_count <= 2) {
sm_set_bitmap(bm_le, bit, ref_count);
- r = dm_tm_unlock(ll->tm, nb);
- if (r < 0)
- return r;
+ dm_tm_unlock(ll->tm, nb);
if (old > 2) {
r = dm_btree_remove(&ll->ref_count_info,
@@ -445,9 +441,7 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
__le32 le_rc = cpu_to_le32(ref_count);
sm_set_bitmap(bm_le, bit, 3);
- r = dm_tm_unlock(ll->tm, nb);
- if (r < 0)
- return r;
+ dm_tm_unlock(ll->tm, nb);
__dm_bless_for_disk(&le_rc);
r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
@@ -556,7 +550,9 @@ static int metadata_ll_init_index(struct ll_disk *ll)
memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
ll->bitmap_root = dm_block_location(b);
- return dm_tm_unlock(ll->tm, b);
+ dm_tm_unlock(ll->tm, b);
+
+ return 0;
}
static int metadata_ll_open(struct ll_disk *ll)
@@ -570,7 +566,9 @@ static int metadata_ll_open(struct ll_disk *ll)
return r;
memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le));
- return dm_tm_unlock(ll->tm, block);
+ dm_tm_unlock(ll->tm, block);
+
+ return 0;
}
static dm_block_t metadata_ll_max_entries(struct ll_disk *ll)
@@ -590,7 +588,9 @@ static int metadata_ll_commit(struct ll_disk *ll)
memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
ll->bitmap_root = dm_block_location(b);
- return dm_tm_unlock(ll->tm, b);
+ dm_tm_unlock(ll->tm, b);
+
+ return 0;
}
int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm)
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 53091295f..fca6dbcf9 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -136,7 +136,7 @@ static int brb_push(struct bop_ring_buffer *brb,
return 0;
}
-static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result)
{
struct block_op *bop;
@@ -147,6 +147,17 @@ static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
result->type = bop->type;
result->block = bop->block;
+ return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb)
+{
+ struct block_op *bop;
+
+ if (brb_empty(brb))
+ return -ENODATA;
+
+ bop = brb->bops + brb->begin;
brb->begin = brb_next(brb, brb->begin);
return 0;
@@ -211,7 +222,7 @@ static int apply_bops(struct sm_metadata *smm)
while (!brb_empty(&smm->uncommitted)) {
struct block_op bop;
- r = brb_pop(&smm->uncommitted, &bop);
+ r = brb_peek(&smm->uncommitted, &bop);
if (r) {
DMERR("bug in bop ring buffer");
break;
@@ -220,6 +231,8 @@ static int apply_bops(struct sm_metadata *smm)
r = commit_bop(smm, &bop);
if (r)
break;
+
+ brb_pop(&smm->uncommitted);
}
return r;
@@ -683,7 +696,6 @@ static struct dm_space_map bootstrap_ops = {
static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
{
int r, i;
- enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
dm_block_t old_len = smm->ll.nr_blocks;
@@ -705,11 +717,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
* allocate any new blocks.
*/
do {
- for (i = old_len; !r && i < smm->begin; i++) {
- r = sm_ll_inc(&smm->ll, i, &ev);
- if (r)
- goto out;
- }
+ for (i = old_len; !r && i < smm->begin; i++)
+ r = add_bop(smm, BOP_INC, i);
+
+ if (r)
+ goto out;
+
old_len = smm->begin;
r = apply_bops(smm);
@@ -754,7 +767,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
{
int r;
dm_block_t i;
- enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
smm->begin = superblock + 1;
@@ -782,7 +794,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
* allocated blocks that they were built from.
*/
for (i = superblock; !r && i < smm->begin; i++)
- r = sm_ll_inc(&smm->ll, i, &ev);
+ r = add_bop(smm, BOP_INC, i);
if (r)
return r;
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 9cb797d80..abe2c5dd0 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -342,9 +342,9 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
}
EXPORT_SYMBOL_GPL(dm_tm_read_lock);
-int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
+void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
{
- return dm_bm_unlock(b);
+ dm_bm_unlock(b);
}
EXPORT_SYMBOL_GPL(dm_tm_unlock);
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index 2e0d4d66f..f3a18be68 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -94,7 +94,7 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
struct dm_block_validator *v,
struct dm_block **result);
-int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
+void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
/*
* Functions for altering the reference count of a block directly.
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d9d031ede..e2169ff6e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
+#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
@@ -1590,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
+ /*
+ * find the disk ... but prefer rdev->saved_raid_disk
+ * if possible.
+ */
+ if (rdev->saved_raid_disk >= 0 &&
+ rdev->saved_raid_disk >= first &&
+ conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+ first = last = rdev->saved_raid_disk;
+
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
@@ -1621,7 +1632,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
}
+ mddev_suspend(mddev);
md_integrity_add_rdev(rdev, mddev);
+ mddev_resume(mddev);
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
print_conf(conf);
@@ -2493,6 +2506,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bitmap_close_sync(mddev->bitmap);
close_sync(conf);
+
+ if (mddev_is_clustered(mddev)) {
+ conf->cluster_sync_low = 0;
+ conf->cluster_sync_high = 0;
+ }
return 0;
}
@@ -2513,7 +2531,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
return sync_blocks;
}
- bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ /* we are incrementing sector_nr below. To be safe, we check against
+ * sector_nr + two times RESYNC_SECTORS
+ */
+
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+ mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
raise_barrier(conf, sector_nr);
@@ -2704,6 +2727,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bio_full:
r1_bio->sectors = nr_sectors;
+ if (mddev_is_clustered(mddev) &&
+ conf->cluster_sync_high < sector_nr + nr_sectors) {
+ conf->cluster_sync_low = mddev->curr_resync_completed;
+ conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
+ /* Send resync message */
+ md_cluster_ops->resync_info_update(mddev,
+ conf->cluster_sync_low,
+ conf->cluster_sync_high);
+ }
+
/* For a user-requested sync, we read all readable devices and do a
* compare
*/
@@ -3018,9 +3051,11 @@ static int raid1_reshape(struct mddev *mddev)
return -EINVAL;
}
- err = md_allow_write(mddev);
- if (err)
- return err;
+ if (!mddev_is_clustered(mddev)) {
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
+ }
raid_disks = mddev->raid_disks + mddev->delta_disks;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index c52d7139c..61c39b390 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -111,6 +111,13 @@ struct r1conf {
* the new thread here until we fully activate the array.
*/
struct md_thread *thread;
+
+ /* Keep track of cluster resync window to send to other
+ * nodes.
+ */
+ sector_t cluster_sync_low;
+ sector_t cluster_sync_high;
+
};
/*
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 96f365968..84e597e1c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1739,7 +1739,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
+ mddev_suspend(mddev);
md_integrity_add_rdev(rdev, mddev);
+ mddev_resume(mddev);
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
@@ -1944,6 +1946,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
first = i;
fbio = r10_bio->devs[i].bio;
+ fbio->bi_iter.bi_size = r10_bio->sectors << 9;
+ fbio->bi_iter.bi_idx = 0;
vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
/* now find blocks with errors */
@@ -1987,7 +1991,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
bio_reset(tbio);
tbio->bi_vcnt = vcnt;
- tbio->bi_iter.bi_size = r10_bio->sectors << 9;
+ tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
tbio->bi_rw = WRITE;
tbio->bi_private = r10_bio;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
@@ -3147,7 +3151,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* resync. Schedule a read for every block at this virt offset */
int count = 0;
- bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
new file mode 100644
index 000000000..b887e04d7
--- /dev/null
+++ b/drivers/md/raid5-cache.c
@@ -0,0 +1,1191 @@
+/*
+ * Copyright (C) 2015 Shaohua Li <shli@fb.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/raid/md_p.h>
+#include <linux/crc32c.h>
+#include <linux/random.h>
+#include "md.h"
+#include "raid5.h"
+
+/*
+ * metadata/data stored in disk with 4k size unit (a block) regardless
+ * underneath hardware sector size. only works with PAGE_SIZE == 4096
+ */
+#define BLOCK_SECTORS (8)
+
+/*
+ * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent
+ * recovery scans a very long log
+ */
+#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
+#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
+
+struct r5l_log {
+ struct md_rdev *rdev;
+
+ u32 uuid_checksum;
+
+ sector_t device_size; /* log device size, round to
+ * BLOCK_SECTORS */
+ sector_t max_free_space; /* reclaim run if free space is at
+ * this size */
+
+ sector_t last_checkpoint; /* log tail. where recovery scan
+ * starts from */
+ u64 last_cp_seq; /* log tail sequence */
+
+ sector_t log_start; /* log head. where new data appends */
+ u64 seq; /* log head sequence */
+
+ sector_t next_checkpoint;
+ u64 next_cp_seq;
+
+ struct mutex io_mutex;
+ struct r5l_io_unit *current_io; /* current io_unit accepting new data */
+
+ spinlock_t io_list_lock;
+ struct list_head running_ios; /* io_units which are still running,
+ * and have not yet been completely
+ * written to the log */
+ struct list_head io_end_ios; /* io_units which have been completely
+ * written to the log but not yet written
+ * to the RAID */
+ struct list_head flushing_ios; /* io_units which are waiting for log
+ * cache flush */
+ struct list_head finished_ios; /* io_units which settle down in log disk */
+ struct bio flush_bio;
+
+ struct kmem_cache *io_kc;
+
+ struct md_thread *reclaim_thread;
+ unsigned long reclaim_target; /* number of space that need to be
+ * reclaimed. if it's 0, reclaim spaces
+ * used by io_units which are in
+ * IO_UNIT_STRIPE_END state (eg, reclaim
+ * dones't wait for specific io_unit
+ * switching to IO_UNIT_STRIPE_END
+ * state) */
+ wait_queue_head_t iounit_wait;
+
+ struct list_head no_space_stripes; /* pending stripes, log has no space */
+ spinlock_t no_space_stripes_lock;
+
+ bool need_cache_flush;
+ bool in_teardown;
+};
+
+/*
+ * an IO range starts from a meta data block and end at the next meta data
+ * block. The io unit's the meta data block tracks data/parity followed it. io
+ * unit is written to log disk with normal write, as we always flush log disk
+ * first and then start move data to raid disks, there is no requirement to
+ * write io unit with FLUSH/FUA
+ */
+struct r5l_io_unit {
+ struct r5l_log *log;
+
+ struct page *meta_page; /* store meta block */
+ int meta_offset; /* current offset in meta_page */
+
+ struct bio *current_bio;/* current_bio accepting new data */
+
+ atomic_t pending_stripe;/* how many stripes not flushed to raid */
+ u64 seq; /* seq number of the metablock */
+ sector_t log_start; /* where the io_unit starts */
+ sector_t log_end; /* where the io_unit ends */
+ struct list_head log_sibling; /* log->running_ios */
+ struct list_head stripe_list; /* stripes added to the io_unit */
+
+ int state;
+ bool need_split_bio;
+};
+
+/* r5l_io_unit state */
+enum r5l_io_unit_state {
+ IO_UNIT_RUNNING = 0, /* accepting new IO */
+ IO_UNIT_IO_START = 1, /* io_unit bio start writing to log,
+ * don't accepting new bio */
+ IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */
+ IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
+};
+
+static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
+{
+ start += inc;
+ if (start >= log->device_size)
+ start = start - log->device_size;
+ return start;
+}
+
+static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
+ sector_t end)
+{
+ if (end >= start)
+ return end - start;
+ else
+ return end + log->device_size - start;
+}
+
+static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
+{
+ sector_t used_size;
+
+ used_size = r5l_ring_distance(log, log->last_checkpoint,
+ log->log_start);
+
+ return log->device_size > used_size + size;
+}
+
+static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
+{
+ __free_page(io->meta_page);
+ kmem_cache_free(log->io_kc, io);
+}
+
+static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
+ enum r5l_io_unit_state state)
+{
+ struct r5l_io_unit *io;
+
+ while (!list_empty(from)) {
+ io = list_first_entry(from, struct r5l_io_unit, log_sibling);
+ /* don't change list order */
+ if (io->state >= state)
+ list_move_tail(&io->log_sibling, to);
+ else
+ break;
+ }
+}
+
+static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
+ enum r5l_io_unit_state state)
+{
+ if (WARN_ON(io->state >= state))
+ return;
+ io->state = state;
+}
+
+static void r5l_io_run_stripes(struct r5l_io_unit *io)
+{
+ struct stripe_head *sh, *next;
+
+ list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
+ list_del_init(&sh->log_list);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ raid5_release_stripe(sh);
+ }
+}
+
+static void r5l_log_run_stripes(struct r5l_log *log)
+{
+ struct r5l_io_unit *io, *next;
+
+ assert_spin_locked(&log->io_list_lock);
+
+ list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
+ /* don't change list order */
+ if (io->state < IO_UNIT_IO_END)
+ break;
+
+ list_move_tail(&io->log_sibling, &log->finished_ios);
+ r5l_io_run_stripes(io);
+ }
+}
+
+static void r5l_log_endio(struct bio *bio)
+{
+ struct r5l_io_unit *io = bio->bi_private;
+ struct r5l_log *log = io->log;
+ unsigned long flags;
+
+ if (bio->bi_error)
+ md_error(log->rdev->mddev, log->rdev);
+
+ bio_put(bio);
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
+ if (log->need_cache_flush)
+ r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
+ IO_UNIT_IO_END);
+ else
+ r5l_log_run_stripes(log);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+ if (log->need_cache_flush)
+ md_wakeup_thread(log->rdev->mddev->thread);
+}
+
+static void r5l_submit_current_io(struct r5l_log *log)
+{
+ struct r5l_io_unit *io = log->current_io;
+ struct r5l_meta_block *block;
+ unsigned long flags;
+ u32 crc;
+
+ if (!io)
+ return;
+
+ block = page_address(io->meta_page);
+ block->meta_size = cpu_to_le32(io->meta_offset);
+ crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
+ block->checksum = cpu_to_le32(crc);
+
+ log->current_io = NULL;
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+ submit_bio(WRITE, io->current_bio);
+}
+
+static struct bio *r5l_bio_alloc(struct r5l_log *log)
+{
+ struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
+
+ bio->bi_rw = WRITE;
+ bio->bi_bdev = log->rdev->bdev;
+ bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
+
+ return bio;
+}
+
+static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
+{
+ log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
+
+ /*
+ * If we filled up the log device start from the beginning again,
+ * which will require a new bio.
+ *
+ * Note: for this to work properly the log size needs to me a multiple
+ * of BLOCK_SECTORS.
+ */
+ if (log->log_start == 0)
+ io->need_split_bio = true;
+
+ io->log_end = log->log_start;
+}
+
+static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
+{
+ struct r5l_io_unit *io;
+ struct r5l_meta_block *block;
+
+ /* We can't handle memory allocate failure so far */
+ io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
+ io->log = log;
+ INIT_LIST_HEAD(&io->log_sibling);
+ INIT_LIST_HEAD(&io->stripe_list);
+ io->state = IO_UNIT_RUNNING;
+
+ io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
+ block = page_address(io->meta_page);
+ block->magic = cpu_to_le32(R5LOG_MAGIC);
+ block->version = R5LOG_VERSION;
+ block->seq = cpu_to_le64(log->seq);
+ block->position = cpu_to_le64(log->log_start);
+
+ io->log_start = log->log_start;
+ io->meta_offset = sizeof(struct r5l_meta_block);
+ io->seq = log->seq++;
+
+ io->current_bio = r5l_bio_alloc(log);
+ io->current_bio->bi_end_io = r5l_log_endio;
+ io->current_bio->bi_private = io;
+ bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
+
+ r5_reserve_log_entry(log, io);
+
+ spin_lock_irq(&log->io_list_lock);
+ list_add_tail(&io->log_sibling, &log->running_ios);
+ spin_unlock_irq(&log->io_list_lock);
+
+ return io;
+}
+
+static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
+{
+ if (log->current_io &&
+ log->current_io->meta_offset + payload_size > PAGE_SIZE)
+ r5l_submit_current_io(log);
+
+ if (!log->current_io)
+ log->current_io = r5l_new_meta(log);
+ return 0;
+}
+
+static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
+ sector_t location,
+ u32 checksum1, u32 checksum2,
+ bool checksum2_valid)
+{
+ struct r5l_io_unit *io = log->current_io;
+ struct r5l_payload_data_parity *payload;
+
+ payload = page_address(io->meta_page) + io->meta_offset;
+ payload->header.type = cpu_to_le16(type);
+ payload->header.flags = cpu_to_le16(0);
+ payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
+ (PAGE_SHIFT - 9));
+ payload->location = cpu_to_le64(location);
+ payload->checksum[0] = cpu_to_le32(checksum1);
+ if (checksum2_valid)
+ payload->checksum[1] = cpu_to_le32(checksum2);
+
+ io->meta_offset += sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) * (1 + !!checksum2_valid);
+}
+
+static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
+{
+ struct r5l_io_unit *io = log->current_io;
+
+ if (io->need_split_bio) {
+ struct bio *prev = io->current_bio;
+
+ io->current_bio = r5l_bio_alloc(log);
+ bio_chain(io->current_bio, prev);
+
+ submit_bio(WRITE, prev);
+ }
+
+ if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
+ BUG();
+
+ r5_reserve_log_entry(log, io);
+}
+
+static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
+ int data_pages, int parity_pages)
+{
+ int i;
+ int meta_size;
+ struct r5l_io_unit *io;
+
+ meta_size =
+ ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
+ * data_pages) +
+ sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) * parity_pages;
+
+ r5l_get_meta(log, meta_size);
+ io = log->current_io;
+
+ for (i = 0; i < sh->disks; i++) {
+ if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+ continue;
+ if (i == sh->pd_idx || i == sh->qd_idx)
+ continue;
+ r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
+ raid5_compute_blocknr(sh, i, 0),
+ sh->dev[i].log_checksum, 0, false);
+ r5l_append_payload_page(log, sh->dev[i].page);
+ }
+
+ if (sh->qd_idx >= 0) {
+ r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
+ sh->sector, sh->dev[sh->pd_idx].log_checksum,
+ sh->dev[sh->qd_idx].log_checksum, true);
+ r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
+ r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
+ } else {
+ r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
+ sh->sector, sh->dev[sh->pd_idx].log_checksum,
+ 0, false);
+ r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
+ }
+
+ list_add_tail(&sh->log_list, &io->stripe_list);
+ atomic_inc(&io->pending_stripe);
+ sh->log_io = io;
+}
+
+static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+/*
+ * running in raid5d, where reclaim could wait for raid5d too (when it flushes
+ * data from log to raid disks), so we shouldn't wait for reclaim here
+ */
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+ int write_disks = 0;
+ int data_pages, parity_pages;
+ int meta_size;
+ int reserve;
+ int i;
+
+ if (!log)
+ return -EAGAIN;
+ /* Don't support stripe batch */
+ if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+ test_bit(STRIPE_SYNCING, &sh->state)) {
+ /* the stripe is written to log, we start writing it to raid */
+ clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ return -EAGAIN;
+ }
+
+ for (i = 0; i < sh->disks; i++) {
+ void *addr;
+
+ if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+ continue;
+ write_disks++;
+ /* checksum is already calculated in last run */
+ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
+ continue;
+ addr = kmap_atomic(sh->dev[i].page);
+ sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
+ addr, PAGE_SIZE);
+ kunmap_atomic(addr);
+ }
+ parity_pages = 1 + !!(sh->qd_idx >= 0);
+ data_pages = write_disks - parity_pages;
+
+ meta_size =
+ ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
+ * data_pages) +
+ sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) * parity_pages;
+ /* Doesn't work with very big raid array */
+ if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
+ return -EINVAL;
+
+ set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ /*
+ * The stripe must enter state machine again to finish the write, so
+ * don't delay.
+ */
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ atomic_inc(&sh->count);
+
+ mutex_lock(&log->io_mutex);
+ /* meta + data */
+ reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
+ if (r5l_has_free_space(log, reserve))
+ r5l_log_stripe(log, sh, data_pages, parity_pages);
+ else {
+ spin_lock(&log->no_space_stripes_lock);
+ list_add_tail(&sh->log_list, &log->no_space_stripes);
+ spin_unlock(&log->no_space_stripes_lock);
+
+ r5l_wake_reclaim(log, reserve);
+ }
+ mutex_unlock(&log->io_mutex);
+
+ return 0;
+}
+
+void r5l_write_stripe_run(struct r5l_log *log)
+{
+ if (!log)
+ return;
+ mutex_lock(&log->io_mutex);
+ r5l_submit_current_io(log);
+ mutex_unlock(&log->io_mutex);
+}
+
+int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
+{
+ if (!log)
+ return -ENODEV;
+ /*
+ * we flush log disk cache first, then write stripe data to raid disks.
+ * So if bio is finished, the log disk cache is flushed already. The
+ * recovery guarantees we can recovery the bio from log disk, so we
+ * don't need to flush again
+ */
+ if (bio->bi_iter.bi_size == 0) {
+ bio_endio(bio);
+ return 0;
+ }
+ bio->bi_rw &= ~REQ_FLUSH;
+ return -EAGAIN;
+}
+
+/* This will run after log space is reclaimed */
+static void r5l_run_no_space_stripes(struct r5l_log *log)
+{
+ struct stripe_head *sh;
+
+ spin_lock(&log->no_space_stripes_lock);
+ while (!list_empty(&log->no_space_stripes)) {
+ sh = list_first_entry(&log->no_space_stripes,
+ struct stripe_head, log_list);
+ list_del_init(&sh->log_list);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ raid5_release_stripe(sh);
+ }
+ spin_unlock(&log->no_space_stripes_lock);
+}
+
+static sector_t r5l_reclaimable_space(struct r5l_log *log)
+{
+ return r5l_ring_distance(log, log->last_checkpoint,
+ log->next_checkpoint);
+}
+
+static bool r5l_complete_finished_ios(struct r5l_log *log)
+{
+ struct r5l_io_unit *io, *next;
+ bool found = false;
+
+ assert_spin_locked(&log->io_list_lock);
+
+ list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
+ /* don't change list order */
+ if (io->state < IO_UNIT_STRIPE_END)
+ break;
+
+ log->next_checkpoint = io->log_start;
+ log->next_cp_seq = io->seq;
+
+ list_del(&io->log_sibling);
+ r5l_free_io_unit(log, io);
+
+ found = true;
+ }
+
+ return found;
+}
+
+static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
+{
+ struct r5l_log *log = io->log;
+ unsigned long flags;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
+
+ if (!r5l_complete_finished_ios(log)) {
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ return;
+ }
+
+ if (r5l_reclaimable_space(log) > log->max_free_space)
+ r5l_wake_reclaim(log, 0);
+
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ wake_up(&log->iounit_wait);
+}
+
+void r5l_stripe_write_finished(struct stripe_head *sh)
+{
+ struct r5l_io_unit *io;
+
+ io = sh->log_io;
+ sh->log_io = NULL;
+
+ if (io && atomic_dec_and_test(&io->pending_stripe))
+ __r5l_stripe_write_finished(io);
+}
+
+static void r5l_log_flush_endio(struct bio *bio)
+{
+ struct r5l_log *log = container_of(bio, struct r5l_log,
+ flush_bio);
+ unsigned long flags;
+ struct r5l_io_unit *io;
+
+ if (bio->bi_error)
+ md_error(log->rdev->mddev, log->rdev);
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ list_for_each_entry(io, &log->flushing_ios, log_sibling)
+ r5l_io_run_stripes(io);
+ list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+/*
+ * Starting dispatch IO to raid.
+ * io_unit(meta) consists of a log. There is one situation we want to avoid. A
+ * broken meta in the middle of a log causes recovery can't find meta at the
+ * head of log. If operations require meta at the head persistent in log, we
+ * must make sure meta before it persistent in log too. A case is:
+ *
+ * stripe data/parity is in log, we start write stripe to raid disks. stripe
+ * data/parity must be persistent in log before we do the write to raid disks.
+ *
+ * The solution is we restrictly maintain io_unit list order. In this case, we
+ * only write stripes of an io_unit to raid disks till the io_unit is the first
+ * one whose data/parity is in log.
+ */
+void r5l_flush_stripe_to_raid(struct r5l_log *log)
+{
+ bool do_flush;
+
+ if (!log || !log->need_cache_flush)
+ return;
+
+ spin_lock_irq(&log->io_list_lock);
+ /* flush bio is running */
+ if (!list_empty(&log->flushing_ios)) {
+ spin_unlock_irq(&log->io_list_lock);
+ return;
+ }
+ list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
+ do_flush = !list_empty(&log->flushing_ios);
+ spin_unlock_irq(&log->io_list_lock);
+
+ if (!do_flush)
+ return;
+ bio_reset(&log->flush_bio);
+ log->flush_bio.bi_bdev = log->rdev->bdev;
+ log->flush_bio.bi_end_io = r5l_log_flush_endio;
+ submit_bio(WRITE_FLUSH, &log->flush_bio);
+}
+
+static void r5l_write_super(struct r5l_log *log, sector_t cp);
+static void r5l_write_super_and_discard_space(struct r5l_log *log,
+ sector_t end)
+{
+ struct block_device *bdev = log->rdev->bdev;
+ struct mddev *mddev;
+
+ r5l_write_super(log, end);
+
+ if (!blk_queue_discard(bdev_get_queue(bdev)))
+ return;
+
+ mddev = log->rdev->mddev;
+ /*
+ * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
+ * wait for this thread to finish. This thread waits for
+ * MD_CHANGE_PENDING clear, which is supposed to be done in
+ * md_check_recovery(). md_check_recovery() tries to get
+ * reconfig_mutex. Since r5l_quiesce already holds the mutex,
+ * md_check_recovery() fails, so the PENDING never get cleared. The
+ * in_teardown check workaround this issue.
+ */
+ if (!log->in_teardown) {
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
+ log->in_teardown);
+ /*
+ * r5l_quiesce could run after in_teardown check and hold
+ * mutex first. Superblock might get updated twice.
+ */
+ if (log->in_teardown)
+ md_update_sb(mddev, 1);
+ } else {
+ WARN_ON(!mddev_is_locked(mddev));
+ md_update_sb(mddev, 1);
+ }
+
+ /* discard IO error really doesn't matter, ignore it */
+ if (log->last_checkpoint < end) {
+ blkdev_issue_discard(bdev,
+ log->last_checkpoint + log->rdev->data_offset,
+ end - log->last_checkpoint, GFP_NOIO, 0);
+ } else {
+ blkdev_issue_discard(bdev,
+ log->last_checkpoint + log->rdev->data_offset,
+ log->device_size - log->last_checkpoint,
+ GFP_NOIO, 0);
+ blkdev_issue_discard(bdev, log->rdev->data_offset, end,
+ GFP_NOIO, 0);
+ }
+}
+
+
+static void r5l_do_reclaim(struct r5l_log *log)
+{
+ sector_t reclaim_target = xchg(&log->reclaim_target, 0);
+ sector_t reclaimable;
+ sector_t next_checkpoint;
+ u64 next_cp_seq;
+
+ spin_lock_irq(&log->io_list_lock);
+ /*
+ * move proper io_unit to reclaim list. We should not change the order.
+ * reclaimable/unreclaimable io_unit can be mixed in the list, we
+ * shouldn't reuse space of an unreclaimable io_unit
+ */
+ while (1) {
+ reclaimable = r5l_reclaimable_space(log);
+ if (reclaimable >= reclaim_target ||
+ (list_empty(&log->running_ios) &&
+ list_empty(&log->io_end_ios) &&
+ list_empty(&log->flushing_ios) &&
+ list_empty(&log->finished_ios)))
+ break;
+
+ md_wakeup_thread(log->rdev->mddev->thread);
+ wait_event_lock_irq(log->iounit_wait,
+ r5l_reclaimable_space(log) > reclaimable,
+ log->io_list_lock);
+ }
+
+ next_checkpoint = log->next_checkpoint;
+ next_cp_seq = log->next_cp_seq;
+ spin_unlock_irq(&log->io_list_lock);
+
+ BUG_ON(reclaimable < 0);
+ if (reclaimable == 0)
+ return;
+
+ /*
+ * write_super will flush cache of each raid disk. We must write super
+ * here, because the log area might be reused soon and we don't want to
+ * confuse recovery
+ */
+ r5l_write_super_and_discard_space(log, next_checkpoint);
+
+ mutex_lock(&log->io_mutex);
+ log->last_checkpoint = next_checkpoint;
+ log->last_cp_seq = next_cp_seq;
+ mutex_unlock(&log->io_mutex);
+
+ r5l_run_no_space_stripes(log);
+}
+
+static void r5l_reclaim_thread(struct md_thread *thread)
+{
+ struct mddev *mddev = thread->mddev;
+ struct r5conf *conf = mddev->private;
+ struct r5l_log *log = conf->log;
+
+ if (!log)
+ return;
+ r5l_do_reclaim(log);
+}
+
+static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
+{
+ unsigned long target;
+ unsigned long new = (unsigned long)space; /* overflow in theory */
+
+ do {
+ target = log->reclaim_target;
+ if (new < target)
+ return;
+ } while (cmpxchg(&log->reclaim_target, target, new) != target);
+ md_wakeup_thread(log->reclaim_thread);
+}
+
+void r5l_quiesce(struct r5l_log *log, int state)
+{
+ struct mddev *mddev;
+ if (!log || state == 2)
+ return;
+ if (state == 0) {
+ log->in_teardown = 0;
+ log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
+ log->rdev->mddev, "reclaim");
+ } else if (state == 1) {
+ /*
+ * at this point all stripes are finished, so io_unit is at
+ * least in STRIPE_END state
+ */
+ log->in_teardown = 1;
+ /* make sure r5l_write_super_and_discard_space exits */
+ mddev = log->rdev->mddev;
+ wake_up(&mddev->sb_wait);
+ r5l_wake_reclaim(log, -1L);
+ md_unregister_thread(&log->reclaim_thread);
+ r5l_do_reclaim(log);
+ }
+}
+
+bool r5l_log_disk_error(struct r5conf *conf)
+{
+ /* don't allow write if journal disk is missing */
+ if (!conf->log)
+ return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
+ return test_bit(Faulty, &conf->log->rdev->flags);
+}
+
+struct r5l_recovery_ctx {
+ struct page *meta_page; /* current meta */
+ sector_t meta_total_blocks; /* total size of current meta and data */
+ sector_t pos; /* recovery position */
+ u64 seq; /* recovery position seq */
+};
+
+static int r5l_read_meta_block(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ struct page *page = ctx->meta_page;
+ struct r5l_meta_block *mb;
+ u32 crc, stored_crc;
+
+ if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
+ return -EIO;
+
+ mb = page_address(page);
+ stored_crc = le32_to_cpu(mb->checksum);
+ mb->checksum = 0;
+
+ if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
+ le64_to_cpu(mb->seq) != ctx->seq ||
+ mb->version != R5LOG_VERSION ||
+ le64_to_cpu(mb->position) != ctx->pos)
+ return -EINVAL;
+
+ crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ if (stored_crc != crc)
+ return -EINVAL;
+
+ if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
+ return -EINVAL;
+
+ ctx->meta_total_blocks = BLOCK_SECTORS;
+
+ return 0;
+}
+
+static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx,
+ sector_t stripe_sect,
+ int *offset, sector_t *log_offset)
+{
+ struct r5conf *conf = log->rdev->mddev->private;
+ struct stripe_head *sh;
+ struct r5l_payload_data_parity *payload;
+ int disk_index;
+
+ sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
+ while (1) {
+ payload = page_address(ctx->meta_page) + *offset;
+
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
+ raid5_compute_sector(conf,
+ le64_to_cpu(payload->location), 0,
+ &disk_index, sh);
+
+ sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
+ sh->dev[disk_index].page, READ, false);
+ sh->dev[disk_index].log_checksum =
+ le32_to_cpu(payload->checksum[0]);
+ set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
+ ctx->meta_total_blocks += BLOCK_SECTORS;
+ } else {
+ disk_index = sh->pd_idx;
+ sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
+ sh->dev[disk_index].page, READ, false);
+ sh->dev[disk_index].log_checksum =
+ le32_to_cpu(payload->checksum[0]);
+ set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
+
+ if (sh->qd_idx >= 0) {
+ disk_index = sh->qd_idx;
+ sync_page_io(log->rdev,
+ r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
+ PAGE_SIZE, sh->dev[disk_index].page,
+ READ, false);
+ sh->dev[disk_index].log_checksum =
+ le32_to_cpu(payload->checksum[1]);
+ set_bit(R5_Wantwrite,
+ &sh->dev[disk_index].flags);
+ }
+ ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+ }
+
+ *log_offset = r5l_ring_add(log, *log_offset,
+ le32_to_cpu(payload->size));
+ *offset += sizeof(struct r5l_payload_data_parity) +
+ sizeof(__le32) *
+ (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+ if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
+ break;
+ }
+
+ for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+ void *addr;
+ u32 checksum;
+
+ if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
+ continue;
+ addr = kmap_atomic(sh->dev[disk_index].page);
+ checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
+ kunmap_atomic(addr);
+ if (checksum != sh->dev[disk_index].log_checksum)
+ goto error;
+ }
+
+ for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+ struct md_rdev *rdev, *rrdev;
+
+ if (!test_and_clear_bit(R5_Wantwrite,
+ &sh->dev[disk_index].flags))
+ continue;
+
+ /* in case device is broken */
+ rdev = rcu_dereference(conf->disks[disk_index].rdev);
+ if (rdev)
+ sync_page_io(rdev, stripe_sect, PAGE_SIZE,
+ sh->dev[disk_index].page, WRITE, false);
+ rrdev = rcu_dereference(conf->disks[disk_index].replacement);
+ if (rrdev)
+ sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
+ sh->dev[disk_index].page, WRITE, false);
+ }
+ raid5_release_stripe(sh);
+ return 0;
+
+error:
+ for (disk_index = 0; disk_index < sh->disks; disk_index++)
+ sh->dev[disk_index].flags = 0;
+ raid5_release_stripe(sh);
+ return -EINVAL;
+}
+
+static int r5l_recovery_flush_one_meta(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ struct r5conf *conf = log->rdev->mddev->private;
+ struct r5l_payload_data_parity *payload;
+ struct r5l_meta_block *mb;
+ int offset;
+ sector_t log_offset;
+ sector_t stripe_sector;
+
+ mb = page_address(ctx->meta_page);
+ offset = sizeof(struct r5l_meta_block);
+ log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+
+ while (offset < le32_to_cpu(mb->meta_size)) {
+ int dd;
+
+ payload = (void *)mb + offset;
+ stripe_sector = raid5_compute_sector(conf,
+ le64_to_cpu(payload->location), 0, &dd, NULL);
+ if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
+ &offset, &log_offset))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* copy data/parity from log to raid disks */
+static void r5l_recovery_flush_log(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ while (1) {
+ if (r5l_read_meta_block(log, ctx))
+ return;
+ if (r5l_recovery_flush_one_meta(log, ctx))
+ return;
+ ctx->seq++;
+ ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
+ }
+}
+
+static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
+ u64 seq)
+{
+ struct page *page;
+ struct r5l_meta_block *mb;
+ u32 crc;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+ mb = page_address(page);
+ mb->magic = cpu_to_le32(R5LOG_MAGIC);
+ mb->version = R5LOG_VERSION;
+ mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
+ mb->seq = cpu_to_le64(seq);
+ mb->position = cpu_to_le64(pos);
+ crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ mb->checksum = cpu_to_le32(crc);
+
+ if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
+ __free_page(page);
+ return -EIO;
+ }
+ __free_page(page);
+ return 0;
+}
+
+static int r5l_recovery_log(struct r5l_log *log)
+{
+ struct r5l_recovery_ctx ctx;
+
+ ctx.pos = log->last_checkpoint;
+ ctx.seq = log->last_cp_seq;
+ ctx.meta_page = alloc_page(GFP_KERNEL);
+ if (!ctx.meta_page)
+ return -ENOMEM;
+
+ r5l_recovery_flush_log(log, &ctx);
+ __free_page(ctx.meta_page);
+
+ /*
+ * we did a recovery. Now ctx.pos points to an invalid meta block. New
+ * log will start here. but we can't let superblock point to last valid
+ * meta block. The log might looks like:
+ * | meta 1| meta 2| meta 3|
+ * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
+ * superblock points to meta 1, we write a new valid meta 2n. if crash
+ * happens again, new recovery will start from meta 1. Since meta 2n is
+ * valid now, recovery will think meta 3 is valid, which is wrong.
+ * The solution is we create a new meta in meta2 with its seq == meta
+ * 1's seq + 10 and let superblock points to meta2. The same recovery will
+ * not think meta 3 is a valid meta, because its seq doesn't match
+ */
+ if (ctx.seq > log->last_cp_seq + 1) {
+ int ret;
+
+ ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
+ if (ret)
+ return ret;
+ log->seq = ctx.seq + 11;
+ log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+ r5l_write_super(log, ctx.pos);
+ } else {
+ log->log_start = ctx.pos;
+ log->seq = ctx.seq;
+ }
+ return 0;
+}
+
+static void r5l_write_super(struct r5l_log *log, sector_t cp)
+{
+ struct mddev *mddev = log->rdev->mddev;
+
+ log->rdev->journal_tail = cp;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+}
+
+static int r5l_load_log(struct r5l_log *log)
+{
+ struct md_rdev *rdev = log->rdev;
+ struct page *page;
+ struct r5l_meta_block *mb;
+ sector_t cp = log->rdev->journal_tail;
+ u32 stored_crc, expected_crc;
+ bool create_super = false;
+ int ret;
+
+ /* Make sure it's valid */
+ if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
+ cp = 0;
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
+ ret = -EIO;
+ goto ioerr;
+ }
+ mb = page_address(page);
+
+ if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
+ mb->version != R5LOG_VERSION) {
+ create_super = true;
+ goto create;
+ }
+ stored_crc = le32_to_cpu(mb->checksum);
+ mb->checksum = 0;
+ expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ if (stored_crc != expected_crc) {
+ create_super = true;
+ goto create;
+ }
+ if (le64_to_cpu(mb->position) != cp) {
+ create_super = true;
+ goto create;
+ }
+create:
+ if (create_super) {
+ log->last_cp_seq = prandom_u32();
+ cp = 0;
+ /*
+ * Make sure super points to correct address. Log might have
+ * data very soon. If super hasn't correct log tail address,
+ * recovery can't find the log
+ */
+ r5l_write_super(log, cp);
+ } else
+ log->last_cp_seq = le64_to_cpu(mb->seq);
+
+ log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
+ log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
+ if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
+ log->max_free_space = RECLAIM_MAX_FREE_SPACE;
+ log->last_checkpoint = cp;
+
+ __free_page(page);
+
+ return r5l_recovery_log(log);
+ioerr:
+ __free_page(page);
+ return ret;
+}
+
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+{
+ struct r5l_log *log;
+
+ if (PAGE_SIZE != 4096)
+ return -EINVAL;
+ log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ return -ENOMEM;
+ log->rdev = rdev;
+
+ log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
+
+ log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
+ sizeof(rdev->mddev->uuid));
+
+ mutex_init(&log->io_mutex);
+
+ spin_lock_init(&log->io_list_lock);
+ INIT_LIST_HEAD(&log->running_ios);
+ INIT_LIST_HEAD(&log->io_end_ios);
+ INIT_LIST_HEAD(&log->flushing_ios);
+ INIT_LIST_HEAD(&log->finished_ios);
+ bio_init(&log->flush_bio);
+
+ log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
+ if (!log->io_kc)
+ goto io_kc;
+
+ log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
+ log->rdev->mddev, "reclaim");
+ if (!log->reclaim_thread)
+ goto reclaim_thread;
+ init_waitqueue_head(&log->iounit_wait);
+
+ INIT_LIST_HEAD(&log->no_space_stripes);
+ spin_lock_init(&log->no_space_stripes_lock);
+
+ if (r5l_load_log(log))
+ goto error;
+
+ conf->log = log;
+ return 0;
+error:
+ md_unregister_thread(&log->reclaim_thread);
+reclaim_thread:
+ kmem_cache_destroy(log->io_kc);
+io_kc:
+ kfree(log);
+ return -EINVAL;
+}
+
+void r5l_exit_log(struct r5l_log *log)
+{
+ md_unregister_thread(&log->reclaim_thread);
+ kmem_cache_destroy(log->io_kc);
+ kfree(log);
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 45933c160..704ef7fcf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -353,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf,
struct list_head *list = &temp_inactive_list[size - 1];
/*
- * We don't hold any lock here yet, get_active_stripe() might
+ * We don't hold any lock here yet, raid5_get_active_stripe() might
* remove stripes from the list
*/
if (!list_empty_careful(list)) {
@@ -413,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf,
return count;
}
-static void release_stripe(struct stripe_head *sh)
+void raid5_release_stripe(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
unsigned long flags;
@@ -658,9 +658,9 @@ static int has_failed(struct r5conf *conf)
return 0;
}
-static struct stripe_head *
-get_active_stripe(struct r5conf *conf, sector_t sector,
- int previous, int noblock, int noquiesce)
+struct stripe_head *
+raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
+ int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
int hash = stripe_hash_locks_hash(sector);
@@ -755,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
/* Only freshly new full stripe normal write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{
+ struct r5conf *conf = sh->raid_conf;
+
+ if (conf->log)
+ return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
is_full_stripe_write(sh);
@@ -858,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
unlock_out:
unlock_two_stripes(head, sh);
out:
- release_stripe(head);
+ raid5_release_stripe(head);
}
/* Determine if 'data_offset' or 'new_data_offset' should be used
@@ -895,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
+ if (r5l_write_stripe(conf->log, sh) == 0)
+ return;
for (i = disks; i--; ) {
int rw;
int replace_only = 0;
@@ -1208,7 +1214,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
return_io(&return_bi);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void ops_run_biofill(struct stripe_head *sh)
@@ -1271,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref)
if (sh->check_state == check_state_compute_run)
sh->check_state = check_state_compute_result;
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
/* return a pointer to the address conversion region of the scribble buffer */
@@ -1697,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
}
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void
@@ -1855,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref)
sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
@@ -2017,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
/* we just created an active stripe so... */
atomic_inc(&conf->active_stripes);
- release_stripe(sh);
+ raid5_release_stripe(sh);
conf->max_nr_stripes++;
return 1;
}
@@ -2236,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (!p)
err = -ENOMEM;
}
- release_stripe(nsh);
+ raid5_release_stripe(nsh);
}
/* critical section pass, GFP_NOIO no longer needed */
@@ -2394,7 +2400,7 @@ static void raid5_end_read_request(struct bio * bi)
rdev_dec_pending(rdev, conf->mddev);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void raid5_end_write_request(struct bio *bi)
@@ -2468,14 +2474,12 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
if (sh->batch_head && sh != sh->batch_head)
- release_stripe(sh->batch_head);
+ raid5_release_stripe(sh->batch_head);
}
-static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
@@ -2491,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
dev->rreq.bi_private = sh;
dev->flags = 0;
- dev->sector = compute_blocknr(sh, i, previous);
+ dev->sector = raid5_compute_blocknr(sh, i, previous);
}
static void error(struct mddev *mddev, struct md_rdev *rdev)
@@ -2524,9 +2528,9 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
* Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them.
*/
-static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
- int previous, int *dd_idx,
- struct stripe_head *sh)
+sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+ int previous, int *dd_idx,
+ struct stripe_head *sh)
{
sector_t stripe, stripe2;
sector_t chunk_number;
@@ -2726,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
return new_sector;
}
-static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
+sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
{
struct r5conf *conf = sh->raid_conf;
int raid_disks = sh->disks;
@@ -3098,6 +3102,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
+ r5l_stripe_write_finished(sh);
+
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -3141,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
* the data has not reached the cache yet.
*/
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
+ s->failed > conf->max_degraded &&
(!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags))) {
spin_lock_irq(&sh->stripe_lock);
@@ -3497,6 +3504,9 @@ returnbi:
WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
WARN_ON(dev->page != dev->orig_page);
}
+
+ r5l_stripe_write_finished(sh);
+
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
int hash;
@@ -3939,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
struct stripe_head *sh2;
struct async_submit_ctl submit;
- sector_t bn = compute_blocknr(sh, i, 1);
+ sector_t bn = raid5_compute_blocknr(sh, i, 1);
sector_t s = raid5_compute_sector(conf, bn, 0,
&dd_idx, NULL);
- sh2 = get_active_stripe(conf, s, 0, 1, 1);
+ sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
if (sh2 == NULL)
/* so far only the early blocks of this stripe
* have been requested. When later blocks
@@ -3952,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
/* must have already done this block */
- release_stripe(sh2);
+ raid5_release_stripe(sh2);
continue;
}
@@ -3973,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
set_bit(STRIPE_EXPAND_READY, &sh2->state);
set_bit(STRIPE_HANDLE, &sh2->state);
}
- release_stripe(sh2);
+ raid5_release_stripe(sh2);
}
/* done submitting copies, wait for them to complete */
@@ -4008,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
s->failed_num[0] = -1;
s->failed_num[1] = -1;
+ s->log_failed = r5l_log_disk_error(conf);
/* Now to look around and see what can be done */
rcu_read_lock();
@@ -4259,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
if (handle_flags == 0 ||
sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
spin_lock_irq(&head_sh->stripe_lock);
head_sh->batch_head = NULL;
@@ -4320,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh)
analyse_stripe(sh, &s);
+ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
+ goto finish;
+
if (s.handle_bad_blocks) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
@@ -4348,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh)
/* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
*/
- if (s.failed > conf->max_degraded) {
+ if (s.failed > conf->max_degraded || s.log_failed) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
@@ -4506,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh)
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
struct stripe_head *sh_src
- = get_active_stripe(conf, sh->sector, 1, 1, 1);
+ = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
/* sh cannot be written until sh_src has been read.
* so arrange for sh to be delayed a little
@@ -4516,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
&sh_src->state))
atomic_inc(&conf->preread_active_stripes);
- release_stripe(sh_src);
+ raid5_release_stripe(sh_src);
goto finish;
}
if (sh_src)
- release_stripe(sh_src);
+ raid5_release_stripe(sh_src);
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
@@ -5012,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev,
struct raid5_plug_cb *cb;
if (!blk_cb) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
return;
}
@@ -5028,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev,
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
list_add_tail(&sh->lru, &cb->list);
else
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void make_discard_request(struct mddev *mddev, struct bio *bi)
@@ -5063,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
DEFINE_WAIT(w);
int d;
again:
- sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+ sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
goto again;
}
@@ -5080,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
if (sh->dev[d].towrite || sh->dev[d].toread) {
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
goto again;
}
@@ -5136,8 +5150,15 @@ static void make_request(struct mddev *mddev, struct bio * bi)
bool do_prepare;
if (unlikely(bi->bi_rw & REQ_FLUSH)) {
- md_flush_request(mddev, bi);
- return;
+ int ret = r5l_handle_flush_request(conf->log, bi);
+
+ if (ret == 0)
+ return;
+ if (ret == -ENODEV) {
+ md_flush_request(mddev, bi);
+ return;
+ }
+ /* ret == -EAGAIN, fallback */
}
md_write_start(mddev, bi);
@@ -5210,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
(unsigned long long)new_sector,
(unsigned long long)logical_sector);
- sh = get_active_stripe(conf, new_sector, previous,
+ sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_rw&RWA_MASK), 0);
if (sh) {
if (unlikely(previous)) {
@@ -5231,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
must_retry = 1;
spin_unlock_irq(&conf->device_lock);
if (must_retry) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
@@ -5241,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi)
/* Might have got the wrong stripe_head
* by accident
*/
- release_stripe(sh);
+ raid5_release_stripe(sh);
goto retry;
}
if (rw == WRITE &&
logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
@@ -5271,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* and wait a while
*/
md_wakeup_thread(mddev->thread);
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
@@ -5458,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j;
int skipped_disk = 0;
- sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
+ sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old
@@ -5471,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (conf->level == 6 &&
j == sh->qd_idx)
continue;
- s = compute_blocknr(sh, j, 0);
+ s = raid5_compute_blocknr(sh, j, 0);
if (s < raid5_size(mddev, 0, 0)) {
skipped_disk = 1;
continue;
@@ -5507,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1;
while (first_sector <= last_sector) {
- sh = get_active_stripe(conf, first_sector, 1, 0, 1);
+ sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
first_sector += STRIPE_SECTORS;
}
/* Now that the sources are clearly marked, we can release
@@ -5519,7 +5540,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
while (!list_empty(&stripes)) {
sh = list_entry(stripes.next, struct stripe_head, lru);
list_del_init(&sh->lru);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
/* If this takes us to the resync_max point where we have to pause,
* then we need to write out the superblock.
@@ -5615,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}
- bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
- sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
+ sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
if (sh == NULL) {
- sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
+ sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
/* make sure we don't swamp the stripe cache if someone else
* is trying to get access
*/
@@ -5643,7 +5664,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
return STRIPE_SECTORS;
}
@@ -5682,7 +5703,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
/* already done this stripe */
continue;
- sh = get_active_stripe(conf, sector, 0, 1, 1);
+ sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
if (!sh) {
/* failed to get a stripe - must wait */
@@ -5692,7 +5713,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
return handled;
@@ -5700,7 +5721,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
handle_stripe(sh);
- release_stripe(sh);
+ raid5_release_stripe(sh);
handled++;
}
remaining = raid5_dec_bi_active_stripes(raid_bio);
@@ -5730,8 +5751,12 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
if (!list_empty(temp_inactive_list + i))
break;
- if (i == NR_STRIPE_HASH_LOCKS)
+ if (i == NR_STRIPE_HASH_LOCKS) {
+ spin_unlock_irq(&conf->device_lock);
+ r5l_flush_stripe_to_raid(conf->log);
+ spin_lock_irq(&conf->device_lock);
return batch_size;
+ }
release_inactive = true;
}
spin_unlock_irq(&conf->device_lock);
@@ -5739,6 +5764,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
release_inactive_stripe_list(conf, temp_inactive_list,
NR_STRIPE_HASH_LOCKS);
+ r5l_flush_stripe_to_raid(conf->log);
if (release_inactive) {
spin_lock_irq(&conf->device_lock);
return 0;
@@ -5746,6 +5772,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
+ r5l_write_stripe_run(conf->log);
cond_resched();
@@ -5879,6 +5906,8 @@ static void raid5d(struct md_thread *thread)
mutex_unlock(&conf->cache_size_mutex);
}
+ r5l_flush_stripe_to_raid(conf->log);
+
async_tx_issue_pending_all();
blk_finish_plug(&plug);
@@ -6316,8 +6345,11 @@ static void raid5_free_percpu(struct r5conf *conf)
static void free_conf(struct r5conf *conf)
{
+ if (conf->log)
+ r5l_exit_log(conf->log);
if (conf->shrinker.seeks)
unregister_shrinker(&conf->shrinker);
+
free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
@@ -6530,7 +6562,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
- || raid_disk < 0)
+ || raid_disk < 0 || test_bit(Journal, &rdev->flags))
continue;
disk = conf->disks + raid_disk;
@@ -6650,6 +6682,7 @@ static int run(struct mddev *mddev)
int working_disks = 0;
int dirty_parity_disks = 0;
struct md_rdev *rdev;
+ struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0;
int i;
long long min_offset_diff = 0;
@@ -6662,6 +6695,11 @@ static int run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
+
+ if (test_bit(Journal, &rdev->flags)) {
+ journal_dev = rdev;
+ continue;
+ }
if (rdev->raid_disk < 0)
continue;
diff = (rdev->new_data_offset - rdev->data_offset);
@@ -6695,6 +6733,12 @@ static int run(struct mddev *mddev)
int chunk_sectors;
int new_data_disks;
+ if (journal_dev) {
+ printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
if (mddev->new_level != mddev->level) {
printk(KERN_ERR "md/raid:%s: unsupported reshape "
"required - aborting.\n",
@@ -6770,6 +6814,13 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) {
+ printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n",
+ mdname(mddev));
+ mddev->ro = 1;
+ set_disk_ro(mddev->gendisk, 1);
+ }
+
conf->min_offset_diff = min_offset_diff;
mddev->thread = conf->thread;
conf->thread = NULL;
@@ -6973,6 +7024,14 @@ static int run(struct mddev *mddev)
mddev->queue);
}
+ if (journal_dev) {
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
+ mdname(mddev), bdevname(journal_dev->bdev, b));
+ r5l_init_log(conf, journal_dev);
+ }
+
return 0;
abort:
md_unregister_thread(&mddev->thread);
@@ -7082,6 +7141,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct disk_info *p = conf->disks + number;
print_raid5_conf(conf);
+ if (test_bit(Journal, &rdev->flags)) {
+ /*
+ * journal disk is not removable, but we need give a chance to
+ * update superblock of other disks. Otherwise journal disk
+ * will be considered as 'fresh'
+ */
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ return -EINVAL;
+ }
if (rdev == p->rdev)
rdevp = &p->rdev;
else if (rdev == p->replacement)
@@ -7144,6 +7212,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int first = 0;
int last = conf->raid_disks - 1;
+ if (test_bit(Journal, &rdev->flags))
+ return -EINVAL;
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@@ -7205,6 +7275,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
+ if (conf->log)
+ return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
if (mddev->external_size &&
@@ -7256,6 +7328,8 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
+ if (conf->log)
+ return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
mddev->new_chunk_sectors == mddev->chunk_sectors)
@@ -7532,6 +7606,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
unlock_all_device_hash_locks_irq(conf);
break;
}
+ r5l_quiesce(conf->log, state);
}
static void *raid45_takeover_raid0(struct mddev *mddev, int level)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 828c2925e..a415e1cd3 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -223,6 +223,9 @@ struct stripe_head {
struct stripe_head *batch_head; /* protected by stripe lock */
spinlock_t batch_lock; /* only header's lock is useful */
struct list_head batch_list; /* protected by head's batch lock*/
+
+ struct r5l_io_unit *log_io;
+ struct list_head log_list;
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
@@ -244,6 +247,7 @@ struct stripe_head {
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
+ u32 log_checksum;
} dev[1]; /* allocated with extra space depending of RAID geometry */
};
@@ -268,6 +272,7 @@ struct stripe_head_state {
struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
+ int log_failed;
};
/* Flags for struct r5dev.flags */
@@ -340,6 +345,7 @@ enum {
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
* to batch yet.
*/
+ STRIPE_LOG_TRAPPED, /* trapped into log */
};
#define STRIPE_EXPAND_SYNC_FLAGS \
@@ -543,6 +549,7 @@ struct r5conf {
struct r5worker_group *worker_groups;
int group_cnt;
int worker_cnt_per_group;
+ struct r5l_log *log;
};
@@ -609,4 +616,21 @@ static inline int algorithm_is_DDF(int layout)
extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
+extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
+extern void raid5_release_stripe(struct stripe_head *sh);
+extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+ int previous, int *dd_idx,
+ struct stripe_head *sh);
+extern struct stripe_head *
+raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
+ int previous, int noblock, int noquiesce);
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5l_log *log);
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern void r5l_quiesce(struct r5l_log *log, int state);
+extern bool r5l_log_disk_error(struct r5conf *conf);
#endif