author     André Fabian Silva Delgado <emulatorman@parabola.nu>  2016-10-20 00:10:27 -0300
committer  André Fabian Silva Delgado <emulatorman@parabola.nu>  2016-10-20 00:10:27 -0300
commit     d0b2f91bede3bd5e3d24dd6803e56eee959c1797
tree       7fee4ab0509879c373c4f2cbd5b8a5be5b4041ee  /drivers/md/raid5.c
parent     e914f8eb445e8f74b00303c19c2ffceaedd16a05
Linux-libre 4.8.2-gnupck-4.8.2-gnu
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--  drivers/md/raid5.c | 183
1 file changed, 112 insertions(+), 71 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8959e6dd3..ee7fc3701 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -659,6 +659,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 {
	struct stripe_head *sh;
	int hash = stripe_hash_locks_hash(sector);
+	int inc_empty_inactive_list_flag;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

@@ -703,7 +704,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
					atomic_inc(&conf->active_stripes);
				BUG_ON(list_empty(&sh->lru) &&
				       !test_bit(STRIPE_EXPANDING, &sh->state));
+				inc_empty_inactive_list_flag = 0;
+				if (!list_empty(conf->inactive_list + hash))
+					inc_empty_inactive_list_flag = 1;
				list_del_init(&sh->lru);
+				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
+					atomic_inc(&conf->empty_inactive_list_nr);
				if (sh->group) {
					sh->group->stripes_cnt--;
					sh->group = NULL;
@@ -762,6 +768,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
	sector_t head_sector, tmp_sec;
	int hash;
	int dd_idx;
+	int inc_empty_inactive_list_flag;

	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
	tmp_sec = sh->sector;
@@ -779,7 +786,12 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
				atomic_inc(&conf->active_stripes);
			BUG_ON(list_empty(&head->lru) &&
			       !test_bit(STRIPE_EXPANDING, &head->state));
+			inc_empty_inactive_list_flag = 0;
+			if (!list_empty(conf->inactive_list + hash))
+				inc_empty_inactive_list_flag = 1;
			list_del_init(&head->lru);
+			if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
+				atomic_inc(&conf->empty_inactive_list_nr);
			if (head->group) {
				head->group->stripes_cnt--;
				head->group = NULL;
@@ -806,7 +818,8 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
			dd_idx = 0;
		while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
			dd_idx++;
-		if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
+		if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
+		    bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
			goto unlock_out;

		if (head->batch_head) {
@@ -891,29 +904,28 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
	if (r5l_write_stripe(conf->log, sh) == 0)
		return;
	for (i = disks; i--; ) {
-		int rw;
+		int op, op_flags = 0;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;

		sh = head_sh;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
+			op = REQ_OP_WRITE;
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
-				rw = WRITE_FUA;
-			else
-				rw = WRITE;
+				op_flags = WRITE_FUA;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
-				rw |= REQ_DISCARD;
+				op = REQ_OP_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-			rw = READ;
+			op = REQ_OP_READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
-			rw = WRITE;
+			op = REQ_OP_WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
-			rw |= REQ_SYNC;
+			op_flags |= REQ_SYNC;

again:
		bi = &sh->dev[i].req;
@@ -927,7 +939,7 @@ again:
			rdev = rrdev;
			rrdev = NULL;
		}
-		if (rw & WRITE) {
+		if (op_is_write(op)) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
@@ -953,7 +965,7 @@ again:
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't to check rrdev.
		 */
-		while ((rw & WRITE) && rdev &&
+		while (op_is_write(op) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
@@ -993,17 +1005,16 @@ again:

			set_bit(STRIPE_IO_STARTED, &sh->state);

-			bio_reset(bi);
			bi->bi_bdev = rdev->bdev;
-			bi->bi_rw = rw;
-			bi->bi_end_io = (rw & WRITE)
+			bio_set_op_attrs(bi, op, op_flags);
+			bi->bi_end_io = op_is_write(op)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

-			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
+			pr_debug("%s: for %llu schedule op %d on disc %d\n",
				__func__, (unsigned long long)sh->sector,
-				bi->bi_rw, i);
+				bi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
@@ -1014,7 +1025,7 @@ again:
			bi->bi_iter.bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
-				bi->bi_rw |= REQ_NOMERGE;
+				bi->bi_opf |= REQ_NOMERGE;

			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
@@ -1027,7 +1038,7 @@ again:
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
-			if (rw & REQ_DISCARD)
+			if (op == REQ_OP_DISCARD)
				bi->bi_vcnt = 0;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
@@ -1045,17 +1056,16 @@ again:

			set_bit(STRIPE_IO_STARTED, &sh->state);

-			bio_reset(rbi);
			rbi->bi_bdev = rrdev->bdev;
-			rbi->bi_rw = rw;
-			BUG_ON(!(rw & WRITE));
+			bio_set_op_attrs(rbi, op, op_flags);
+			BUG_ON(!op_is_write(op));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

-			pr_debug("%s: for %llu schedule op %ld on "
+			pr_debug("%s: for %llu schedule op %d on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
-				rbi->bi_rw, i);
+				rbi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
@@ -1076,7 +1086,7 @@ again:
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
-			if (rw & REQ_DISCARD)
+			if (op == REQ_OP_DISCARD)
				rbi->bi_vcnt = 0;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
@@ -1085,10 +1095,10 @@ again:
			generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
-			if (rw & WRITE)
+			if (op_is_write(op))
				set_bit(STRIPE_DEGRADED, &sh->state);
-			pr_debug("skip op %ld on disc %d for sector %llu\n",
-				bi->bi_rw, i, (unsigned long long)sh->sector);
+			pr_debug("skip op %d on disc %d for sector %llu\n",
+				bi->bi_opf, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
@@ -1619,11 +1629,11 @@ again:
			while (wbi && wbi->bi_iter.bi_sector <
				dev->sector + STRIPE_SECTORS) {
-				if (wbi->bi_rw & REQ_FUA)
+				if (wbi->bi_opf & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
-				if (wbi->bi_rw & REQ_SYNC)
+				if (wbi->bi_opf & REQ_SYNC)
					set_bit(R5_SyncIO, &dev->flags);
-				if (wbi->bi_rw & REQ_DISCARD)
+				if (bio_op(wbi) == REQ_OP_DISCARD)
					set_bit(R5_Discard, &dev->flags);
				else {
					tx = async_copy_data(1, wbi, &dev->page,
@@ -1978,9 +1988,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
	put_cpu();
 }

-static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
+static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
+	int disks)
 {
	struct stripe_head *sh;
+	int i;

	sh = kmem_cache_zalloc(sc, gfp);
	if (sh) {
@@ -1989,6 +2001,17 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
		INIT_LIST_HEAD(&sh->batch_list);
		INIT_LIST_HEAD(&sh->lru);
		atomic_set(&sh->count, 1);
+		for (i = 0; i < disks; i++) {
+			struct r5dev *dev = &sh->dev[i];
+
+			bio_init(&dev->req);
+			dev->req.bi_io_vec = &dev->vec;
+			dev->req.bi_max_vecs = 1;
+
+			bio_init(&dev->rreq);
+			dev->rreq.bi_io_vec = &dev->rvec;
+			dev->rreq.bi_max_vecs = 1;
+		}
	}
	return sh;
 }
@@ -1996,7 +2019,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
	struct stripe_head *sh;

-	sh = alloc_stripe(conf->slab_cache, gfp);
+	sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
	if (!sh)
		return 0;

@@ -2167,7 +2190,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
	mutex_lock(&conf->cache_size_mutex);

	for (i = conf->max_nr_stripes; i; i--) {
-		nsh = alloc_stripe(sc, GFP_KERNEL);
+		nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
		if (!nsh)
			break;

@@ -2299,6 +2322,7 @@ static void raid5_end_read_request(struct bio * bi)
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		bi->bi_error);
	if (i == disks) {
+		bio_reset(bi);
		BUG();
		return;
	}
@@ -2399,6 +2423,7 @@ static void raid5_end_read_request(struct bio * bi)
		}
	}
	rdev_dec_pending(rdev, conf->mddev);
+	bio_reset(bi);
	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
@@ -2436,6 +2461,7 @@ static void raid5_end_write_request(struct bio *bi)
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		bi->bi_error);
	if (i == disks) {
+		bio_reset(bi);
		BUG();
		return;
	}
@@ -2472,6 +2498,7 @@ static void raid5_end_write_request(struct bio *bi)
	if (sh->batch_head && bi->bi_error && !replacement)
		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);

+	bio_reset(bi);
	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
		clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
@@ -2485,16 +2512,6 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 {
	struct r5dev *dev = &sh->dev[i];

-	bio_init(&dev->req);
-	dev->req.bi_io_vec = &dev->vec;
-	dev->req.bi_max_vecs = 1;
-	dev->req.bi_private = sh;
-
-	bio_init(&dev->rreq);
-	dev->rreq.bi_io_vec = &dev->rvec;
-	dev->rreq.bi_max_vecs = 1;
-	dev->rreq.bi_private = sh;
-
	dev->flags = 0;
	dev->sector = raid5_compute_blocknr(sh, i, previous);
 }
@@ -3080,7 +3097,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
			struct md_rdev *rdev;
			rcu_read_lock();
			rdev = rcu_dereference(conf->disks[i].rdev);
-			if (rdev && test_bit(In_sync, &rdev->flags))
+			if (rdev && test_bit(In_sync, &rdev->flags) &&
+			    !test_bit(Faulty, &rdev->flags))
				atomic_inc(&rdev->nr_pending);
			else
				rdev = NULL;
@@ -3210,15 +3228,16 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
	/* During recovery devices cannot be removed, so
	 * locking and refcounting of rdevs is not needed
	 */
+	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
-		struct md_rdev *rdev = conf->disks[i].rdev;
+		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev
		    && !test_bit(Faulty, &rdev->flags)
		    && !test_bit(In_sync, &rdev->flags)
		    && !rdev_set_badblocks(rdev, sh->sector,
					   STRIPE_SECTORS, 0))
			abort = 1;
-		rdev = conf->disks[i].replacement;
+		rdev = rcu_dereference(conf->disks[i].replacement);
		if (rdev
		    && !test_bit(Faulty, &rdev->flags)
		    && !test_bit(In_sync, &rdev->flags)
@@ -3226,6 +3245,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
					   STRIPE_SECTORS, 0))
			abort = 1;
	}
+	rcu_read_unlock();
	if (abort)
		conf->recovery_disabled =
			conf->mddev->recovery_disabled;
@@ -3237,15 +3257,16 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
 {
	struct md_rdev *rdev;
	int rv = 0;
-	/* Doing recovery so rcu locking not required */
-	rdev = sh->raid_conf->disks[disk_idx].replacement;
+
+	rcu_read_lock();
+	rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
	if (rdev
	    && !test_bit(Faulty, &rdev->flags)
	    && !test_bit(In_sync, &rdev->flags)
	    && (rdev->recovery_offset <= sh->sector
		|| rdev->mddev->recovery_cp <= sh->sector))
		rv = 1;
-
+	rcu_read_unlock();
	return rv;
 }

@@ -3600,7 +3621,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
		 (unsigned long long)sh->sector, rmw, rcw);
	set_bit(STRIPE_HANDLE, &sh->state);
-	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
+	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
		/* prefer read-modify-write, but need to get some data */
		if (conf->mddev->queue)
			blk_add_trace_msg(conf->mddev->queue,
@@ -3627,7 +3648,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
			}
		}
	}
-	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
+	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
		/* want reconstruct write, but need to get some data */
		int qread =0;
		rcw = 0;
@@ -4624,7 +4645,9 @@ finish:
	}

	if (!bio_list_empty(&s.return_bi)) {
-		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
+		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) &&
+				(s.failed <= conf->max_degraded ||
+					conf->mddev->external == 0)) {
			spin_lock_irq(&conf->device_lock);
			bio_list_merge(&conf->return_bi, &s.return_bi);
			spin_unlock_irq(&conf->device_lock);
@@ -5150,7 +5173,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
	DEFINE_WAIT(w);
	bool do_prepare;

-	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
		int ret = r5l_handle_flush_request(conf->log, bi);

		if (ret == 0)
@@ -5176,7 +5199,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
		return;
	}

-	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+	if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
		make_discard_request(mddev, bi);
		return;
	}
@@ -5233,7 +5256,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
			(unsigned long long)logical_sector);

		sh = raid5_get_active_stripe(conf, new_sector, previous,
-				       (bi->bi_rw&RWA_MASK), 0);
+				       (bi->bi_opf & REQ_RAHEAD), 0);
		if (sh) {
			if (unlikely(previous)) {
				/* expansion might have moved on while waiting for a
@@ -5301,7 +5324,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
			set_bit(STRIPE_HANDLE, &sh->state);
			clear_bit(STRIPE_DELAYED, &sh->state);
			if ((!sh->batch_head || sh == sh->batch_head) &&
-			    (bi->bi_rw & REQ_SYNC) &&
+			    (bi->bi_opf & REQ_SYNC) &&
			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			release_stripe_plug(mddev, sh);
@@ -6616,6 +6639,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
	}

	conf->min_nr_stripes = NR_STRIPES;
+	if (mddev->reshape_position != MaxSector) {
+		int stripes = max_t(int,
+			((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
+			((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
+		conf->min_nr_stripes = max(NR_STRIPES, stripes);
+		if (conf->min_nr_stripes != NR_STRIPES)
+			printk(KERN_INFO
+				"md/raid:%s: force stripe size %d for reshape\n",
+				mdname(mddev), conf->min_nr_stripes);
+	}
	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
@@ -6822,11 +6855,14 @@ static int raid5_run(struct mddev *mddev)
	if (IS_ERR(conf))
		return PTR_ERR(conf);

-	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) {
-		printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n",
-		       mdname(mddev));
-		mddev->ro = 1;
-		set_disk_ro(mddev->gendisk, 1);
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		if (!journal_dev) {
+			pr_err("md/raid:%s: journal disk is missing, force array readonly\n",
+			       mdname(mddev));
+			mddev->ro = 1;
+			set_disk_ro(mddev->gendisk, 1);
+		} else if (mddev->recovery_cp == MaxSector)
+			set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
	}

	conf->min_offset_diff = min_offset_diff;
@@ -7066,10 +7102,12 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
		conf->chunk_sectors / 2, mddev->layout);
	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
		conf->raid_disks - mddev->degraded);
-	for (i = 0; i < conf->raid_disks; i++)
-		seq_printf (seq, "%s",
-			       conf->disks[i].rdev &&
-			       test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
+	rcu_read_lock();
+	for (i = 0; i < conf->raid_disks; i++) {
+		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
+		seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+	}
+	rcu_read_unlock();
	seq_printf (seq, "]");
 }
@@ -7191,12 +7229,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
		goto abort;
	}
	*rdevp = NULL;
-	synchronize_rcu();
-	if (atomic_read(&rdev->nr_pending)) {
-		/* lost the race, try later */
-		err = -EBUSY;
-		*rdevp = rdev;
-	} else if (p->replacement) {
+	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
+		synchronize_rcu();
+		if (atomic_read(&rdev->nr_pending)) {
+			/* lost the race, try later */
+			err = -EBUSY;
+			*rdevp = rdev;
+		}
+	}
+	if (p->replacement) {
		/* We must have just cleared 'rdev' */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);