Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig              |  13
-rw-r--r--  block/Makefile             |   6
-rw-r--r--  block/badblocks.c          | 585
-rw-r--r--  block/bio-integrity.c      |  13
-rw-r--r--  block/bio.c                |   6
-rw-r--r--  block/blk-cgroup.c         |   9
-rw-r--r--  block/blk-core.c           |  34
-rw-r--r--  block/blk-iopoll.c         | 224
-rw-r--r--  block/blk-map.c            |  91
-rw-r--r--  block/blk-merge.c          |  11
-rw-r--r--  block/blk-mq-cpumap.c      |   2
-rw-r--r--  block/blk-mq-tag.c         |  11
-rw-r--r--  block/blk-mq.c             |  50
-rw-r--r--  block/blk-mq.h             |  11
-rw-r--r--  block/blk-sysfs.c          |   5
-rw-r--r--  block/blk-timeout.c        |  17
-rw-r--r--  block/blk.h                |   2
-rw-r--r--  block/deadline-iosched.c   |   3
-rw-r--r--  block/genhd.c              | 113
-rw-r--r--  block/ioctl.c              |  33
-rw-r--r--  block/partition-generic.c  |  18
-rw-r--r--  block/uuid.c               | 509
22 files changed, 830 insertions(+), 936 deletions(-)
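Before the diff body, a brief orientation: the new block/badblocks.c stores each bad range as a packed 64-bit table entry, per the badblocks_check() kernel-doc below (a 9-bit length field holding length-1, i.e. 1-512 sectors, a 54-bit start sector, and an 'acknowledged' flag in the most significant bit). The following minimal userspace sketch illustrates that encoding; the mask values are spelled out here on the assumption that they match the BB_* helpers the patch uses from <linux/badblocks.h>.

/* Userspace sketch of the bad-block entry layout described below.
 * Assumed layout: bit 63 = ack flag, bits 62..9 = start sector,
 * bits 8..0 = length - 1 (so 1..512 sectors per entry).
 */
#include <stdint.h>
#include <stdio.h>

#define BB_MAX_LEN 512
#define BB_OFFSET(x) (((x) & 0x7ffffffffffffe00ULL) >> 9)
#define BB_LEN(x)    (((x) & 0x1ffULL) + 1)
#define BB_ACK(x)    (!!((x) & 0x8000000000000000ULL))
#define BB_MAKE(a, l, ack) \
	(((uint64_t)(a) << 9) | ((l) - 1) | ((uint64_t)(!!(ack)) << 63))

int main(void)
{
	/* Mark 512 sectors starting at sector 123456 as acknowledged-bad. */
	uint64_t entry = BB_MAKE(123456ULL, BB_MAX_LEN, 1);

	printf("start=%llu len=%llu ack=%d\n",
	       (unsigned long long)BB_OFFSET(entry),
	       (unsigned long long)BB_LEN(entry),
	       BB_ACK(entry));
	return 0;
}

The 512-sector cap (BB_MAX_LEN) is why badblocks_set() below splits a long range into several maximal entries rather than storing it as one.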
diff --git a/block/Kconfig b/block/Kconfig index 161491d0a..0363cd731 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -88,6 +88,19 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_DAX + bool "Block device DAX support" + depends on FS_DAX + depends on BROKEN + help + When DAX support is available (CONFIG_FS_DAX) raw block + devices can also support direct userspace access to the + storage capacity via MMAP(2) similar to a file on a + DAX-enabled filesystem. However, the DAX I/O-path disables + some standard I/O-statistics, and the MMAP(2) path has some + operational differences due to bypassing the page + cache. If in doubt, say N. + config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y diff --git a/block/Makefile b/block/Makefile index 086be5007..4a3668393 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,10 +5,10 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - uuid.o genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ - partitions/ + genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + badblocks.o partitions/ obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o diff --git a/block/badblocks.c b/block/badblocks.c new file mode 100644 index 000000000..7be53cb1c --- /dev/null +++ b/block/badblocks.c @@ -0,0 +1,585 @@ +/* + * Bad block management + * + * - Heavily based on MD badblocks code from Neil Brown + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/badblocks.h> +#include <linux/seqlock.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/slab.h> + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. 
+ * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad blocks which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + /* This could still be the one, earlier ranges + * could not. + */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all ranges that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_check); + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: whether to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it.
+ * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 0; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal + */ + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range + */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' + */ + if (bb->count >= MAX_BADBLOCKS) { + /* No room for more */ + rv = 1; + break; + } else { + int this_sectors = sectors; + + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks from the table.
+ * @bb: the badblocks structure that holds all badblock information + * @s: first sector to clear + * @sectors: number of sectors to clear + * + * This may involve extending the table if we split a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * -ENOSPC: failed to clear badblocks (no room to split a range) + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better to think a block is bad when it + * isn't than to think a block is not bad when it is. + */ + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_clear); + +/** + * ack_all_badblocks() - Acknowledge all bad blocks in a list. + * @bb: the badblocks structure that holds all badblock information + * + * This only succeeds if ->changed is clear.
It is used by + * in-kernel metadata updates. + */ +void ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(ack_all_badblocks); + +/** + * badblocks_show() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @unack: whether to show unacknowledged badblocks + * + * Return: + * Length of returned data + */ +ssize_t badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} +EXPORT_SYMBOL_GPL(badblocks_show); + +/** + * badblocks_store() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @len: length of data received from sysfs + * @unack: whether to store unacknowledged badblocks + * + * Return: + * Length of the buffer processed or -ve error.
+ */ +ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, + int unack) +{ + unsigned long long sector; + int length; + char newline; + + switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (badblocks_set(bb, sector, length, !unack)) + return -ENOSPC; + else + return len; +} +EXPORT_SYMBOL_GPL(badblocks_store); + +static int __badblocks_init(struct device *dev, struct badblocks *bb, + int enable) +{ + bb->dev = dev; + bb->count = 0; + if (enable) + bb->shift = 0; + else + bb->shift = -1; + if (dev) + bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); + else + bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->page) { + bb->shift = -1; + return -ENOMEM; + } + seqlock_init(&bb->lock); + + return 0; +} + +/** + * badblocks_init() - initialize the badblocks structure + * @bb: the badblocks structure that holds all badblock information + * @enable: whether to enable badblocks accounting + * + * Return: + * 0: success + * -ve errno: on error + */ +int badblocks_init(struct badblocks *bb, int enable) +{ + return __badblocks_init(NULL, bb, enable); +} +EXPORT_SYMBOL_GPL(badblocks_init); + +int devm_init_badblocks(struct device *dev, struct badblocks *bb) +{ + if (!bb) + return -EINVAL; + return __badblocks_init(dev, bb, 1); +} +EXPORT_SYMBOL_GPL(devm_init_badblocks); + +/** + * badblocks_exit() - free the badblocks structure + * @bb: the badblocks structure that holds all badblock information + */ +void badblocks_exit(struct badblocks *bb) +{ + if (!bb) + return; + if (bb->dev) + devm_kfree(bb->dev, bb->page); + else + kfree(bb->page); + bb->page = NULL; +} +EXPORT_SYMBOL_GPL(badblocks_exit); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f6325d573..711e4d8de 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, } if (unlikely(!bip)) - return NULL; + return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); @@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: mempool_free(bip, bs->bio_integrity_pool); - return NULL; + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); @@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio) /* Allocate bio integrity payload and integrity vectors */ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { + if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - return -EIO; + return PTR_ERR(bip); } bip->bip_flags |= BIP_BLOCK_INTEGRITY; @@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; + if (IS_ERR(bip)) + return PTR_ERR(bip); memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); diff --git a/block/bio.c b/block/bio.c index d4d144363..cf7591551 100644 --- a/block/bio.c +++ b/block/bio.c @@ -874,7 +874,7 @@ int submit_bio_wait(int rw, struct bio *bio) bio->bi_private = &ret; bio->bi_end_io = submit_bio_wait_endio; submit_bio(rw, bio); - wait_for_completion(&ret.event); + wait_for_completion_io(&ret.event); return ret.error; } @@ -1128,7 +1128,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, int i, ret; int nr_pages = 0; unsigned int len = iter->count; -
unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; + unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; for (i = 0; i < iter->nr_segs; i++) { unsigned long uaddr; @@ -1307,7 +1307,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, goto out_unmap; } - offset = uaddr & ~PAGE_MASK; + offset = offset_in_page(uaddr); for (j = cur_page; j < page_limit; j++) { unsigned int bytes = PAGE_SIZE - offset; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5a37188b5..66e6f1aae 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, { struct gendisk *disk; struct blkcg_gq *blkg; + struct module *owner; unsigned int major, minor; int key_len, part, ret; char *body; @@ -804,7 +805,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (!disk) return -ENODEV; if (part) { + owner = disk->fops->owner; put_disk(disk); + module_put(owner); return -ENODEV; } @@ -820,7 +823,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ret = PTR_ERR(blkg); rcu_read_unlock(); spin_unlock_irq(disk->queue->queue_lock); + owner = disk->fops->owner; put_disk(disk); + module_put(owner); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -851,9 +856,13 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); void blkg_conf_finish(struct blkg_conf_ctx *ctx) __releases(ctx->disk->queue->queue_lock) __releases(rcu) { + struct module *owner; + spin_unlock_irq(ctx->disk->queue->queue_lock); rcu_read_unlock(); + owner = ctx->disk->fops->owner; put_disk(ctx->disk); + module_put(owner); } EXPORT_SYMBOL_GPL(blkg_conf_finish); diff --git a/block/blk-core.c b/block/blk-core.c index e8e229c70..b83d29755 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -48,12 +48,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); -int trap_non_toi_io; - /* * For the allocated request tables */ -struct kmem_cache *request_cachep = NULL; +struct kmem_cache *request_cachep; /* * For queue allocation @@ -648,7 +646,7 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, gfp_t gfp) +int blk_queue_enter(struct request_queue *q, bool nowait) { while (true) { int ret; @@ -656,7 +654,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp) if (percpu_ref_tryget_live(&q->q_usage_counter)) return 0; - if (!gfpflags_allow_blocking(gfp)) + if (nowait) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, @@ -682,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) wake_up_all(&q->mq_freeze_wq); } +static void blk_rq_timed_out_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + kblockd_schedule_work(&q->timeout_work); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -843,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; @@ -1294,7 +1300,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask, false); + return 
blk_mq_alloc_request(q, rw, + (gfp_mask & __GFP_DIRECT_RECLAIM) ? + 0 : BLK_MQ_REQ_NOWAIT); else return blk_old_get_request(q, rw, gfp_mask); } @@ -2062,8 +2070,7 @@ blk_qc_t generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { - + if (likely(blk_queue_enter(q, false) == 0)) { ret = q->make_request_fn(q, bio); blk_queue_exit(q); @@ -2097,9 +2104,6 @@ blk_qc_t submit_bio(int rw, struct bio *bio) { bio->bi_rw |= rw; - if (unlikely(trap_non_toi_io)) - BUG_ON(!bio_flagged(bio, BIO_TOI)); - /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. @@ -2451,14 +2455,16 @@ struct request *blk_peek_request(struct request_queue *q) rq = NULL; break; - } else if (ret == BLKPREP_KILL) { + } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { + int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; + rq->cmd_flags |= REQ_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. */ blk_start_request(rq); - __blk_end_request_all(rq, -EIO); + __blk_end_request_all(rq, err); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; @@ -3539,7 +3545,7 @@ int __init blk_dev_init(void) request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); - blk_requestq_cachep = kmem_cache_create("blkdev_queue", + blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); return 0; diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c deleted file mode 100644 index 0736729d6..000000000 --- a/block/blk-iopoll.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Functions related to interrupt-poll handling in the block layer. This - * is similar to NAPI for network devices. - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/blk-iopoll.h> -#include <linux/delay.h> - -#include "blk.h" - -static unsigned int blk_iopoll_budget __read_mostly = 256; - -static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); - -/** - * blk_iopoll_sched - Schedule a run of the iopoll handler - * @iop: The parent iopoll structure - * - * Description: - * Add this blk_iopoll structure to the pending poll list and trigger the - * raise of the blk iopoll softirq. The driver must already have gotten a - * successful return from blk_iopoll_sched_prep() before calling this. - **/ -void blk_iopoll_sched(struct blk_iopoll *iop) -{ - unsigned long flags; - - local_irq_save(flags); - list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_iopoll_sched); - -/** - * __blk_iopoll_complete - Mark this @iop as un-polled again - * @iop: The parent iopoll structure - * - * Description: - * See blk_iopoll_complete(). This function must be called with interrupts - * disabled. 
- **/ -void __blk_iopoll_complete(struct blk_iopoll *iop) -{ - list_del(&iop->list); - smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(__blk_iopoll_complete); - -/** - * blk_iopoll_complete - Mark this @iop as un-polled again - * @iop: The parent iopoll structure - * - * Description: - * If a driver consumes less than the assigned budget in its run of the - * iopoll handler, it'll end the polled mode by calling this function. The - * iopoll handler will not be invoked again before blk_iopoll_sched_prep() - * is called. - **/ -void blk_iopoll_complete(struct blk_iopoll *iop) -{ - unsigned long flags; - - local_irq_save(flags); - __blk_iopoll_complete(iop); - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_iopoll_complete); - -static void blk_iopoll_softirq(struct softirq_action *h) -{ - struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); - int rearm = 0, budget = blk_iopoll_budget; - unsigned long start_time = jiffies; - - local_irq_disable(); - - while (!list_empty(list)) { - struct blk_iopoll *iop; - int work, weight; - - /* - * If softirq window is exhausted then punt. - */ - if (budget <= 0 || time_after(jiffies, start_time)) { - rearm = 1; - break; - } - - local_irq_enable(); - - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - iop = list_entry(list->next, struct blk_iopoll, list); - - weight = iop->weight; - work = 0; - if (test_bit(IOPOLL_F_SCHED, &iop->state)) - work = iop->poll(iop, weight); - - budget -= work; - - local_irq_disable(); - - /* - * Drivers must not modify the iopoll state, if they - * consume their assigned weight (or more, some drivers can't - * easily just stop processing, they have to complete an - * entire mask of commands).In such cases this code - * still "owns" the iopoll instance and therefore can - * move the instance around on the list at-will. - */ - if (work >= weight) { - if (blk_iopoll_disable_pending(iop)) - __blk_iopoll_complete(iop); - else - list_move_tail(&iop->list, list); - } - } - - if (rearm) - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - - local_irq_enable(); -} - -/** - * blk_iopoll_disable - Disable iopoll on this @iop - * @iop: The parent iopoll structure - * - * Description: - * Disable io polling and wait for any pending callbacks to have completed. - **/ -void blk_iopoll_disable(struct blk_iopoll *iop) -{ - set_bit(IOPOLL_F_DISABLE, &iop->state); - while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state)) - msleep(1); - clear_bit(IOPOLL_F_DISABLE, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_disable); - -/** - * blk_iopoll_enable - Enable iopoll on this @iop - * @iop: The parent iopoll structure - * - * Description: - * Enable iopoll on this @iop. Note that the handler run will not be - * scheduled, it will only mark it as active. - **/ -void blk_iopoll_enable(struct blk_iopoll *iop) -{ - BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); - smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_enable); - -/** - * blk_iopoll_init - Initialize this @iop - * @iop: The parent iopoll structure - * @weight: The default weight (or command completion budget) - * @poll_fn: The handler to invoke - * - * Description: - * Initialize this blk_iopoll structure. Before being actively used, the - * driver must call blk_iopoll_enable(). 
- **/ -void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) -{ - memset(iop, 0, sizeof(*iop)); - INIT_LIST_HEAD(&iop->list); - iop->weight = weight; - iop->poll = poll_fn; - set_bit(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_init); - -static int blk_iopoll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block blk_iopoll_cpu_notifier = { - .notifier_call = blk_iopoll_cpu_notify, -}; - -static __init int blk_iopoll_setup(void) -{ - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); - - open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq); - register_hotcpu_notifier(&blk_iopoll_cpu_notifier); - return 0; -} -subsys_initcall(blk_iopoll_setup); diff --git a/block/blk-map.c b/block/blk-map.c index f565e11f4..a54f0543b 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } +static int __blk_rq_map_user_iov(struct request *rq, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask, bool copy) +{ + struct request_queue *q = rq->q; + struct bio *bio, *orig_bio; + int ret; + + if (copy) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); + else + bio = bio_map_user_iov(q, iter, gfp_mask); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + + iov_iter_advance(iter, bio->bi_iter.bi_size); + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + orig_bio = bio; + blk_queue_bounce(q, &bio); + + /* + * We link the bounce buffer in and could have to traverse it + * later so we have to get a ref to prevent it from being freed + */ + bio_get(bio); + + ret = blk_rq_append_bio(q, rq, bio); + if (ret) { + bio_endio(bio); + __blk_rq_unmap_user(orig_bio); + bio_put(bio); + return ret; + } + + return 0; +} + /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted @@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct bio *bio; - int unaligned = 0; - struct iov_iter i; struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; + bool copy = (q->dma_pad_mask & iter->count) || map_data; + struct bio *bio = NULL; + struct iov_iter i; + int ret; if (!iter || !iter->count) return -EINVAL; @@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, */ if ((uaddr & queue_dma_alignment(q)) || iovec_gap_to_prv(q, &prv, &iov)) - unaligned = 1; + copy = true; prv.iov_base = iov.iov_base; prv.iov_len = iov.iov_len; } - if (unaligned || (q->dma_pad_mask & iter->count) || map_data) - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); - else - bio = bio_map_user_iov(q, iter, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - - if (bio->bi_iter.bi_size != iter->count) { - /* - * Grab an extra reference to 
this bio, as bio_unmap_user() - * expects to be able to drop it twice as it happens on the - * normal IO completion path - */ - bio_get(bio); - bio_endio(bio); - __blk_rq_unmap_user(bio); - return -EINVAL; - } + i = *iter; + do { + ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); + if (ret) + goto unmap_rq; + if (!bio) + bio = rq->bio; + } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - - blk_queue_bounce(q, &bio); - bio_get(bio); - blk_rq_bio_prep(q, rq, bio); return 0; + +unmap_rq: + __blk_rq_unmap_user(bio); + rq->bio = NULL; + return -EINVAL; } EXPORT_SYMBOL(blk_rq_map_user_iov); diff --git a/block/blk-merge.c b/block/blk-merge.c index b966db8f3..261353166 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,6 +7,8 @@ #include <linux/blkdev.h> #include <linux/scatterlist.h> +#include <trace/events/block.h> + #include "blk.h" static struct bio *blk_bio_discard_split(struct request_queue *q, @@ -187,6 +189,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, split->bi_rw |= REQ_NOMERGE; bio_chain(split, *bio); + trace_block_split(q, split, (*bio)->bi_iter.bi_sector); generic_make_request(*bio); *bio = split; } @@ -301,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { struct bio_vec end_bv = { NULL }, nxt_bv; - struct bvec_iter iter; if (!blk_queue_cluster(q)) return 0; @@ -313,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, if (!bio_has_data(bio)) return 1; - bio_for_each_segment(end_bv, bio, iter) - if (end_bv.bv_len == iter.bi_size) - break; - - nxt_bv = bio_iovec(nxt); + bio_get_last_bvec(bio, &end_bv); + bio_get_first_bvec(nxt, &nxt_bv); if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) return 0; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 8764c241e..d0634bcf3 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -113,7 +113,7 @@ int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) for_each_possible_cpu(i) { if (index == mq_map[i]) - return cpu_to_node(i); + return local_memory_node(cpu_to_node(i)); } return NUMA_NO_NODE; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index a07ca3488..abdbb4740 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!gfpflags_allow_blocking(data->gfp)) + if (data->flags & BLK_MQ_REQ_NOWAIT) return -1; bs = bt_wait_ptr(bt, hctx); @@ -303,7 +303,7 @@ static int bt_get(struct blk_mq_alloc_data *data, data->ctx = blk_mq_get_ctx(data->q); data->hctx = data->q->mq_ops->map_queue(data->q, data->ctx->cpu); - if (data->reserved) { + if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { last_tag = &data->ctx->last_tag; @@ -349,10 +349,9 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { - if (!data->reserved) - return __blk_mq_get_tag(data); - - return __blk_mq_get_reserved_tag(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + return __blk_mq_get_reserved_tag(data); + return __blk_mq_get_tag(data); } static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) diff --git a/block/blk-mq.c b/block/blk-mq.c index 6d6f8feb4..56c0a726b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -229,8 +229,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) return NULL; } -struct 
request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, - bool reserved) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + unsigned int flags) { struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; @@ -238,24 +238,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_queue_enter(q, gfp); + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, - reserved, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { + if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; } @@ -601,12 +599,12 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. */ - if (unlikely(blk_queue_dying(rq->q))) - blk_mq_complete_request(rq, -EIO); + if (unlikely(blk_queue_dying(rq->q))) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + } return; } - if (rq->cmd_flags & REQ_NO_TIMEOUT) - return; if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) @@ -617,15 +615,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, } } -static void blk_mq_rq_timer(unsigned long priv) +static void blk_mq_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *)priv; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, }; int i; + if (blk_queue_enter(q, true)) + return; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { @@ -640,6 +642,7 @@ static void blk_mq_rq_timer(unsigned long priv) blk_mq_tag_idle(hctx); } } + blk_queue_exit(q); } /* @@ -1175,8 +1178,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, rw |= REQ_SYNC; trace_block_getrq(q, bio, rw); - blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); if (unlikely(!rq)) { __blk_mq_run_hw_queue(hctx); @@ -1185,8 +1187,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, - __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; @@ -1794,7 +1795,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, * not, we remain on the home node of the device */ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = cpu_to_node(i); + hctx->numa_node = local_memory_node(cpu_to_node(i)); } } @@ -1854,6 +1855,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); + cpumask_copy(hctx->tags->cpumask, 
hctx->cpumask); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping @@ -1867,14 +1869,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } - - queue_for_each_ctx(q, ctx, i) { - if (!cpumask_test_cpu(i, online_mask)) - continue; - - hctx = q->mq_ops->map_queue(q, i); - cpumask_set_cpu(i, hctx->tags->cpumask); - } } static void queue_set_hctx_shared(struct request_queue *q, bool shared) @@ -2019,7 +2013,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; diff --git a/block/blk-mq.h b/block/blk-mq.h index 713820b47..eaede8e45 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -96,8 +96,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - gfp_t gfp; - bool reserved; + unsigned int flags; /* input & output parameter */ struct blk_mq_ctx *ctx; @@ -105,13 +104,11 @@ struct blk_mq_alloc_data { }; static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, - struct request_queue *q, gfp_t gfp, bool reserved, - struct blk_mq_ctx *ctx, - struct blk_mq_hw_ctx *hctx) + struct request_queue *q, unsigned int flags, + struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx) { data->q = q; - data->gfp = gfp; - data->reserved = reserved; + data->flags = flags; data->ctx = ctx; data->hctx = hctx; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e140cc487..dd9376305 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -147,10 +147,9 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) { - unsigned long long val; - val = q->limits.max_hw_discard_sectors << 9; - return sprintf(page, "%llu\n", val); + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_hw_discard_sectors << 9); } static ssize_t queue_discard_max_show(struct request_queue *q, char *page) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index aa40aa933..a30441a20 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout } } -void blk_rq_timed_out_timer(unsigned long data) +void blk_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); unsigned long flags, next = 0; struct request *rq, *tmp; int next_set = 0; + if (blk_queue_enter(q, true)) + return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_exit(q); } /** @@ -186,15 +190,13 @@ unsigned long blk_rq_timeout(unsigned long timeout) * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. + * Queue lock must be held for the non-mq case, mq case doesn't care. 
*/ void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; - if (req->cmd_flags & REQ_NO_TIMEOUT) - return; - /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; @@ -209,6 +211,11 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; + + /* + * Only the non-mq case needs to add the request to a protected list. + * For the mq case we simply scan the tag map. + */ if (!q->mq_ops) list_add_tail(&req->timeout_list, &req->q->timeout_list); diff --git a/block/blk.h b/block/blk.h index c43926d3d..70e4aee9c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -93,7 +93,7 @@ static inline void blk_flush_integrity(void) } #endif -void blk_rq_timed_out_timer(unsigned long data); +void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index a753df2b3..d0dd7882d 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -39,7 +39,6 @@ struct deadline_data { */ struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ - sector_t last_sector; /* head position */ unsigned int starved; /* times reads have starved writes */ /* @@ -210,8 +209,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) dd->next_rq[WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); - dd->last_sector = rq_end_sector(rq); - /* * take it off the sort and fifo list, move * to dispatch queue diff --git a/block/genhd.c b/block/genhd.c index c06731166..9f42526b4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -18,10 +18,9 @@ #include <linux/kobj_map.h> #include <linux/mutex.h> #include <linux/idr.h> -#include <linux/ctype.h> -#include <linux/fs_uuid.h> #include <linux/log2.h> #include <linux/pm_runtime.h> +#include <linux/badblocks.h> #include "blk.h" @@ -666,7 +665,6 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -674,6 +672,31 @@ void del_gendisk(struct gendisk *disk) } EXPORT_SYMBOL(del_gendisk); +/* sysfs access to bad-blocks list. 
*/ +static ssize_t disk_badblocks_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return sprintf(page, "\n"); + + return badblocks_show(disk->bb, page, 0); +} + +static ssize_t disk_badblocks_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t len) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return -ENXIO; + + return badblocks_store(disk->bb, page, len, 0); +} + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -992,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, + disk_badblocks_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -1013,6 +1038,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_badblocks.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1391,85 +1417,6 @@ int invalidate_partition(struct gendisk *disk, int partno) EXPORT_SYMBOL(invalidate_partition); -dev_t blk_lookup_fs_info(struct fs_info *seek) -{ - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; - int best_score = 0; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while (best_score < 3 && (dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; - struct hd_struct *part; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - - while (best_score < 3 && (part = disk_part_iter_next(&piter))) { - int score = part_matches_fs_info(part, seek); - if (score > best_score) { - devt = part_devt(part); - best_score = score; - } - } - disk_part_iter_exit(&piter); - } - class_dev_iter_exit(&iter); - return devt; -} - -/* Caller uses NULL, key to start. For each match found, we return a bdev on - * which we have done blkdev_get, and we do the blkdev_put on block devices - * that are passed to us. When no more matches are found, we return NULL. - */ -struct block_device *next_bdev_of_type(struct block_device *last, - const char *key) -{ - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; - struct block_device *next = NULL, *bdev; - int got_last = 0; - - if (!key) - goto out; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while (!devt && (dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; - struct hd_struct *part; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - - while ((part = disk_part_iter_next(&piter))) { - bdev = bdget(part_devt(part)); - if (last && !got_last) { - if (last == bdev) - got_last = 1; - continue; - } - - if (blkdev_get(bdev, FMODE_READ, 0)) - continue; - - if (bdev_matches_key(bdev, key)) { - next = bdev; - break; - } - - blkdev_put(bdev, FMODE_READ); - } - disk_part_iter_exit(&piter); - } - class_dev_iter_exit(&iter); -out: - if (last) - blkdev_put(last, FMODE_READ); - return next; -} - /* * Disk events - monitor disk events like media change and eject request. 
*/ @@ -1502,7 +1449,7 @@ static DEFINE_MUTEX(disk_events_mutex); static LIST_HEAD(disk_events); /* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs = 0; +static unsigned long disk_events_dfl_poll_msecs; static unsigned long disk_events_poll_jiffies(struct gendisk *disk) { diff --git a/block/ioctl.c b/block/ioctl.c index 0918aed2d..d8996bbd7 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,6 +4,7 @@ #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> +#include <linux/badblocks.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> @@ -406,6 +407,35 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } +#ifdef CONFIG_FS_DAX +bool blkdev_dax_capable(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + + if (!disk->fops->direct_access) + return false; + + /* + * If the partition is not aligned on a page boundary, we can't + * do dax I/O to it. + */ + if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) + || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + return false; + + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + + return true; +} +#endif + static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -568,6 +598,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKDAXGET: + return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); + break; case IOC_PR_REGISTER: return blkdev_pr_register(bdev, argp); case IOC_PR_RESERVE: diff --git a/block/partition-generic.c b/block/partition-generic.c index 746935a59..fefd01b49 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -16,6 +16,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/dax.h> #include <linux/blktrace_api.h> #include "partitions/check.h" @@ -550,13 +551,24 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n) { struct address_space *mapping = bdev->bd_inode->i_mapping; + + return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), - NULL); + /* don't populate page cache for dax capable devices */ + if (IS_DAX(bdev->bd_inode)) + page = read_dax_sector(bdev, n); + else + page = read_pagecache_sector(bdev, n); + if (!IS_ERR(page)) { if (PageError(page)) goto fail; diff --git a/block/uuid.c b/block/uuid.c deleted file mode 100644 index 4610d7b8f..000000000 --- a/block/uuid.c +++ /dev/null @@ -1,509 +0,0 @@ -#include <linux/blkdev.h> -#include <linux/ctype.h> -#include <linux/fs_uuid.h> -#include <linux/slab.h> -#include <linux/export.h> - -static int debug_enabled; - -#define PRINTK(fmt, args...) 
do { \ - if (debug_enabled) \ - printk(KERN_DEBUG fmt, ## args); \ - } while(0) - -#define PRINT_HEX_DUMP(v1, v2, v3, v4, v5, v6, v7, v8) \ - do { \ - if (debug_enabled) \ - print_hex_dump(v1, v2, v3, v4, v5, v6, v7, v8); \ - } while(0) - -/* - * Simple UUID translation - */ - -struct uuid_info { - const char *key; - const char *name; - long bkoff; - unsigned sboff; - unsigned sig_len; - const char *magic; - int uuid_offset; - int last_mount_offset; - int last_mount_size; -}; - -/* - * Based on libuuid's blkid_magic array. Note that I don't - * have uuid offsets for all of these yet - mssing ones are 0x0. - * Further information welcome. - * - * Rearranged by page of fs signature for optimisation. - */ -static struct uuid_info uuid_list[] = { - { NULL, "oracleasm", 0, 32, 8, "ORCLDISK", 0x0, 0, 0 }, - { "ntfs", "ntfs", 0, 3, 8, "NTFS ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x52, 5, "MSWIN", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x52, 8, "FAT32 ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x36, 5, "MSDOS", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x36, 8, "FAT16 ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x36, 8, "FAT12 ", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0, 1, "\353", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0, 1, "\351", 0x0, 0, 0 }, - { "vfat", "vfat", 0, 0x1fe, 2, "\125\252", 0x0, 0, 0 }, - { "xfs", "xfs", 0, 0, 4, "XFSB", 0x20, 0, 0 }, - { "romfs", "romfs", 0, 0, 8, "-rom1fs-", 0x0, 0, 0 }, - { "bfs", "bfs", 0, 0, 4, "\316\372\173\033", 0, 0, 0 }, - { "cramfs", "cramfs", 0, 0, 4, "E=\315\050", 0x0, 0, 0 }, - { "qnx4", "qnx4", 0, 4, 6, "QNX4FS", 0, 0, 0 }, - { NULL, "crypt_LUKS", 0, 0, 6, "LUKS\xba\xbe", 0x0, 0, 0 }, - { "squashfs", "squashfs", 0, 0, 4, "sqsh", 0, 0, 0 }, - { "squashfs", "squashfs", 0, 0, 4, "hsqs", 0, 0, 0 }, - { "ocfs", "ocfs", 0, 8, 9, "OracleCFS", 0x0, 0, 0 }, - { "lvm2pv", "lvm2pv", 0, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, - { "sysv", "sysv", 0, 0x3f8, 4, "\020~\030\375", 0, 0, 0 }, - { "ext", "ext", 1, 0x38, 2, "\123\357", 0x468, 0x42c, 4 }, - { "minix", "minix", 1, 0x10, 2, "\177\023", 0, 0, 0 }, - { "minix", "minix", 1, 0x10, 2, "\217\023", 0, 0, 0 }, - { "minix", "minix", 1, 0x10, 2, "\150\044", 0, 0, 0 }, - { "minix", "minix", 1, 0x10, 2, "\170\044", 0, 0, 0 }, - { "lvm2pv", "lvm2pv", 1, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, - { "vxfs", "vxfs", 1, 0, 4, "\365\374\001\245", 0, 0, 0 }, - { "hfsplus", "hfsplus", 1, 0, 2, "BD", 0x0, 0, 0 }, - { "hfsplus", "hfsplus", 1, 0, 2, "H+", 0x0, 0, 0 }, - { "hfsplus", "hfsplus", 1, 0, 2, "HX", 0x0, 0, 0 }, - { "hfs", "hfs", 1, 0, 2, "BD", 0x0, 0, 0 }, - { "ocfs2", "ocfs2", 1, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "lvm2pv", "lvm2pv", 0, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, - { "lvm2pv", "lvm2pv", 1, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, - { "ocfs2", "ocfs2", 2, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "swap", "swap", 0, 0xff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0xff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "ocfs2", "ocfs2", 4, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "ocfs2", "ocfs2", 8, 0, 6, "OCFSV2", 0x0, 0, 0 }, - { "hpfs", "hpfs", 8, 0, 4, "I\350\225\371", 0, 0, 0 }, - { "reiserfs", "reiserfs", 8, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, - { "reiserfs", "reiserfs", 8, 20, 8, "ReIsErFs", 0x10054, 0, 0 }, - { "zfs", "zfs", 8, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, - { "zfs", "zfs", 8, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, - { "ufs", 
"ufs", 8, 0x55c, 4, "T\031\001\000", 0, 0, 0 }, - { "swap", "swap", 0, 0x1ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0x1ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x1ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x1ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x1ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr2Fs", 0x10054, 0, 0 }, - { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr3Fs", 0x10054, 0, 0 }, - { "reiserfs", "reiserfs", 64, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, - { "reiser4", "reiser4", 64, 0, 7, "ReIsEr4", 0x100544, 0, 0 }, - { "gfs2", "gfs2", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, - { "gfs", "gfs", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, - { "btrfs", "btrfs", 64, 0x40, 8, "_BHRfS_M", 0x0, 0, 0 }, - { "swap", "swap", 0, 0x3ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0x3ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x3ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x3ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x3ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "udf", "udf", 32, 1, 5, "BEA01", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "BOOT2", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "CD001", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "CDW02", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "NSR02", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "NSR03", 0x0, 0, 0 }, - { "udf", "udf", 32, 1, 5, "TEA01", 0x0, 0, 0 }, - { "iso9660", "iso9660", 32, 1, 5, "CD001", 0x0, 0, 0 }, - { "iso9660", "iso9660", 32, 9, 5, "CDROM", 0x0, 0, 0 }, - { "jfs", "jfs", 32, 0, 4, "JFS1", 0x88, 0, 0 }, - { "swap", "swap", 0, 0x7ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0x7ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x7ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x7ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0x7ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0xfff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, - { "swap", "swap", 0, 0xfff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xfff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xfff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, - { "swap", "swsuspend", 0, 0xfff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, - { "zfs", "zfs", 264, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, - { "zfs", "zfs", 264, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, - { NULL, NULL, 0, 0, 0, NULL, 0x0, 0, 0 } -}; - -static int null_uuid(const char *uuid) -{ - int i; - - for (i = 0; i < 16 && !uuid[i]; i++); - - return (i == 16); -} - - -static void uuid_end_bio(struct bio *bio) -{ - struct page *page = bio->bi_io_vec[0].bv_page; - - if (bio->bi_error) - SetPageError(page); - - unlock_page(page); - bio_put(bio); -} - - -/** - * submit - submit BIO request - * @dev: The block device we're using. - * @page_num: The page we're reading. - * - * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the - * textbook - allocate and initialize the bio. If we're writing, make sure - * the page is marked as dirty. Then submit it and carry on." 
-
-int bdev_matches_key(struct block_device *bdev, const char *key)
-{
-        unsigned char *data = NULL;
-        struct page *data_page = NULL;
-
-        int dev_offset, pg_num, pg_off, i;
-        int last_pg_num = -1;
-        int result = 0;
-        char buf[50];
-
-        if (null_uuid(key)) {
-                PRINTK("Refusing to find a NULL key.\n");
-                return 0;
-        }
-
-        if (!bdev->bd_disk) {
-                bdevname(bdev, buf);
-                PRINTK("bdev %s has no bd_disk.\n", buf);
-                return 0;
-        }
-
-        if (!bdev->bd_disk->queue) {
-                bdevname(bdev, buf);
-                PRINTK("bdev %s has no queue.\n", buf);
-                return 0;
-        }
-
-        for (i = 0; uuid_list[i].name; i++) {
-                struct uuid_info *dat = &uuid_list[i];
-
-                if (!dat->key || strcmp(dat->key, key))
-                        continue;
-
-                dev_offset = (dat->bkoff << 10) + dat->sboff;
-                pg_num = dev_offset >> 12;
-                pg_off = dev_offset & 0xfff;
-
-                if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1)
-                        continue;
-
-                if (pg_num != last_pg_num) {
-                        if (data_page) {
-                                kunmap(data_page);
-                                __free_page(data_page);
-                        }
-                        data_page = read_bdev_page(bdev, pg_num);
-                        if (!data_page)
-                                continue;
-                        data = kmap(data_page);
-                }
-
-                last_pg_num = pg_num;
-
-                if (strncmp(&data[pg_off], dat->magic, dat->sig_len))
-                        continue;
-
-                result = 1;
-                break;
-        }
-
-        if (data_page) {
-                kunmap(data_page);
-                __free_page(data_page);
-        }
-
-        return result;
-}
-
-/*
- * part_matches_fs_info - Does the given partition match the details given?
- *
- * Returns a score saying how good the match is.
- * 0 = no UUID match.
- * 1 = UUID but last mount time differs.
- * 2 = UUID, last mount time but not dev_t
- * 3 = perfect match
- *
- * This lets us cope elegantly with probing resulting in dev_ts changing
- * from boot to boot, and with the case where a user copies a partition
- * (UUID is non-unique), and we need to check the last mount time of the
- * correct partition.
- */
-int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek)
-{
-        struct block_device *bdev;
-        struct fs_info *got;
-        int result = 0;
-        char buf[50];
-
-        if (null_uuid((char *) &seek->uuid)) {
-                PRINTK("Refusing to find a NULL uuid.\n");
-                return 0;
-        }
-
-        bdev = bdget(part_devt(part));
-
-        PRINTK("part_matches fs info considering %x.\n", part_devt(part));
-
-        if (blkdev_get(bdev, FMODE_READ, 0)) {
-                PRINTK("blkdev_get failed.\n");
-                return 0;
-        }
-
-        if (!bdev->bd_disk) {
-                bdevname(bdev, buf);
-                PRINTK("bdev %s has no bd_disk.\n", buf);
-                goto out;
-        }
-
-        if (!bdev->bd_disk->queue) {
-                bdevname(bdev, buf);
-                PRINTK("bdev %s has no queue.\n", buf);
-                goto out;
-        }
-
-        got = fs_info_from_block_dev(bdev);
-
-        if (got && !memcmp(got->uuid, seek->uuid, 16)) {
-                PRINTK(" Have matching UUID.\n");
-                PRINTK(" Got: LMS %d, LM %p.\n", got->last_mount_size, got->last_mount);
-                PRINTK(" Seek: LMS %d, LM %p.\n", seek->last_mount_size, seek->last_mount);
-                result = 1;
-
-                if (got->last_mount_size == seek->last_mount_size &&
-                    got->last_mount && seek->last_mount &&
-                    !memcmp(got->last_mount, seek->last_mount,
-                            got->last_mount_size)) {
-                        result = 2;
-
-                        PRINTK(" Matching last mount time.\n");
-
-                        if (part_devt(part) == seek->dev_t) {
-                                result = 3;
-                                PRINTK(" Matching dev_t.\n");
-                        } else
-                                PRINTK("Dev_ts differ (%x vs %x).\n", part_devt(part), seek->dev_t);
-                }
-        }
-
-        PRINTK(" Score for %x is %d.\n", part_devt(part), result);
-        free_fs_info(got);
-out:
-        blkdev_put(bdev, FMODE_READ);
-        return result;
-}
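The 0-3 ladder documented above is the whole contract: every further matching detail raises the score by one, so a caller can probe all candidate partitions and keep the highest scorer. A plain-C model of the ladder (the struct and field names here are illustrative stand-ins, not the kernel's fs_info):

#include <stdio.h>
#include <string.h>

struct id {
        unsigned char uuid[16];
        char last_mount[8];
        unsigned dev;
};

static int score(const struct id *got, const struct id *seek)
{
        if (memcmp(got->uuid, seek->uuid, 16))
                return 0;       /* no UUID match */
        if (memcmp(got->last_mount, seek->last_mount, 8))
                return 1;       /* UUID only: could be a copied partition */
        if (got->dev != seek->dev)
                return 2;       /* UUID + mount time, but dev_t moved */
        return 3;               /* perfect match */
}

int main(void)
{
        struct id seek = { { 1 }, "stamp", 0x801 };
        struct id copy = { { 1 }, "other", 0x811 };     /* cloned UUID */

        printf("copy scores %d, original scores %d\n",
               score(&copy, &seek), score(&seek, &seek));
        return 0;
}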
-
-void free_fs_info(struct fs_info *fs_info)
-{
-        if (!fs_info || IS_ERR(fs_info))
-                return;
-
-        if (fs_info->last_mount)
-                kfree(fs_info->last_mount);
-
-        kfree(fs_info);
-}
-
-struct fs_info *fs_info_from_block_dev(struct block_device *bdev)
-{
-        unsigned char *data = NULL;
-        struct page *data_page = NULL;
-
-        int dev_offset, pg_num, pg_off;
-        int uuid_pg_num, uuid_pg_off, i;
-        unsigned char *uuid_data = NULL;
-        struct page *uuid_data_page = NULL;
-
-        int last_pg_num = -1, last_uuid_pg_num = 0;
-        char buf[50];
-        struct fs_info *fs_info = NULL;
-
-        bdevname(bdev, buf);
-
-        PRINTK("uuid_from_block_dev looking for partition type of %s.\n", buf);
-
-        for (i = 0; uuid_list[i].name; i++) {
-                struct uuid_info *dat = &uuid_list[i];
-                dev_offset = (dat->bkoff << 10) + dat->sboff;
-                pg_num = dev_offset >> 12;
-                pg_off = dev_offset & 0xfff;
-                uuid_pg_num = dat->uuid_offset >> 12;
-                uuid_pg_off = dat->uuid_offset & 0xfff;
-
-                if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1)
-                        continue;
-
-                /* Ignore partition types with no UUID offset */
-                if (!dat->uuid_offset)
-                        continue;
-
-                if (pg_num != last_pg_num) {
-                        if (data_page) {
-                                kunmap(data_page);
-                                __free_page(data_page);
-                        }
-                        data_page = read_bdev_page(bdev, pg_num);
-                        if (!data_page)
-                                continue;
-                        data = kmap(data_page);
-                }
-
-                last_pg_num = pg_num;
-
-                if (strncmp(&data[pg_off], dat->magic, dat->sig_len))
-                        continue;
-
-                PRINTK("This partition looks like %s.\n", dat->name);
-
-                fs_info = kzalloc(sizeof(struct fs_info), GFP_KERNEL);
-
-                if (!fs_info) {
-                        PRINTK("Failed to allocate fs_info struct.");
-                        fs_info = ERR_PTR(-ENOMEM);
-                        break;
-                }
-
-                /* UUID can't be off the end of the disk */
-                if ((uuid_pg_num > bdev->bd_part->nr_sects >> 3) ||
-                                !dat->uuid_offset)
-                        goto no_uuid;
-
-                if (!uuid_data || uuid_pg_num != last_uuid_pg_num) {
-                        /* No need to reread the page from above */
-                        if (uuid_pg_num == pg_num && uuid_data)
-                                memcpy(uuid_data, data, PAGE_SIZE);
-                        else {
-                                if (uuid_data_page) {
-                                        kunmap(uuid_data_page);
-                                        __free_page(uuid_data_page);
-                                }
-                                uuid_data_page = read_bdev_page(bdev, uuid_pg_num);
-                                if (!uuid_data_page)
-                                        continue;
-                                uuid_data = kmap(uuid_data_page);
-                        }
-                }
-
-                last_uuid_pg_num = uuid_pg_num;
-                memcpy(&fs_info->uuid, &uuid_data[uuid_pg_off], 16);
-                fs_info->dev_t = bdev->bd_dev;
-
-no_uuid:
-                PRINT_HEX_DUMP(KERN_EMERG, "fs_info_from_block_dev "
-                                "returning uuid ", DUMP_PREFIX_NONE, 16, 1,
-                                fs_info->uuid, 16, 0);
-
-                if (dat->last_mount_size) {
-                        int pg = dat->last_mount_offset >> 12, sz;
-                        int off = dat->last_mount_offset & 0xfff;
-                        struct page *last_mount = read_bdev_page(bdev, pg);
-                        unsigned char *last_mount_data;
-                        char *ptr;
-
-                        if (!last_mount) {
-                                fs_info = ERR_PTR(-ENOMEM);
-                                break;
-                        }
-                        last_mount_data = kmap(last_mount);
-                        sz = dat->last_mount_size;
-                        ptr = kmalloc(sz, GFP_KERNEL);
-
-                        if (!ptr) {
-                                printk(KERN_EMERG "fs_info_from_block_dev "
-                                        "failed to get memory for last mount "
-                                        "timestamp.");
-                                free_fs_info(fs_info);
-                                fs_info = ERR_PTR(-ENOMEM);
-                        } else {
-                                fs_info->last_mount = ptr;
-                                fs_info->last_mount_size = sz;
-                                memcpy(ptr, &last_mount_data[off], sz);
-                        }
-
-                        kunmap(last_mount);
-                        __free_page(last_mount);
-                }
-                break;
-        }
-
-        if (data_page) {
-                kunmap(data_page);
-                __free_page(data_page);
-        }
-
-        if (uuid_data_page) {
-                kunmap(uuid_data_page);
-                __free_page(uuid_data_page);
-        }
-
-        return fs_info;
-}
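fs_info_from_block_dev() reads up to two pages per table entry, one for the signature and one for the UUID, and skips the second device read when both land in the same page; it also caches the last signature page across entries, which pays off because the table is sorted by signature page. The caching idea in isolation, as a small standalone sketch (fetch() stands in for read_bdev_page(); the values are illustrative):

#include <stdio.h>

/* Remember the most recently fetched page, so consecutive lookups
 * that hit the same page do not trigger a second device read. */
static int last_pg = -1;
static int fetches;

static void fetch(int pg)
{
        if (pg != last_pg) {
                fetches++;      /* stands in for read_bdev_page() */
                last_pg = pg;
        }
}

int main(void)
{
        int offsets[] = { 0x438, 0x468, 0x2000 };       /* magic, UUID, elsewhere */
        int i;

        for (i = 0; i < 3; i++)
                fetch(offsets[i] >> 12);
        printf("%d device reads for 3 lookups\n", fetches);     /* prints 2 */
        return 0;
}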
-
-static int __init uuid_debug_setup(char *str)
-{
-        int value;
-
-        if (sscanf(str, "=%d", &value))
-                debug_enabled = value;
-
-        return 1;
-}
-
-__setup("uuid_debug", uuid_debug_setup);
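For reference, all of the PRINTK/PRINT_HEX_DUMP output removed above defaulted to off and was gated on this boot parameter; it was enabled by appending something like the following to the kernel command line (illustrative fragment):

        root=/dev/sda1 uuid_debug=1

With the file removed, the uuid_debug= parameter disappears along with it.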