summary refs log tree commit diff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r-- block/Makefile 2
-rw-r--r-- block/bfq.h 2
-rw-r--r-- block/bio-integrity.c 4
-rw-r--r-- block/bio.c 114
-rw-r--r-- block/blk-cgroup.c 279
-rw-r--r-- block/blk-cgroup.h 603
-rw-r--r-- block/blk-core.c 119
-rw-r--r-- block/blk-exec.c 10
-rw-r--r-- block/blk-integrity.c 1
-rw-r--r-- block/blk-merge.c 3
-rw-r--r-- block/blk-mq-cpumap.c 2
-rw-r--r-- block/blk-mq-tag.c 38
-rw-r--r-- block/blk-mq-tag.h 1
-rw-r--r-- block/blk-mq.c 160
-rw-r--r-- block/blk-settings.c 4
-rw-r--r-- block/blk-sysfs.c 3
-rw-r--r-- block/blk-throttle.c 2
-rw-r--r-- block/blk.h 5
-rw-r--r-- block/bounce.c 4
-rw-r--r-- block/cfq-iosched.c 127
-rw-r--r-- block/elevator.c 4
-rw-r--r-- block/genhd.c 82
-rw-r--r-- block/ioctl.c 37
-rw-r--r-- block/scsi_ioctl.c 4
-rw-r--r-- block/uuid.c 509
25 files changed, 627 insertions, 1492 deletions
diff --git a/block/Makefile b/block/Makefile
index 086be5007..1ed86d58f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
- uuid.o genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
+ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
partitions/
obj-$(CONFIG_BOUNCE) += bounce.o
diff --git a/block/bfq.h b/block/bfq.h
index 0ea164d41..96ffbf773 100644
--- a/block/bfq.h
+++ b/block/bfq.h
@@ -1,5 +1,5 @@
/*
- * BFQ-v7r8 for 4.0.0: data structures and common functions prototypes.
+ * BFQ-v7r8 for 4.1.0: data structures and common functions prototypes.
*
* Based on ideas and code from CFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 39ce74d10..719b7152a 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
/* Restore original bio completion handler */
bio->bi_end_io = bip->bip_end_io;
- bio_endio_nodec(bio, error);
+ bio_endio(bio, error);
}
/**
@@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error)
*/
if (error) {
bio->bi_end_io = bip->bip_end_io;
- bio_endio_nodec(bio, error);
+ bio_endio(bio, error);
return;
}
diff --git a/block/bio.c b/block/bio.c
index 4441522ca..d6e5ba339 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -270,8 +270,8 @@ void bio_init(struct bio *bio)
{
memset(bio, 0, sizeof(*bio));
bio->bi_flags = 1 << BIO_UPTODATE;
- atomic_set(&bio->bi_remaining, 1);
- atomic_set(&bio->bi_cnt, 1);
+ atomic_set(&bio->__bi_remaining, 1);
+ atomic_set(&bio->__bi_cnt, 1);
}
EXPORT_SYMBOL(bio_init);
@@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
__bio_free(bio);
memset(bio, 0, BIO_RESET_BYTES);
- bio->bi_flags = flags|(1 << BIO_UPTODATE);
- atomic_set(&bio->bi_remaining, 1);
+ bio->bi_flags = flags | (1 << BIO_UPTODATE);
+ atomic_set(&bio->__bi_remaining, 1);
}
EXPORT_SYMBOL(bio_reset);
@@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error)
bio_put(bio);
}
+/*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+ bio->bi_flags |= (1 << BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
+
/**
* bio_chain - chain bio completions
* @bio: the target bio
@@ -320,7 +331,7 @@ void bio_chain(struct bio *bio, struct bio *parent)
bio->bi_private = parent;
bio->bi_end_io = bio_chain_endio;
- atomic_inc(&parent->bi_remaining);
+ bio_inc_remaining(parent);
}
EXPORT_SYMBOL(bio_chain);
@@ -524,13 +535,17 @@ EXPORT_SYMBOL(zero_fill_bio);
**/
void bio_put(struct bio *bio)
{
- BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
-
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->bi_cnt))
+ if (!bio_flagged(bio, BIO_REFFED))
bio_free(bio);
+ else {
+ BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+
+ /*
+ * last put frees it
+ */
+ if (atomic_dec_and_test(&bio->__bi_cnt))
+ bio_free(bio);
+ }
}
EXPORT_SYMBOL(bio_put);
@@ -1741,6 +1756,25 @@ void bio_flush_dcache_pages(struct bio *bi)
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif
+static inline bool bio_remaining_done(struct bio *bio)
+{
+ /*
+ * If we're not chaining, then ->__bi_remaining is always 1 and
+ * we always end io on the first invocation.
+ */
+ if (!bio_flagged(bio, BIO_CHAIN))
+ return true;
+
+ BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
+
+ if (atomic_dec_and_test(&bio->__bi_remaining)) {
+ clear_bit(BIO_CHAIN, &bio->bi_flags);
+ return true;
+ }
+
+ return false;
+}
+
/**
* bio_endio - end I/O on a bio
* @bio: bio
@@ -1758,15 +1792,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
void bio_endio(struct bio *bio, int error)
{
while (bio) {
- BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
-
if (error)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
error = -EIO;
- if (!atomic_dec_and_test(&bio->bi_remaining))
- return;
+ if (unlikely(!bio_remaining_done(bio)))
+ break;
/*
* Need to have a real endio function for chained bios,
@@ -1790,21 +1822,6 @@ void bio_endio(struct bio *bio, int error)
EXPORT_SYMBOL(bio_endio);
/**
- * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
- * @bio: bio
- * @error: error, if any
- *
- * For code that has saved and restored bi_end_io; thing hard before using this
- * function, probably you should've cloned the entire bio.
- **/
-void bio_endio_nodec(struct bio *bio, int error)
-{
- atomic_inc(&bio->bi_remaining);
- bio_endio(bio, error);
-}
-EXPORT_SYMBOL(bio_endio_nodec);
-
-/**
* bio_split - split a bio
* @bio: bio to split
* @sectors: number of sectors to split from the front of @bio
@@ -1980,6 +1997,29 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_
EXPORT_SYMBOL(bioset_create_nobvec);
#ifdef CONFIG_BLK_CGROUP
+
+/**
+ * bio_associate_blkcg - associate a bio with the specified blkcg
+ * @bio: target bio
+ * @blkcg_css: css of the blkcg to associate
+ *
+ * Associate @bio with the blkcg specified by @blkcg_css. Block layer will
+ * treat @bio as if it were issued by a task which belongs to the blkcg.
+ *
+ * This function takes an extra reference of @blkcg_css which will be put
+ * when @bio is released. The caller must own @bio and is responsible for
+ * synchronizing calls to this function.
+ */
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
+{
+ if (unlikely(bio->bi_css))
+ return -EBUSY;
+ css_get(blkcg_css);
+ bio->bi_css = blkcg_css;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkcg);
+
/**
* bio_associate_current - associate a bio with %current
* @bio: target bio
@@ -1996,28 +2036,20 @@ EXPORT_SYMBOL(bioset_create_nobvec);
int bio_associate_current(struct bio *bio)
{
struct io_context *ioc;
- struct cgroup_subsys_state *css;
- if (bio->bi_ioc)
+ if (bio->bi_css)
return -EBUSY;
ioc = current->io_context;
if (!ioc)
return -ENOENT;
- /* acquire active ref on @ioc and associate */
get_io_context_active(ioc);
bio->bi_ioc = ioc;
-
- /* associate blkcg if exists */
- rcu_read_lock();
- css = task_css(current, blkio_cgrp_id);
- if (css && css_tryget_online(css))
- bio->bi_css = css;
- rcu_read_unlock();
-
+ bio->bi_css = task_get_css(current, blkio_cgrp_id);
return 0;
}
+EXPORT_SYMBOL_GPL(bio_associate_current);
/**
* bio_disassociate_task - undo bio_associate_current()
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6817e2896..d6283b3f5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -9,29 +9,45 @@
*
* Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
* Nauman Rafique <nauman@google.com>
+ *
+ * For policy-specific per-blkcg data:
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ * Arianna Avanzini <avanzini.arianna@gmail.com>
*/
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
#include "blk.h"
#define MAX_KEY_LEN 100
+/*
+ * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
+ * blkcg_pol_register_mutex nests outside of it and synchronizes entire
+ * policy [un]register operations including cgroup file additions /
+ * removals. Putting cgroup file registration outside blkcg_pol_mutex
+ * allows grabbing it from cgroup callbacks.
+ */
+static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);
-struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
- .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
+struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);
+struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
+static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
+
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
{
@@ -179,6 +195,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_gq *new_blkg)
{
struct blkcg_gq *blkg;
+ struct bdi_writeback_congested *wb_congested;
int i, ret;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -190,22 +207,30 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
goto err_free_blkg;
}
+ wb_congested = wb_congested_get_create(&q->backing_dev_info,
+ blkcg->css.id, GFP_ATOMIC);
+ if (!wb_congested) {
+ ret = -ENOMEM;
+ goto err_put_css;
+ }
+
/* allocate */
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto err_put_css;
+ goto err_put_congested;
}
}
blkg = new_blkg;
+ blkg->wb_congested = wb_congested;
/* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
ret = -EINVAL;
- goto err_put_css;
+ goto err_put_congested;
}
blkg_get(blkg->parent);
}
@@ -235,18 +260,15 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg->online = true;
spin_unlock(&blkcg->lock);
- if (!ret) {
- if (blkcg == &blkcg_root) {
- q->root_blkg = blkg;
- q->root_rl.blkg = blkg;
- }
+ if (!ret)
return blkg;
- }
/* @blkg failed fully initialized, use the usual release path */
blkg_put(blkg);
return ERR_PTR(ret);
+err_put_congested:
+ wb_congested_put(wb_congested);
err_put_css:
css_put(&blkcg->css);
err_free_blkg:
@@ -340,15 +362,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
rcu_assign_pointer(blkcg->blkg_hint, NULL);
/*
- * If root blkg is destroyed. Just clear the pointer since root_rl
- * does not take reference on root blkg.
- */
- if (blkcg == &blkcg_root) {
- blkg->q->root_blkg = NULL;
- blkg->q->root_rl.blkg = NULL;
- }
-
- /*
* Put the reference taken at the time of creation so that when all
* queues are gone, group can be destroyed.
*/
@@ -402,6 +415,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)
if (blkg->parent)
blkg_put(blkg->parent);
+ wb_congested_put(blkg->wb_congested);
+
blkg_free(blkg);
}
EXPORT_SYMBOL_GPL(__blkg_release_rcu);
@@ -448,20 +463,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
struct blkcg_gq *blkg;
int i;
- /*
- * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex
- * which ends up putting cgroup's internal cgroup_tree_mutex under
- * it; however, cgroup_tree_mutex is nested above cgroup file
- * active protection and grabbing blkcg_pol_mutex from a cgroup
- * file operation creates a possible circular dependency. cgroup
- * internal locking is planned to go through further simplification
- * and this issue should go away soon. For now, let's trylock
- * blkcg_pol_mutex and restart the write on failure.
- *
- * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com
- */
- if (!mutex_trylock(&blkcg_pol_mutex))
- return restart_syscall();
+ mutex_lock(&blkcg_pol_mutex);
spin_lock_irq(&blkcg->lock);
/*
@@ -813,20 +815,35 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock_irq(&blkcg->lock);
+
+ wb_blkcg_offline(blkcg);
}
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
struct blkcg *blkcg = css_to_blkcg(css);
- if (blkcg != &blkcg_root)
+ mutex_lock(&blkcg_pol_mutex);
+ list_del(&blkcg->all_blkcgs_node);
+ mutex_unlock(&blkcg_pol_mutex);
+
+ if (blkcg != &blkcg_root) {
+ int i;
+
+ for (i = 0; i < BLKCG_MAX_POLS; i++)
+ kfree(blkcg->pd[i]);
kfree(blkcg);
+ }
}
static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct blkcg *blkcg;
+ struct cgroup_subsys_state *ret;
+ int i;
+
+ mutex_lock(&blkcg_pol_mutex);
if (!parent_css) {
blkcg = &blkcg_root;
@@ -834,17 +851,54 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
}
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
- if (!blkcg)
- return ERR_PTR(-ENOMEM);
+ if (!blkcg) {
+ ret = ERR_PTR(-ENOMEM);
+ goto free_blkcg;
+ }
+
+ for (i = 0; i < BLKCG_MAX_POLS ; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+ struct blkcg_policy_data *cpd;
+
+ /*
+ * If the policy hasn't been registered yet, skip it; its
+ * per-cgroup data is allocated for every existing blkcg when
+ * the policy registers. Otherwise, if the policy requires
+ * per-cgroup data, allocate and initialize it here.
+ */
+ if (!pol || !pol->cpd_size)
+ continue;
+
+ BUG_ON(blkcg->pd[i]);
+ cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+ if (!cpd) {
+ ret = ERR_PTR(-ENOMEM);
+ goto free_pd_blkcg;
+ }
+ blkcg->pd[i] = cpd;
+ cpd->plid = i;
+ pol->cpd_init_fn(blkcg);
+ }
- blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
- blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
done:
spin_lock_init(&blkcg->lock);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&blkcg->blkg_list);
+#ifdef CONFIG_CGROUP_WRITEBACK
+ INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
+ list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
+ mutex_unlock(&blkcg_pol_mutex);
return &blkcg->css;
+
+free_pd_blkcg:
+ for (i--; i >= 0; i--)
+ kfree(blkcg->pd[i]);
+free_blkcg:
+ kfree(blkcg);
+ mutex_unlock(&blkcg_pol_mutex);
+ return ret;
}
/**
@@ -859,9 +913,45 @@ done:
*/
int blkcg_init_queue(struct request_queue *q)
{
- might_sleep();
+ struct blkcg_gq *new_blkg, *blkg;
+ bool preloaded;
+ int ret;
+
+ new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+ if (!new_blkg)
+ return -ENOMEM;
+
+ preloaded = !radix_tree_preload(GFP_KERNEL);
+
+ /*
+ * Make sure the root blkg exists. As @q is bypassing at this point,
+ * blkg_lookup_create() can't be used, so open code the insertion by
+ * calling blkg_create() directly.
+ */
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_create(&blkcg_root, q, new_blkg);
+ spin_unlock_irq(q->queue_lock);
+ rcu_read_unlock();
+
+ if (preloaded)
+ radix_tree_preload_end();
- return blk_throtl_init(q);
+ if (IS_ERR(blkg)) {
+ kfree(new_blkg);
+ return PTR_ERR(blkg);
+ }
+
+ q->root_blkg = blkg;
+ q->root_rl.blkg = blkg;
+
+ ret = blk_throtl_init(q);
+ if (ret) {
+ spin_lock_irq(q->queue_lock);
+ blkg_destroy_all(q);
+ spin_unlock_irq(q->queue_lock);
+ }
+ return ret;
}
/**
@@ -962,52 +1052,21 @@ int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol)
{
LIST_HEAD(pds);
- struct blkcg_gq *blkg, *new_blkg;
- struct blkg_policy_data *pd, *n;
+ struct blkcg_gq *blkg;
+ struct blkg_policy_data *pd, *nd;
int cnt = 0, ret;
- bool preloaded;
if (blkcg_policy_enabled(q, pol))
return 0;
- /* preallocations for root blkg */
- new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
- if (!new_blkg)
- return -ENOMEM;
-
+ /* count and allocate policy_data for all existing blkgs */
blk_queue_bypass_start(q);
-
- preloaded = !radix_tree_preload(GFP_KERNEL);
-
- /*
- * Make sure the root blkg exists and count the existing blkgs. As
- * @q is bypassing at this point, blkg_lookup_create() can't be
- * used. Open code it.
- */
spin_lock_irq(q->queue_lock);
-
- rcu_read_lock();
- blkg = __blkg_lookup(&blkcg_root, q, false);
- if (blkg)
- blkg_free(new_blkg);
- else
- blkg = blkg_create(&blkcg_root, q, new_blkg);
- rcu_read_unlock();
-
- if (preloaded)
- radix_tree_preload_end();
-
- if (IS_ERR(blkg)) {
- ret = PTR_ERR(blkg);
- goto out_unlock;
- }
-
list_for_each_entry(blkg, &q->blkg_list, q_node)
cnt++;
-
spin_unlock_irq(q->queue_lock);
- /* allocate policy_data for all existing blkgs */
+ /* allocate per-blkg policy data for all existing blkgs */
while (cnt--) {
pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
if (!pd) {
@@ -1018,7 +1077,7 @@ int blkcg_activate_policy(struct request_queue *q,
}
/*
- * Install the allocated pds. With @q bypassing, no new blkg
+ * Install the allocated pds and cpds. With @q bypassing, no new blkg
* should have been created while the queue lock was dropped.
*/
spin_lock_irq(q->queue_lock);
@@ -1049,7 +1108,7 @@ out_unlock:
spin_unlock_irq(q->queue_lock);
out_free:
blk_queue_bypass_end(q);
- list_for_each_entry_safe(pd, n, &pds, alloc_node)
+ list_for_each_entry_safe(pd, nd, &pds, alloc_node)
kfree(pd);
return ret;
}
@@ -1076,10 +1135,6 @@ void blkcg_deactivate_policy(struct request_queue *q,
__clear_bit(pol->plid, q->blkcg_pols);
- /* if no policy is left, no need for blkgs - shoot them down */
- if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
- blkg_destroy_all(q);
-
list_for_each_entry(blkg, &q->blkg_list, q_node) {
/* grab blkcg lock too while removing @pd from @blkg */
spin_lock(&blkg->blkcg->lock);
@@ -1109,11 +1164,13 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
*/
int blkcg_policy_register(struct blkcg_policy *pol)
{
+ struct blkcg *blkcg;
int i, ret;
if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
return -EINVAL;
+ mutex_lock(&blkcg_pol_register_mutex);
mutex_lock(&blkcg_pol_mutex);
/* find an empty slot */
@@ -1122,19 +1179,49 @@ int blkcg_policy_register(struct blkcg_policy *pol)
if (!blkcg_policy[i])
break;
if (i >= BLKCG_MAX_POLS)
- goto out_unlock;
+ goto err_unlock;
- /* register and update blkgs */
+ /* register @pol */
pol->plid = i;
- blkcg_policy[i] = pol;
+ blkcg_policy[pol->plid] = pol;
+
+ /* allocate and install cpd's */
+ if (pol->cpd_size) {
+ list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+ struct blkcg_policy_data *cpd;
+
+ cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+ if (!cpd) {
+ mutex_unlock(&blkcg_pol_mutex);
+ goto err_free_cpds;
+ }
+
+ blkcg->pd[pol->plid] = cpd;
+ cpd->plid = pol->plid;
+ pol->cpd_init_fn(blkcg);
+ }
+ }
+
+ mutex_unlock(&blkcg_pol_mutex);
/* everything is in place, add intf files for the new policy */
if (pol->cftypes)
WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
pol->cftypes));
- ret = 0;
-out_unlock:
+ mutex_unlock(&blkcg_pol_register_mutex);
+ return 0;
+
+err_free_cpds:
+ if (pol->cpd_size) {
+ list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+ kfree(blkcg->pd[pol->plid]);
+ blkcg->pd[pol->plid] = NULL;
+ }
+ }
+ blkcg_policy[pol->plid] = NULL;
+err_unlock:
mutex_unlock(&blkcg_pol_mutex);
+ mutex_unlock(&blkcg_pol_register_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);
@@ -1147,7 +1234,9 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
*/
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
- mutex_lock(&blkcg_pol_mutex);
+ struct blkcg *blkcg;
+
+ mutex_lock(&blkcg_pol_register_mutex);
if (WARN_ON(blkcg_policy[pol->plid] != pol))
goto out_unlock;
@@ -1156,9 +1245,19 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
if (pol->cftypes)
cgroup_rm_cftypes(pol->cftypes);
- /* unregister and update blkgs */
+ /* remove cpds and unregister */
+ mutex_lock(&blkcg_pol_mutex);
+
+ if (pol->cpd_size) {
+ list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+ kfree(blkcg->pd[pol->plid]);
+ blkcg->pd[pol->plid] = NULL;
+ }
+ }
blkcg_policy[pol->plid] = NULL;
-out_unlock:
+
mutex_unlock(&blkcg_pol_mutex);
+out_unlock:
+ mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
deleted file mode 100644
index c567865b5..000000000
--- a/block/blk-cgroup.h
+++ /dev/null
@@ -1,603 +0,0 @@
-#ifndef _BLK_CGROUP_H
-#define _BLK_CGROUP_H
-/*
- * Common Block IO controller cgroup interface
- *
- * Based on ideas and code from CFQ, CFS and BFQ:
- * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
- *
- * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
- * Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
- * Nauman Rafique <nauman@google.com>
- */
-
-#include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
-#include <linux/seq_file.h>
-#include <linux/radix-tree.h>
-#include <linux/blkdev.h>
-#include <linux/atomic.h>
-
-/* Max limits for throttle policy */
-#define THROTL_IOPS_MAX UINT_MAX
-
-/* CFQ specific, out here for blkcg->cfq_weight */
-#define CFQ_WEIGHT_MIN 10
-#define CFQ_WEIGHT_MAX 1000
-#define CFQ_WEIGHT_DEFAULT 500
-
-#ifdef CONFIG_BLK_CGROUP
-
-enum blkg_rwstat_type {
- BLKG_RWSTAT_READ,
- BLKG_RWSTAT_WRITE,
- BLKG_RWSTAT_SYNC,
- BLKG_RWSTAT_ASYNC,
-
- BLKG_RWSTAT_NR,
- BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
-};
-
-struct blkcg_gq;
-
-struct blkcg {
- struct cgroup_subsys_state css;
- spinlock_t lock;
-
- struct radix_tree_root blkg_tree;
- struct blkcg_gq *blkg_hint;
- struct hlist_head blkg_list;
-
- /* TODO: per-policy storage in blkcg */
- unsigned int cfq_weight; /* belongs to cfq */
- unsigned int cfq_leaf_weight;
-};
-
-struct blkg_stat {
- struct u64_stats_sync syncp;
- uint64_t cnt;
-};
-
-struct blkg_rwstat {
- struct u64_stats_sync syncp;
- uint64_t cnt[BLKG_RWSTAT_NR];
-};
-
-/*
- * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
- * request_queue (q). This is used by blkcg policies which need to track
- * information per blkcg - q pair.
- *
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size. blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
- */
-struct blkg_policy_data {
- /* the blkg and policy id this per-policy data belongs to */
- struct blkcg_gq *blkg;
- int plid;
-
- /* used during policy activation */
- struct list_head alloc_node;
-};
-
-/* association between a blk cgroup and a request queue */
-struct blkcg_gq {
- /* Pointer to the associated request_queue */
- struct request_queue *q;
- struct list_head q_node;
- struct hlist_node blkcg_node;
- struct blkcg *blkcg;
-
- /* all non-root blkcg_gq's are guaranteed to have access to parent */
- struct blkcg_gq *parent;
-
- /* request allocation list for this blkcg-q pair */
- struct request_list rl;
-
- /* reference count */
- atomic_t refcnt;
-
- /* is this blkg online? protected by both blkcg and q locks */
- bool online;
-
- struct blkg_policy_data *pd[BLKCG_MAX_POLS];
-
- struct rcu_head rcu_head;
-};
-
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
-
-struct blkcg_policy {
- int plid;
- /* policy specific private data size */
- size_t pd_size;
- /* cgroup files for the policy */
- struct cftype *cftypes;
-
- /* operations */
- blkcg_pol_init_pd_fn *pd_init_fn;
- blkcg_pol_online_pd_fn *pd_online_fn;
- blkcg_pol_offline_pd_fn *pd_offline_fn;
- blkcg_pol_exit_pd_fn *pd_exit_fn;
- blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
-};
-
-extern struct blkcg blkcg_root;
-
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
- struct request_queue *q);
-int blkcg_init_queue(struct request_queue *q);
-void blkcg_drain_queue(struct request_queue *q);
-void blkcg_exit_queue(struct request_queue *q);
-
-/* Blkio controller policy registration */
-int blkcg_policy_register(struct blkcg_policy *pol);
-void blkcg_policy_unregister(struct blkcg_policy *pol);
-int blkcg_activate_policy(struct request_queue *q,
- const struct blkcg_policy *pol);
-void blkcg_deactivate_policy(struct request_queue *q,
- const struct blkcg_policy *pol);
-
-void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
- u64 (*prfill)(struct seq_file *,
- struct blkg_policy_data *, int),
- const struct blkcg_policy *pol, int data,
- bool show_total);
-u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
- const struct blkg_rwstat *rwstat);
-u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
- int off);
-
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
- int off);
-
-struct blkg_conf_ctx {
- struct gendisk *disk;
- struct blkcg_gq *blkg;
- u64 v;
-};
-
-int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
- const char *input, struct blkg_conf_ctx *ctx);
-void blkg_conf_finish(struct blkg_conf_ctx *ctx);
-
-
-static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
-{
- return css ? container_of(css, struct blkcg, css) : NULL;
-}
-
-static inline struct blkcg *task_blkcg(struct task_struct *tsk)
-{
- return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
-}
-
-static inline struct blkcg *bio_blkcg(struct bio *bio)
-{
- if (bio && bio->bi_css)
- return css_to_blkcg(bio->bi_css);
- return task_blkcg(current);
-}
-
-/**
- * blkcg_parent - get the parent of a blkcg
- * @blkcg: blkcg of interest
- *
- * Return the parent blkcg of @blkcg. Can be called anytime.
- */
-static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
-{
- return css_to_blkcg(blkcg->css.parent);
-}
-
-/**
- * blkg_to_pdata - get policy private data
- * @blkg: blkg of interest
- * @pol: policy of interest
- *
- * Return pointer to private data associated with the @blkg-@pol pair.
- */
-static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
- struct blkcg_policy *pol)
-{
- return blkg ? blkg->pd[pol->plid] : NULL;
-}
-
-/**
- * pdata_to_blkg - get blkg associated with policy private data
- * @pd: policy private data of interest
- *
- * @pd is policy private data. Determine the blkg it's associated with.
- */
-static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
-{
- return pd ? pd->blkg : NULL;
-}
-
-/**
- * blkg_path - format cgroup path of blkg
- * @blkg: blkg of interest
- * @buf: target buffer
- * @buflen: target buffer length
- *
- * Format the path of the cgroup of @blkg into @buf.
- */
-static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
-{
- char *p;
-
- p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
- if (!p) {
- strncpy(buf, "<unavailable>", buflen);
- return -ENAMETOOLONG;
- }
-
- memmove(buf, p, buf + buflen - p);
- return 0;
-}
-
-/**
- * blkg_get - get a blkg reference
- * @blkg: blkg to get
- *
- * The caller should be holding an existing reference.
- */
-static inline void blkg_get(struct blkcg_gq *blkg)
-{
- WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
- atomic_inc(&blkg->refcnt);
-}
-
-void __blkg_release_rcu(struct rcu_head *rcu);
-
-/**
- * blkg_put - put a blkg reference
- * @blkg: blkg to put
- */
-static inline void blkg_put(struct blkcg_gq *blkg)
-{
- WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
- if (atomic_dec_and_test(&blkg->refcnt))
- call_rcu(&blkg->rcu_head, __blkg_release_rcu);
-}
-
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
- bool update_hint);
-
-/**
- * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
- * read locked. If called under either blkcg or queue lock, the iteration
- * is guaranteed to include all and only online blkgs. The caller may
- * update @pos_css by calling css_rightmost_descendant() to skip subtree.
- * @p_blkg is included in the iteration and the first node to be visited.
- */
-#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
- css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
- if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
- (p_blkg)->q, false)))
-
-/**
- * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Similar to blkg_for_each_descendant_pre() but performs post-order
- * traversal instead. Synchronization rules are the same. @p_blkg is
- * included in the iteration and the last node to be visited.
- */
-#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
- css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
- if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
- (p_blkg)->q, false)))
-
-/**
- * blk_get_rl - get request_list to use
- * @q: request_queue of interest
- * @bio: bio which will be attached to the allocated request (may be %NULL)
- *
- * The caller wants to allocate a request from @q to use for @bio. Find
- * the request_list to use and obtain a reference on it. Should be called
- * under queue_lock. This function is guaranteed to return non-%NULL
- * request_list.
- */
-static inline struct request_list *blk_get_rl(struct request_queue *q,
- struct bio *bio)
-{
- struct blkcg *blkcg;
- struct blkcg_gq *blkg;
-
- rcu_read_lock();
-
- blkcg = bio_blkcg(bio);
-
- /* bypass blkg lookup and use @q->root_rl directly for root */
- if (blkcg == &blkcg_root)
- goto root_rl;
-
- /*
- * Try to use blkg->rl. blkg lookup may fail under memory pressure
- * or if either the blkcg or queue is going away. Fall back to
- * root_rl in such cases.
- */
- blkg = blkg_lookup_create(blkcg, q);
- if (unlikely(IS_ERR(blkg)))
- goto root_rl;
-
- blkg_get(blkg);
- rcu_read_unlock();
- return &blkg->rl;
-root_rl:
- rcu_read_unlock();
- return &q->root_rl;
-}
-
-/**
- * blk_put_rl - put request_list
- * @rl: request_list to put
- *
- * Put the reference acquired by blk_get_rl(). Should be called under
- * queue_lock.
- */
-static inline void blk_put_rl(struct request_list *rl)
-{
- /* root_rl may not have blkg set */
- if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
- blkg_put(rl->blkg);
-}
-
-/**
- * blk_rq_set_rl - associate a request with a request_list
- * @rq: request of interest
- * @rl: target request_list
- *
- * Associate @rq with @rl so that accounting and freeing can know the
- * request_list @rq came from.
- */
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
-{
- rq->rl = rl;
-}
-
-/**
- * blk_rq_rl - return the request_list a request came from
- * @rq: request of interest
- *
- * Return the request_list @rq is allocated from.
- */
-static inline struct request_list *blk_rq_rl(struct request *rq)
-{
- return rq->rl;
-}
-
-struct request_list *__blk_queue_next_rl(struct request_list *rl,
- struct request_queue *q);
-/**
- * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
- *
- * Should be used under queue_lock.
- */
-#define blk_queue_for_each_rl(rl, q) \
- for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
-
-static inline void blkg_stat_init(struct blkg_stat *stat)
-{
- u64_stats_init(&stat->syncp);
-}
-
-/**
- * blkg_stat_add - add a value to a blkg_stat
- * @stat: target blkg_stat
- * @val: value to add
- *
- * Add @val to @stat. The caller is responsible for synchronizing calls to
- * this function.
- */
-static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
-{
- u64_stats_update_begin(&stat->syncp);
- stat->cnt += val;
- u64_stats_update_end(&stat->syncp);
-}
-
-/**
- * blkg_stat_read - read the current value of a blkg_stat
- * @stat: blkg_stat to read
- *
- * Read the current value of @stat. This function can be called without
- * synchroniztion and takes care of u64 atomicity.
- */
-static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
-{
- unsigned int start;
- uint64_t v;
-
- do {
- start = u64_stats_fetch_begin_irq(&stat->syncp);
- v = stat->cnt;
- } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
- return v;
-}
-
-/**
- * blkg_stat_reset - reset a blkg_stat
- * @stat: blkg_stat to reset
- */
-static inline void blkg_stat_reset(struct blkg_stat *stat)
-{
- stat->cnt = 0;
-}
-
-/**
- * blkg_stat_merge - merge a blkg_stat into another
- * @to: the destination blkg_stat
- * @from: the source
- *
- * Add @from's count to @to.
- */
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
-{
- blkg_stat_add(to, blkg_stat_read(from));
-}
-
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
-{
- u64_stats_init(&rwstat->syncp);
-}
-
-/**
- * blkg_rwstat_add - add a value to a blkg_rwstat
- * @rwstat: target blkg_rwstat
- * @rw: mask of REQ_{WRITE|SYNC}
- * @val: value to add
- *
- * Add @val to @rwstat. The counters are chosen according to @rw. The
- * caller is responsible for synchronizing calls to this function.
- */
-static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
- int rw, uint64_t val)
-{
- u64_stats_update_begin(&rwstat->syncp);
-
- if (rw & REQ_WRITE)
- rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
- else
- rwstat->cnt[BLKG_RWSTAT_READ] += val;
- if (rw & REQ_SYNC)
- rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
- else
- rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
-
- u64_stats_update_end(&rwstat->syncp);
-}
-
-/**
- * blkg_rwstat_read - read the current values of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
- */
-static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
-{
- unsigned int start;
- struct blkg_rwstat tmp;
-
- do {
- start = u64_stats_fetch_begin_irq(&rwstat->syncp);
- tmp = *rwstat;
- } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
-
- return tmp;
-}
-
-/**
- * blkg_rwstat_total - read the total count of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Return the total count of @rwstat regardless of the IO direction. This
- * function can be called without synchronization and takes care of u64
- * atomicity.
- */
-static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
-{
- struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
-
- return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
-}
-
-/**
- * blkg_rwstat_reset - reset a blkg_rwstat
- * @rwstat: blkg_rwstat to reset
- */
-static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
-{
- memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
-}
-
-/**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
- * @to: the destination blkg_rwstat
- * @from: the source
- *
- * Add @from's counts to @to.
- */
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
- struct blkg_rwstat *from)
-{
- struct blkg_rwstat v = blkg_rwstat_read(from);
- int i;
-
- u64_stats_update_begin(&to->syncp);
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- to->cnt[i] += v.cnt[i];
- u64_stats_update_end(&to->syncp);
-}
-
-#else /* CONFIG_BLK_CGROUP */
-
-struct cgroup;
-struct blkcg;
-
-struct blkg_policy_data {
-};
-
-struct blkcg_gq {
-};
-
-struct blkcg_policy {
-};
-
-static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
-static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
-static inline void blkcg_drain_queue(struct request_queue *q) { }
-static inline void blkcg_exit_queue(struct request_queue *q) { }
-static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
-static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
-static inline int blkcg_activate_policy(struct request_queue *q,
- const struct blkcg_policy *pol) { return 0; }
-static inline void blkcg_deactivate_policy(struct request_queue *q,
- const struct blkcg_policy *pol) { }
-
-static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
-
-static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
- struct blkcg_policy *pol) { return NULL; }
-static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
-static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
-static inline void blkg_get(struct blkcg_gq *blkg) { }
-static inline void blkg_put(struct blkcg_gq *blkg) { }
-
-static inline struct request_list *blk_get_rl(struct request_queue *q,
- struct bio *bio) { return &q->root_rl; }
-static inline void blk_put_rl(struct request_list *rl) { }
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
-static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
-
-#define blk_queue_for_each_rl(rl, q) \
- for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
-
-#endif /* CONFIG_BLK_CGROUP */
-#endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 4c6e9ced9..627ed0c59 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -32,12 +32,12 @@
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
#include "blk.h"
-#include "blk-cgroup.h"
#include "blk-mq.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -48,8 +48,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
DEFINE_IDA(blk_queue_ida);
-int trap_non_toi_io;
-
/*
* For the allocated request tables
*/
@@ -65,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
+static void blk_clear_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+ clear_wb_congested(rl->blkg->wb_congested, sync);
+#else
+ /*
+ * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
+ * flip its congestion state for events on other blkcgs.
+ */
+ if (rl == &rl->q->root_rl)
+ clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
+static void blk_set_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+ set_wb_congested(rl->blkg->wb_congested, sync);
+#else
+ /* see blk_clear_congested() */
+ if (rl == &rl->q->root_rl)
+ set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
void blk_queue_congestion_threshold(struct request_queue *q)
{
int nr;
@@ -287,6 +310,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q)
q->request_fn(q);
q->request_fn_active--;
}
+EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
/**
* __blk_run_queue - run a single device queue
@@ -623,8 +647,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
- q->backing_dev_info.state = 0;
- q->backing_dev_info.capabilities = 0;
+ q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->backing_dev_info.name = "block";
q->node = node_id;
@@ -847,13 +870,8 @@ static void __freed_request(struct request_list *rl, int sync)
{
struct request_queue *q = rl->q;
- /*
- * bdi isn't aware of blkcg yet. As all async IOs end up root
- * blkcg anyway, just use root blkcg state.
- */
- if (rl == &q->root_rl &&
- rl->count[sync] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, sync);
+ if (rl->count[sync] < queue_congestion_off_threshold(q))
+ blk_clear_congested(rl, sync);
if (rl->count[sync] + 1 <= q->nr_requests) {
if (waitqueue_active(&rl->wait[sync]))
@@ -886,25 +904,25 @@ static void freed_request(struct request_list *rl, unsigned int flags)
int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct request_list *rl;
+ int on_thresh, off_thresh;
spin_lock_irq(q->queue_lock);
q->nr_requests = nr;
blk_queue_congestion_threshold(q);
+ on_thresh = queue_congestion_on_threshold(q);
+ off_thresh = queue_congestion_off_threshold(q);
- /* congestion isn't cgroup aware and follows root blkcg for now */
- rl = &q->root_rl;
-
- if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, BLK_RW_SYNC);
- else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, BLK_RW_SYNC);
+ blk_queue_for_each_rl(rl, q) {
+ if (rl->count[BLK_RW_SYNC] >= on_thresh)
+ blk_set_congested(rl, BLK_RW_SYNC);
+ else if (rl->count[BLK_RW_SYNC] < off_thresh)
+ blk_clear_congested(rl, BLK_RW_SYNC);
- if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
- blk_set_queue_congested(q, BLK_RW_ASYNC);
- else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
- blk_clear_queue_congested(q, BLK_RW_ASYNC);
+ if (rl->count[BLK_RW_ASYNC] >= on_thresh)
+ blk_set_congested(rl, BLK_RW_ASYNC);
+ else if (rl->count[BLK_RW_ASYNC] < off_thresh)
+ blk_clear_congested(rl, BLK_RW_ASYNC);
- blk_queue_for_each_rl(rl, q) {
if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
blk_set_rl_full(rl, BLK_RW_SYNC);
} else {
@@ -1014,12 +1032,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
}
}
}
- /*
- * bdi isn't aware of blkcg yet. As all async IOs end up
- * root blkcg anyway, just use root blkcg state.
- */
- if (rl == &q->root_rl)
- blk_set_queue_congested(q, is_sync);
+ blk_set_congested(rl, is_sync);
}
/*
@@ -1527,7 +1540,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int *request_count)
+ unsigned int *request_count,
+ struct request **same_queue_rq)
{
struct blk_plug *plug;
struct request *rq;
@@ -1547,8 +1561,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
list_for_each_entry_reverse(rq, plug_list, queuelist) {
int el_ret;
- if (rq->q == q)
+ if (rq->q == q) {
(*request_count)++;
+ /*
+ * Only blk-mq multiple hardware queues case checks the
+ * rq in the same queue, there should be only one such
+ * rq in a queue
+ **/
+ if (same_queue_rq)
+ *same_queue_rq = rq;
+ }
if (rq->q != q || !blk_rq_merge_ok(rq, bio))
continue;
@@ -1613,7 +1635,7 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
* any locks.
*/
if (!blk_queue_nomerges(q) &&
- blk_attempt_plug_merge(q, bio, &request_count))
+ blk_attempt_plug_merge(q, bio, &request_count, NULL))
return;
spin_lock_irq(q->queue_lock);
@@ -1720,8 +1742,6 @@ static void handle_bad_sector(struct bio *bio)
bio->bi_rw,
(unsigned long long)bio_end_sector(bio),
(long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
-
- set_bit(BIO_EOF, &bio->bi_flags);
}
#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -1968,9 +1988,6 @@ void submit_bio(int rw, struct bio *bio)
{
bio->bi_rw |= rw;
- if (unlikely(trap_non_toi_io))
- BUG_ON(!(bio->bi_flags & BIO_TOI));
-
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
@@ -3039,21 +3056,20 @@ void blk_start_plug(struct blk_plug *plug)
{
struct task_struct *tsk = current;
+ /*
+ * If this is a nested plug, don't actually assign it.
+ */
+ if (tsk->plug)
+ return;
+
INIT_LIST_HEAD(&plug->list);
INIT_LIST_HEAD(&plug->mq_list);
INIT_LIST_HEAD(&plug->cb_list);
-
/*
- * If this is a nested plug, don't actually assign it. It will be
- * flushed on its own.
+ * Store ordering should not be needed here, since a potential
+ * preempt will imply a full memory barrier
*/
- if (!tsk->plug) {
- /*
- * Store ordering should not be needed here, since a potential
- * preempt will imply a full memory barrier
- */
- tsk->plug = plug;
- }
+ tsk->plug = plug;
}
EXPORT_SYMBOL(blk_start_plug);
@@ -3200,10 +3216,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
void blk_finish_plug(struct blk_plug *plug)
{
+ if (plug != current->plug)
+ return;
blk_flush_plug_list(plug, false);
- if (plug == current->plug)
- current->plug = NULL;
+ current->plug = NULL;
}
EXPORT_SYMBOL(blk_finish_plug);
@@ -3353,7 +3370,7 @@ EXPORT_SYMBOL(blk_post_runtime_resume);
int __init blk_dev_init(void)
{
BUILD_BUG_ON(__REQ_NR_BITS > 8 *
- sizeof(((struct request *)0)->cmd_flags));
+ FIELD_SIZEOF(struct request, cmd_flags));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 9924725fa..3fec8a29d 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq_end_io_fn *done)
{
int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
- bool is_pm_resume;
WARN_ON(irqs_disabled());
WARN_ON(rq->cmd_type == REQ_TYPE_FS);
@@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
return;
}
- /*
- * need to check this before __blk_run_queue(), because rq can
- * be freed before that returns.
- */
- is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
-
spin_lock_irq(q->queue_lock);
if (unlikely(blk_queue_dying(q))) {
@@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
__elv_add_request(q, rq, where);
__blk_run_queue(q);
- /* the queue is stopped so it won't be run */
- if (is_pm_resume)
- __blk_run_queue_uncond(q);
spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 79ffb4855..f548b64be 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -21,6 +21,7 @@
*/
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
#include <linux/scatterlist.h>
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fd3fee81c..30a0d9f89 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -589,7 +589,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
!blk_write_same_mergeable(rq->bio, bio))
return false;
- if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
+ /* Only check gaps if the bio carries data */
+ if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) {
struct bio_vec *bprev;
bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 5f13f4d0b..1e28ddb65 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -24,7 +24,7 @@ static int get_first_sibling(unsigned int cpu)
{
unsigned int ret;
- ret = cpumask_first(topology_thread_cpumask(cpu));
+ ret = cpumask_first(topology_sibling_cpumask(cpu));
if (ret < nr_cpu_ids)
return ret;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index be3290cc0..9b6e28830 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
}
}
+static void bt_tags_for_each(struct blk_mq_tags *tags,
+ struct blk_mq_bitmap_tags *bt, unsigned int off,
+ busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+ struct request *rq;
+ int bit, i;
+
+ if (!tags->rqs)
+ return;
+ for (i = 0; i < bt->map_nr; i++) {
+ struct blk_align_bitmap *bm = &bt->map[i];
+
+ for (bit = find_first_bit(&bm->word, bm->depth);
+ bit < bm->depth;
+ bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+ rq = blk_mq_tag_to_rq(tags, off + bit);
+ fn(rq, data, reserved);
+ }
+
+ off += (1 << bt->bits_per_word);
+ }
+}
+
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+ void *priv)
+{
+ if (tags->nr_reserved_tags)
+ bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
+ bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+ false);
+}
+EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
+
void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
void *priv)
{
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
if (!tags)
return NULL;
+ if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
+ kfree(tags);
+ return NULL;
+ }
+
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 90767b370..75893a342 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,6 +44,7 @@ struct blk_mq_tags {
struct list_head page_list;
int alloc_policy;
+ cpumask_var_t cpumask;
};
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2dc1fd6c5..7d842db59 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
return -EBUSY;
ret = wait_event_interruptible(q->mq_freeze_wq,
- !q->mq_freeze_depth || blk_queue_dying(q));
+ !atomic_read(&q->mq_freeze_depth) ||
+ blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
void blk_mq_freeze_queue_start(struct request_queue *q)
{
- bool freeze;
+ int freeze_depth;
- spin_lock_irq(q->queue_lock);
- freeze = !q->mq_freeze_depth++;
- spin_unlock_irq(q->queue_lock);
-
- if (freeze) {
+ freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
+ if (freeze_depth == 1) {
percpu_ref_kill(&q->mq_usage_counter);
blk_mq_run_hw_queues(q, false);
}
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
void blk_mq_unfreeze_queue(struct request_queue *q)
{
- bool wake;
+ int freeze_depth;
- spin_lock_irq(q->queue_lock);
- wake = !--q->mq_freeze_depth;
- WARN_ON_ONCE(q->mq_freeze_depth < 0);
- spin_unlock_irq(q->queue_lock);
- if (wake) {
+ freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+ WARN_ON_ONCE(freeze_depth < 0);
+ if (!freeze_depth) {
percpu_ref_reinit(&q->mq_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
@@ -1237,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
return rq;
}
+static int blk_mq_direct_issue_request(struct request *rq)
+{
+ int ret;
+ struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
+ rq->mq_ctx->cpu);
+ struct blk_mq_queue_data bd = {
+ .rq = rq,
+ .list = NULL,
+ .last = 1
+ };
+
+ /*
+ * For OK queue, we are done. For error, kill it. Any other
+ * error (busy), just add it to our list as we previously
+ * would have done
+ */
+ ret = q->mq_ops->queue_rq(hctx, &bd);
+ if (ret == BLK_MQ_RQ_QUEUE_OK)
+ return 0;
+ else {
+ __blk_mq_requeue_request(rq);
+
+ if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+ rq->errors = -EIO;
+ blk_mq_end_request(rq, rq->errors);
+ return 0;
+ }
+ return -1;
+ }
+}
+
/*
* Multiple hardware queue variant. This will not use per-process plugs,
* but will attempt to bypass the hctx queueing if we can go straight to
@@ -1248,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
struct blk_map_ctx data;
struct request *rq;
+ unsigned int request_count = 0;
+ struct blk_plug *plug;
+ struct request *same_queue_rq = NULL;
blk_queue_bounce(q, &bio);
@@ -1256,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
return;
}
+ if (!is_flush_fua && !blk_queue_nomerges(q) &&
+ blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+ return;
+
rq = blk_mq_map_request(q, bio, &data);
if (unlikely(!rq))
return;
@@ -1266,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
goto run_queue;
}
+ plug = current->plug;
/*
* If the driver supports defer issued based on 'last', then
* queue it up like normal since we can potentially save some
* CPU this way.
*/
- if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
- struct blk_mq_queue_data bd = {
- .rq = rq,
- .list = NULL,
- .last = 1
- };
- int ret;
+ if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
+ !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+ struct request *old_rq = NULL;
blk_mq_bio_to_request(rq, bio);
/*
- * For OK queue, we are done. For error, kill it. Any other
- * error (busy), just add it to our list as we previously
- * would have done
+ * we do limited plugging. If bio can be merged, do merge.
+ * Otherwise the existing request in the plug list will be
+ * issued. So the plug list will have one request at most
*/
- ret = q->mq_ops->queue_rq(data.hctx, &bd);
- if (ret == BLK_MQ_RQ_QUEUE_OK)
- goto done;
- else {
- __blk_mq_requeue_request(rq);
-
- if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
- rq->errors = -EIO;
- blk_mq_end_request(rq, rq->errors);
- goto done;
+ if (plug) {
+ /*
+ * The plug list might get flushed before this. If that
+ * happens, same_queue_rq is invalid and plug list is empty
+ **/
+ if (same_queue_rq && !list_empty(&plug->mq_list)) {
+ old_rq = same_queue_rq;
+ list_del_init(&old_rq->queuelist);
}
- }
+ list_add_tail(&rq->queuelist, &plug->mq_list);
+ } else /* is_sync */
+ old_rq = rq;
+ blk_mq_put_ctx(data.ctx);
+ if (!old_rq)
+ return;
+ if (!blk_mq_direct_issue_request(old_rq))
+ return;
+ blk_mq_insert_request(old_rq, false, true, true);
+ return;
}
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1310,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
run_queue:
blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
}
-done:
blk_mq_put_ctx(data.ctx);
}
@@ -1322,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = rw_is_sync(bio->bi_rw);
const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
- unsigned int use_plug, request_count = 0;
+ struct blk_plug *plug;
+ unsigned int request_count = 0;
struct blk_map_ctx data;
struct request *rq;
- /*
- * If we have multiple hardware queues, just go directly to
- * one of those for sync IO.
- */
- use_plug = !is_flush_fua && !is_sync;
-
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1339,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
return;
}
- if (use_plug && !blk_queue_nomerges(q) &&
- blk_attempt_plug_merge(q, bio, &request_count))
+ if (!is_flush_fua && !blk_queue_nomerges(q) &&
+ blk_attempt_plug_merge(q, bio, &request_count, NULL))
return;
rq = blk_mq_map_request(q, bio, &data);
@@ -1358,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
* utilize that to temporarily store requests until the task is
* either done or scheduled away.
*/
- if (use_plug) {
- struct blk_plug *plug = current->plug;
-
- if (plug) {
- blk_mq_bio_to_request(rq, bio);
- if (list_empty(&plug->mq_list))
- trace_block_plug(q);
- else if (request_count >= BLK_MAX_REQUEST_COUNT) {
- blk_flush_plug_list(plug, false);
- trace_block_plug(q);
- }
- list_add_tail(&rq->queuelist, &plug->mq_list);
- blk_mq_put_ctx(data.ctx);
- return;
+ plug = current->plug;
+ if (plug) {
+ blk_mq_bio_to_request(rq, bio);
+ if (list_empty(&plug->mq_list))
+ trace_block_plug(q);
+ else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+ blk_flush_plug_list(plug, false);
+ trace_block_plug(q);
}
+ list_add_tail(&rq->queuelist, &plug->mq_list);
+ blk_mq_put_ctx(data.ctx);
+ return;
}
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1508,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
i++;
}
}
-
return tags;
fail:
@@ -1792,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx = q->mq_ops->map_queue(q, i);
cpumask_set_cpu(i, hctx->cpumask);
+ cpumask_set_cpu(i, hctx->tags->cpumask);
ctx->index_hw = hctx->nr_ctx;
hctx->ctxs[hctx->nr_ctx++] = ctx;
}
@@ -2056,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q)
/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
{
- WARN_ON_ONCE(!q->mq_freeze_depth);
+ WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
blk_mq_sysfs_unregister(q);
@@ -2173,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
return 0;
}
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+ return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
@@ -2234,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
int i;
for (i = 0; i < set->nr_hw_queues; i++) {
- if (set->tags[i])
+ if (set->tags[i]) {
+ free_cpumask_var(set->tags[i]->cpumask); /* must precede blk_mq_free_rq_map(), which releases the tags */
blk_mq_free_rq_map(set, set->tags[i], i);
+ }
}
kfree(set->tags);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 12600bfff..e0057d035 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -241,8 +241,8 @@ EXPORT_SYMBOL(blk_queue_bounce_limit);
* Description:
* Enables a low level driver to set a hard upper limit,
* max_hw_sectors, on the size of requests. max_hw_sectors is set by
- * the device driver based upon the combined capabilities of I/O
- * controller and storage device.
+ * the device driver based upon the capabilities of the I/O
+ * controller.
*
* max_sectors is a soft limit imposed by the block layer for
* filesystem type requests. This value can be overridden on a
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 2b8fd302f..6264b382d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -6,11 +6,12 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
#include "blk.h"
-#include "blk-cgroup.h"
#include "blk-mq.h"
struct queue_sysfs_entry {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5b9c6d5c3..b23193518 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -9,7 +9,7 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
#include "blk.h"
/* Max dispatch from a group in 1 round */
diff --git a/block/blk.h b/block/blk.h
index 43b036185..026d95941 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
struct bio *bio);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int *request_count);
+ unsigned int *request_count,
+ struct request **same_queue_rq);
void blk_account_io_start(struct request *req, bool new_io);
void blk_account_io_completion(struct request *req, unsigned int bytes);
@@ -193,8 +194,6 @@ int blk_try_merge(struct request *rq, struct bio *bio);
void blk_queue_congestion_threshold(struct request_queue *q);
-void __blk_run_queue_uncond(struct request_queue *q);
-
int blk_dev_init(void);
diff --git a/block/bounce.c b/block/bounce.c
index ed9dd8067..b17311227 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -13,6 +13,7 @@
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
@@ -128,9 +129,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
struct bio_vec *bvec, *org_vec;
int i;
- if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
- set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
-
/*
* free up bounce indirect pages used
*/
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5da8e6e9a..c62bb2e65 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,8 +14,8 @@
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
+#include <linux/blk-cgroup.h>
#include "blk.h"
-#include "blk-cgroup.h"
/*
* tunables
@@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool;
#define sample_valid(samples) ((samples) > 80)
#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
+/* blkio-related constants */
+#define CFQ_WEIGHT_MIN 10
+#define CFQ_WEIGHT_MAX 1000
+#define CFQ_WEIGHT_DEFAULT 500
+
struct cfq_ttime {
unsigned long last_end_request;
@@ -212,6 +217,15 @@ struct cfqg_stats {
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
};
+/* Per-cgroup data */
+struct cfq_group_data {
+ /* must be the first member */
+ struct blkcg_policy_data pd;
+
+ unsigned int weight;
+ unsigned int leaf_weight;
+};
+
/* This is per cgroup per device grouping structure */
struct cfq_group {
/* must be the first member */
@@ -446,16 +460,6 @@ CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
-static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
-{
- return pd ? container_of(pd, struct cfq_group, pd) : NULL;
-}
-
-static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
-{
- return pd_to_blkg(&cfqg->pd);
-}
-
#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
/* cfqg stats flags */
@@ -600,6 +604,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
#ifdef CONFIG_CFQ_GROUP_IOSCHED
+static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct cfq_group, pd) : NULL;
+}
+
+static struct cfq_group_data
+*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
+{
+ return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+}
+
+static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
+{
+ return pd_to_blkg(&cfqg->pd);
+}
+
static struct blkcg_policy blkcg_policy_cfq;
static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
@@ -607,6 +627,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
}
+static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
+{
+ return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
+}
+
static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
{
struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
@@ -1544,13 +1569,28 @@ static void cfqg_stats_init(struct cfqg_stats *stats)
#endif
}
+static void cfq_cpd_init(const struct blkcg *blkcg)
+{
+ struct cfq_group_data *cgd =
+ cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
+
+ if (blkcg == &blkcg_root) {
+ cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
+ cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
+ } else {
+ cgd->weight = CFQ_WEIGHT_DEFAULT;
+ cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
+ }
+}
+
static void cfq_pd_init(struct blkcg_gq *blkg)
{
struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+ struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
cfq_init_cfqg_base(cfqg);
- cfqg->weight = blkg->blkcg->cfq_weight;
- cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
+ cfqg->weight = cgd->weight;
+ cfqg->leaf_weight = cgd->leaf_weight;
cfqg_stats_init(&cfqg->stats);
cfqg_stats_init(&cfqg->dead_stats);
}
@@ -1673,13 +1713,27 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
static int cfq_print_weight(struct seq_file *sf, void *v)
{
- seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+ unsigned int val = 0;
+
+ if (cgd)
+ val = cgd->weight;
+
+ seq_printf(sf, "%u\n", val);
return 0;
}
static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
{
- seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+ unsigned int val = 0;
+
+ if (cgd)
+ val = cgd->leaf_weight;
+
+ seq_printf(sf, "%u\n", val);
return 0;
}
@@ -1690,6 +1744,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkg_conf_ctx ctx;
struct cfq_group *cfqg;
+ struct cfq_group_data *cfqgd;
int ret;
ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
@@ -1698,17 +1753,22 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
ret = -EINVAL;
cfqg = blkg_to_cfqg(ctx.blkg);
+ cfqgd = blkcg_to_cfqgd(blkcg);
+ if (!cfqg || !cfqgd)
+ goto err;
+
if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
if (!is_leaf_weight) {
cfqg->dev_weight = ctx.v;
- cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
+ cfqg->new_weight = ctx.v ?: cfqgd->weight;
} else {
cfqg->dev_leaf_weight = ctx.v;
- cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
+ cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
}
ret = 0;
}
+err:
blkg_conf_finish(&ctx);
return ret ?: nbytes;
}
@@ -1730,16 +1790,23 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
{
struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
+ struct cfq_group_data *cfqgd;
+ int ret = 0;
if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
return -EINVAL;
spin_lock_irq(&blkcg->lock);
+ cfqgd = blkcg_to_cfqgd(blkcg);
+ if (!cfqgd) {
+ ret = -EINVAL;
+ goto out;
+ }
if (!is_leaf_weight)
- blkcg->cfq_weight = val;
+ cfqgd->weight = val;
else
- blkcg->cfq_leaf_weight = val;
+ cfqgd->leaf_weight = val;
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1749,15 +1816,16 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
if (!is_leaf_weight) {
if (!cfqg->dev_weight)
- cfqg->new_weight = blkcg->cfq_weight;
+ cfqg->new_weight = cfqgd->weight;
} else {
if (!cfqg->dev_leaf_weight)
- cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
+ cfqg->new_leaf_weight = cfqgd->leaf_weight;
}
}
+out:
spin_unlock_irq(&blkcg->lock);
- return 0;
+ return ret;
}
static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -4477,6 +4545,18 @@ out_free:
return ret;
}
+static void cfq_registered_queue(struct request_queue *q)
+{
+ struct elevator_queue *e = q->elevator;
+ struct cfq_data *cfqd = e->elevator_data;
+
+ /*
+ * Default to IOPS mode with no idling for SSDs
+ */
+ if (blk_queue_nonrot(q))
+ cfqd->cfq_slice_idle = 0;
+}
+
/*
* sysfs parts below -->
*/
@@ -4592,6 +4672,7 @@ static struct elevator_type iosched_cfq = {
.elevator_may_queue_fn = cfq_may_queue,
.elevator_init_fn = cfq_init_queue,
.elevator_exit_fn = cfq_exit_queue,
+ .elevator_registered_fn = cfq_registered_queue,
},
.icq_size = sizeof(struct cfq_io_cq),
.icq_align = __alignof__(struct cfq_io_cq),
@@ -4603,8 +4684,10 @@ static struct elevator_type iosched_cfq = {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_cfq = {
.pd_size = sizeof(struct cfq_group),
+ .cpd_size = sizeof(struct cfq_group_data),
.cftypes = cfq_blkcg_files,
+ .cpd_init_fn = cfq_cpd_init,
.pd_init_fn = cfq_pd_init,
.pd_offline_fn = cfq_pd_offline,
.pd_reset_stats_fn = cfq_pd_reset_stats,
diff --git a/block/elevator.c b/block/elevator.c
index 8985038f3..84d63943f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -35,11 +35,11 @@
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
#include "blk.h"
-#include "blk-cgroup.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -806,6 +806,8 @@ int elv_register_queue(struct request_queue *q)
}
kobject_uevent(&e->kobj, KOBJ_ADD);
e->registered = 1;
+ if (e->type->ops.elevator_registered_fn)
+ e->type->ops.elevator_registered_fn(q);
}
return error;
}
diff --git a/block/genhd.c b/block/genhd.c
index d2b7ebfb5..59a1395ee 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -8,6 +8,7 @@
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
@@ -17,8 +18,6 @@
#include <linux/kobj_map.h>
#include <linux/mutex.h>
#include <linux/idr.h>
-#include <linux/ctype.h>
-#include <linux/fs_uuid.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
@@ -1385,85 +1384,6 @@ int invalidate_partition(struct gendisk *disk, int partno)
EXPORT_SYMBOL(invalidate_partition);
-dev_t blk_lookup_fs_info(struct fs_info *seek)
-{
- dev_t devt = MKDEV(0, 0);
- struct class_dev_iter iter;
- struct device *dev;
- int best_score = 0;
-
- class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
- while (best_score < 3 && (dev = class_dev_iter_next(&iter))) {
- struct gendisk *disk = dev_to_disk(dev);
- struct disk_part_iter piter;
- struct hd_struct *part;
-
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
-
- while (best_score < 3 && (part = disk_part_iter_next(&piter))) {
- int score = part_matches_fs_info(part, seek);
- if (score > best_score) {
- devt = part_devt(part);
- best_score = score;
- }
- }
- disk_part_iter_exit(&piter);
- }
- class_dev_iter_exit(&iter);
- return devt;
-}
-
-/* Caller uses NULL, key to start. For each match found, we return a bdev on
- * which we have done blkdev_get, and we do the blkdev_put on block devices
- * that are passed to us. When no more matches are found, we return NULL.
- */
-struct block_device *next_bdev_of_type(struct block_device *last,
- const char *key)
-{
- dev_t devt = MKDEV(0, 0);
- struct class_dev_iter iter;
- struct device *dev;
- struct block_device *next = NULL, *bdev;
- int got_last = 0;
-
- if (!key)
- goto out;
-
- class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
- while (!devt && (dev = class_dev_iter_next(&iter))) {
- struct gendisk *disk = dev_to_disk(dev);
- struct disk_part_iter piter;
- struct hd_struct *part;
-
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
-
- while ((part = disk_part_iter_next(&piter))) {
- bdev = bdget(part_devt(part));
- if (last && !got_last) {
- if (last == bdev)
- got_last = 1;
- continue;
- }
-
- if (blkdev_get(bdev, FMODE_READ, 0))
- continue;
-
- if (bdev_matches_key(bdev, key)) {
- next = bdev;
- break;
- }
-
- blkdev_put(bdev, FMODE_READ);
- }
- disk_part_iter_exit(&piter);
- }
- class_dev_iter_exit(&iter);
-out:
- if (last)
- blkdev_put(last, FMODE_READ);
- return next;
-}
-
/*
* Disk events - monitor disk events like media change and eject request.
*/
diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befde2..8061eba42 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -150,21 +150,48 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
}
}
-static int blkdev_reread_part(struct block_device *bdev)
+/*
+ * This is an exported API for the block driver, and will not
+ * acquire bd_mutex. This API should be used in case that
+ * caller has held bd_mutex already.
+ */
+int __blkdev_reread_part(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
- int res;
if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- if (!mutex_trylock(&bdev->bd_mutex))
- return -EBUSY;
- res = rescan_partitions(disk, bdev);
+
+ lockdep_assert_held(&bdev->bd_mutex);
+
+ return rescan_partitions(disk, bdev);
+}
+EXPORT_SYMBOL(__blkdev_reread_part);
+
+/*
+ * This is an exported API for the block driver, and will
+ * try to acquire bd_mutex. If bd_mutex has been held already
+ * in current context, please call __blkdev_reread_part().
+ *
+ * Make sure the held locks in current context aren't required
+ * in open()/close() handler and I/O path for avoiding ABBA deadlock:
+ * - bd_mutex is held before calling block driver's open/close
+ * handler
+ * - reading partition table may submit I/O to the block device
+ */
+int blkdev_reread_part(struct block_device *bdev)
+{
+ int res;
+
+ mutex_lock(&bdev->bd_mutex);
+ res = __blkdev_reread_part(bdev);
mutex_unlock(&bdev->bd_mutex);
+
return res;
}
+EXPORT_SYMBOL(blkdev_reread_part);
static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
uint64_t len, int secure)
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 55b6f15da..dda653ce7 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -326,8 +326,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
goto out_put_request;
}
- ret = -EFAULT;
- if (blk_fill_sghdr_rq(q, rq, hdr, mode))
+ ret = blk_fill_sghdr_rq(q, rq, hdr, mode);
+ if (ret < 0)
goto out_free_cdb;
ret = 0;
diff --git a/block/uuid.c b/block/uuid.c
deleted file mode 100644
index 722d53b63..000000000
--- a/block/uuid.c
+++ /dev/null
@@ -1,509 +0,0 @@
-#include <linux/blkdev.h>
-#include <linux/ctype.h>
-#include <linux/fs_uuid.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-
-static int debug_enabled;
-
-#define PRINTK(fmt, args...) do { \
- if (debug_enabled) \
- printk(KERN_DEBUG fmt, ## args); \
- } while(0)
-
-#define PRINT_HEX_DUMP(v1, v2, v3, v4, v5, v6, v7, v8) \
- do { \
- if (debug_enabled) \
- print_hex_dump(v1, v2, v3, v4, v5, v6, v7, v8); \
- } while(0)
-
-/*
- * Simple UUID translation
- */
-
-struct uuid_info {
- const char *key;
- const char *name;
- long bkoff;
- unsigned sboff;
- unsigned sig_len;
- const char *magic;
- int uuid_offset;
- int last_mount_offset;
- int last_mount_size;
-};
-
-/*
- * Based on libuuid's blkid_magic array. Note that I don't
- * have uuid offsets for all of these yet - mssing ones are 0x0.
- * Further information welcome.
- *
- * Rearranged by page of fs signature for optimisation.
- */
-static struct uuid_info uuid_list[] = {
- { NULL, "oracleasm", 0, 32, 8, "ORCLDISK", 0x0, 0, 0 },
- { "ntfs", "ntfs", 0, 3, 8, "NTFS ", 0x0, 0, 0 },
- { "vfat", "vfat", 0, 0x52, 5, "MSWIN", 0x0, 0, 0 },
- { "vfat", "vfat", 0, 0x52, 8, "FAT32 ", 0x0, 0, 0 },
- { "vfat", "vfat", 0, 0x36, 5, "MSDOS", 0x0, 0, 0 },
- { "vfat", "vfat", 0, 0x36, 8, "FAT16 ", 0x0, 0, 0 },
- { "vfat", "vfat", 0, 0x36, 8, "FAT12 ", 0x0, 0, 0 },
- { "vfat", "vfat", 0, 0, 1, "\353", 0x0, 0, 0 },
- { "vfat", "vfat", 0, 0, 1, "\351", 0x0, 0, 0 },
- { "vfat", "vfat", 0, 0x1fe, 2, "\125\252", 0x0, 0, 0 },
- { "xfs", "xfs", 0, 0, 4, "XFSB", 0x20, 0, 0 },
- { "romfs", "romfs", 0, 0, 8, "-rom1fs-", 0x0, 0, 0 },
- { "bfs", "bfs", 0, 0, 4, "\316\372\173\033", 0, 0, 0 },
- { "cramfs", "cramfs", 0, 0, 4, "E=\315\050", 0x0, 0, 0 },
- { "qnx4", "qnx4", 0, 4, 6, "QNX4FS", 0, 0, 0 },
- { NULL, "crypt_LUKS", 0, 0, 6, "LUKS\xba\xbe", 0x0, 0, 0 },
- { "squashfs", "squashfs", 0, 0, 4, "sqsh", 0, 0, 0 },
- { "squashfs", "squashfs", 0, 0, 4, "hsqs", 0, 0, 0 },
- { "ocfs", "ocfs", 0, 8, 9, "OracleCFS", 0x0, 0, 0 },
- { "lvm2pv", "lvm2pv", 0, 0x018, 8, "LVM2 001", 0x0, 0, 0 },
- { "sysv", "sysv", 0, 0x3f8, 4, "\020~\030\375", 0, 0, 0 },
- { "ext", "ext", 1, 0x38, 2, "\123\357", 0x468, 0x42c, 4 },
- { "minix", "minix", 1, 0x10, 2, "\177\023", 0, 0, 0 },
- { "minix", "minix", 1, 0x10, 2, "\217\023", 0, 0, 0 },
- { "minix", "minix", 1, 0x10, 2, "\150\044", 0, 0, 0 },
- { "minix", "minix", 1, 0x10, 2, "\170\044", 0, 0, 0 },
- { "lvm2pv", "lvm2pv", 1, 0x018, 8, "LVM2 001", 0x0, 0, 0 },
- { "vxfs", "vxfs", 1, 0, 4, "\365\374\001\245", 0, 0, 0 },
- { "hfsplus", "hfsplus", 1, 0, 2, "BD", 0x0, 0, 0 },
- { "hfsplus", "hfsplus", 1, 0, 2, "H+", 0x0, 0, 0 },
- { "hfsplus", "hfsplus", 1, 0, 2, "HX", 0x0, 0, 0 },
- { "hfs", "hfs", 1, 0, 2, "BD", 0x0, 0, 0 },
- { "ocfs2", "ocfs2", 1, 0, 6, "OCFSV2", 0x0, 0, 0 },
- { "lvm2pv", "lvm2pv", 0, 0x218, 8, "LVM2 001", 0x0, 0, 0 },
- { "lvm2pv", "lvm2pv", 1, 0x218, 8, "LVM2 001", 0x0, 0, 0 },
- { "ocfs2", "ocfs2", 2, 0, 6, "OCFSV2", 0x0, 0, 0 },
- { "swap", "swap", 0, 0xff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
- { "swap", "swap", 0, 0xff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0xff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0xff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0xff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
- { "ocfs2", "ocfs2", 4, 0, 6, "OCFSV2", 0x0, 0, 0 },
- { "ocfs2", "ocfs2", 8, 0, 6, "OCFSV2", 0x0, 0, 0 },
- { "hpfs", "hpfs", 8, 0, 4, "I\350\225\371", 0, 0, 0 },
- { "reiserfs", "reiserfs", 8, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 },
- { "reiserfs", "reiserfs", 8, 20, 8, "ReIsErFs", 0x10054, 0, 0 },
- { "zfs", "zfs", 8, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 },
- { "zfs", "zfs", 8, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 },
- { "ufs", "ufs", 8, 0x55c, 4, "T\031\001\000", 0, 0, 0 },
- { "swap", "swap", 0, 0x1ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
- { "swap", "swap", 0, 0x1ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0x1ff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0x1ff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0x1ff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
- { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr2Fs", 0x10054, 0, 0 },
- { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr3Fs", 0x10054, 0, 0 },
- { "reiserfs", "reiserfs", 64, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 },
- { "reiser4", "reiser4", 64, 0, 7, "ReIsEr4", 0x100544, 0, 0 },
- { "gfs2", "gfs2", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 },
- { "gfs", "gfs", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 },
- { "btrfs", "btrfs", 64, 0x40, 8, "_BHRfS_M", 0x0, 0, 0 },
- { "swap", "swap", 0, 0x3ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
- { "swap", "swap", 0, 0x3ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0x3ff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0x3ff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0x3ff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
- { "udf", "udf", 32, 1, 5, "BEA01", 0x0, 0, 0 },
- { "udf", "udf", 32, 1, 5, "BOOT2", 0x0, 0, 0 },
- { "udf", "udf", 32, 1, 5, "CD001", 0x0, 0, 0 },
- { "udf", "udf", 32, 1, 5, "CDW02", 0x0, 0, 0 },
- { "udf", "udf", 32, 1, 5, "NSR02", 0x0, 0, 0 },
- { "udf", "udf", 32, 1, 5, "NSR03", 0x0, 0, 0 },
- { "udf", "udf", 32, 1, 5, "TEA01", 0x0, 0, 0 },
- { "iso9660", "iso9660", 32, 1, 5, "CD001", 0x0, 0, 0 },
- { "iso9660", "iso9660", 32, 9, 5, "CDROM", 0x0, 0, 0 },
- { "jfs", "jfs", 32, 0, 4, "JFS1", 0x88, 0, 0 },
- { "swap", "swap", 0, 0x7ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
- { "swap", "swap", 0, 0x7ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0x7ff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0x7ff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0x7ff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
- { "swap", "swap", 0, 0xfff6, 10, "SWAP-SPACE", 0x40c, 0, 0 },
- { "swap", "swap", 0, 0xfff6, 10, "SWAPSPACE2", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0xfff6, 9, "S1SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0xfff6, 9, "S2SUSPEND", 0x40c, 0, 0 },
- { "swap", "swsuspend", 0, 0xfff6, 9, "ULSUSPEND", 0x40c, 0, 0 },
- { "zfs", "zfs", 264, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 },
- { "zfs", "zfs", 264, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 },
- { NULL, NULL, 0, 0, 0, NULL, 0x0, 0, 0 }
-};
-
-static int null_uuid(const char *uuid)
-{
- int i;
-
- for (i = 0; i < 16 && !uuid[i]; i++);
-
- return (i == 16);
-}
-
-
-static void uuid_end_bio(struct bio *bio, int err)
-{
- struct page *page = bio->bi_io_vec[0].bv_page;
-
- if(!test_bit(BIO_UPTODATE, &bio->bi_flags))
- SetPageError(page);
-
- unlock_page(page);
- bio_put(bio);
-}
-
-
-/**
- * submit - submit BIO request
- * @dev: The block device we're using.
- * @page_num: The page we're reading.
- *
- * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the
- * textbook - allocate and initialize the bio. If we're writing, make sure
- * the page is marked as dirty. Then submit it and carry on."
- **/
-static struct page *read_bdev_page(struct block_device *dev, int page_num)
-{
- struct bio *bio = NULL;
- struct page *page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-
- if (!page) {
- printk(KERN_ERR "Failed to allocate a page for reading data "
- "in UUID checks.");
- return NULL;
- }
-
- bio = bio_alloc(GFP_NOFS, 1);
- bio->bi_bdev = dev;
- bio->bi_iter.bi_sector = page_num << 3;
- bio->bi_end_io = uuid_end_bio;
- bio->bi_flags |= (1 << BIO_TOI);
-
- PRINTK("Submitting bio on device %lx, page %d using bio %p and page %p.\n",
- (unsigned long) dev->bd_dev, page_num, bio, page);
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk(KERN_DEBUG "ERROR: adding page to bio at %d\n",
- page_num);
- bio_put(bio);
- __free_page(page);
- printk(KERN_DEBUG "read_bdev_page freed page %p (in error "
- "path).\n", page);
- return NULL;
- }
-
- lock_page(page);
- submit_bio(READ | REQ_SYNC, bio);
-
- wait_on_page_locked(page);
- if (PageError(page)) {
- __free_page(page);
- page = NULL;
- }
- return page;
-}
-
-int bdev_matches_key(struct block_device *bdev, const char *key)
-{
- unsigned char *data = NULL;
- struct page *data_page = NULL;
-
- int dev_offset, pg_num, pg_off, i;
- int last_pg_num = -1;
- int result = 0;
- char buf[50];
-
- if (null_uuid(key)) {
- PRINTK("Refusing to find a NULL key.\n");
- return 0;
- }
-
- if (!bdev->bd_disk) {
- bdevname(bdev, buf);
- PRINTK("bdev %s has no bd_disk.\n", buf);
- return 0;
- }
-
- if (!bdev->bd_disk->queue) {
- bdevname(bdev, buf);
- PRINTK("bdev %s has no queue.\n", buf);
- return 0;
- }
-
- for (i = 0; uuid_list[i].name; i++) {
- struct uuid_info *dat = &uuid_list[i];
-
- if (!dat->key || strcmp(dat->key, key))
- continue;
-
- dev_offset = (dat->bkoff << 10) + dat->sboff;
- pg_num = dev_offset >> 12;
- pg_off = dev_offset & 0xfff;
-
- if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1)
- continue;
-
- if (pg_num != last_pg_num) {
- if (data_page) {
- kunmap(data_page);
- __free_page(data_page);
- }
- data_page = read_bdev_page(bdev, pg_num);
- if (!data_page)
- continue;
- data = kmap(data_page);
- }
-
- last_pg_num = pg_num;
-
- if (strncmp(&data[pg_off], dat->magic, dat->sig_len))
- continue;
-
- result = 1;
- break;
- }
-
- if (data_page) {
- kunmap(data_page);
- __free_page(data_page);
- }
-
- return result;
-}
-
-/*
- * part_matches_fs_info - Does the given partition match the details given?
- *
- * Returns a score saying how good the match is.
- * 0 = no UUID match.
- * 1 = UUID but last mount time differs.
- * 2 = UUID, last mount time but not dev_t
- * 3 = perfect match
- *
- * This lets us cope elegantly with probing resulting in dev_ts changing
- * from boot to boot, and with the case where a user copies a partition
- * (UUID is non unique), and we need to check the last mount time of the
- * correct partition.
- */
-int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek)
-{
- struct block_device *bdev;
- struct fs_info *got;
- int result = 0;
- char buf[50];
-
- if (null_uuid((char *) &seek->uuid)) {
- PRINTK("Refusing to find a NULL uuid.\n");
- return 0;
- }
-
- bdev = bdget(part_devt(part));
-
- PRINTK("part_matches fs info considering %x.\n", part_devt(part));
-
- if (blkdev_get(bdev, FMODE_READ, 0)) {
- PRINTK("blkdev_get failed.\n");
- return 0;
- }
-
- if (!bdev->bd_disk) {
- bdevname(bdev, buf);
- PRINTK("bdev %s has no bd_disk.\n", buf);
- goto out;
- }
-
- if (!bdev->bd_disk->queue) {
- bdevname(bdev, buf);
- PRINTK("bdev %s has no queue.\n", buf);
- goto out;
- }
-
- got = fs_info_from_block_dev(bdev);
-
- if (got && !memcmp(got->uuid, seek->uuid, 16)) {
- PRINTK(" Have matching UUID.\n");
- PRINTK(" Got: LMS %d, LM %p.\n", got->last_mount_size, got->last_mount);
- PRINTK(" Seek: LMS %d, LM %p.\n", seek->last_mount_size, seek->last_mount);
- result = 1;
-
- if (got->last_mount_size == seek->last_mount_size &&
- got->last_mount && seek->last_mount &&
- !memcmp(got->last_mount, seek->last_mount,
- got->last_mount_size)) {
- result = 2;
-
- PRINTK(" Matching last mount time.\n");
-
- if (part_devt(part) == seek->dev_t) {
- result = 3;
- PRINTK(" Matching dev_t.\n");
- } else
- PRINTK("Dev_ts differ (%x vs %x).\n", part_devt(part), seek->dev_t);
- }
- }
-
- PRINTK(" Score for %x is %d.\n", part_devt(part), result);
- free_fs_info(got);
-out:
- blkdev_put(bdev, FMODE_READ);
- return result;
-}
-
-void free_fs_info(struct fs_info *fs_info)
-{
- if (!fs_info || IS_ERR(fs_info))
- return;
-
- if (fs_info->last_mount)
- kfree(fs_info->last_mount);
-
- kfree(fs_info);
-}
-
-struct fs_info *fs_info_from_block_dev(struct block_device *bdev)
-{
- unsigned char *data = NULL;
- struct page *data_page = NULL;
-
- int dev_offset, pg_num, pg_off;
- int uuid_pg_num, uuid_pg_off, i;
- unsigned char *uuid_data = NULL;
- struct page *uuid_data_page = NULL;
-
- int last_pg_num = -1, last_uuid_pg_num = 0;
- char buf[50];
- struct fs_info *fs_info = NULL;
-
- bdevname(bdev, buf);
-
- PRINTK("uuid_from_block_dev looking for partition type of %s.\n", buf);
-
- for (i = 0; uuid_list[i].name; i++) {
- struct uuid_info *dat = &uuid_list[i];
- dev_offset = (dat->bkoff << 10) + dat->sboff;
- pg_num = dev_offset >> 12;
- pg_off = dev_offset & 0xfff;
- uuid_pg_num = dat->uuid_offset >> 12;
- uuid_pg_off = dat->uuid_offset & 0xfff;
-
- if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1)
- continue;
-
- /* Ignore partition types with no UUID offset */
- if (!dat->uuid_offset)
- continue;
-
- if (pg_num != last_pg_num) {
- if (data_page) {
- kunmap(data_page);
- __free_page(data_page);
- }
- data_page = read_bdev_page(bdev, pg_num);
- if (!data_page)
- continue;
- data = kmap(data_page);
- }
-
- last_pg_num = pg_num;
-
- if (strncmp(&data[pg_off], dat->magic, dat->sig_len))
- continue;
-
- PRINTK("This partition looks like %s.\n", dat->name);
-
- fs_info = kzalloc(sizeof(struct fs_info), GFP_KERNEL);
-
- if (!fs_info) {
- PRINTK("Failed to allocate fs_info struct.");
- fs_info = ERR_PTR(-ENOMEM);
- break;
- }
-
- /* UUID can't be off the end of the disk */
- if ((uuid_pg_num > bdev->bd_part->nr_sects >> 3) ||
- !dat->uuid_offset)
- goto no_uuid;
-
- if (!uuid_data || uuid_pg_num != last_uuid_pg_num) {
- /* No need to reread the page from above */
- if (uuid_pg_num == pg_num && uuid_data)
- memcpy(uuid_data, data, PAGE_SIZE);
- else {
- if (uuid_data_page) {
- kunmap(uuid_data_page);
- __free_page(uuid_data_page);
- }
- uuid_data_page = read_bdev_page(bdev, uuid_pg_num);
- if (!uuid_data_page)
- continue;
- uuid_data = kmap(uuid_data_page);
- }
- }
-
- last_uuid_pg_num = uuid_pg_num;
- memcpy(&fs_info->uuid, &uuid_data[uuid_pg_off], 16);
- fs_info->dev_t = bdev->bd_dev;
-
-no_uuid:
- PRINT_HEX_DUMP(KERN_EMERG, "fs_info_from_block_dev "
- "returning uuid ", DUMP_PREFIX_NONE, 16, 1,
- fs_info->uuid, 16, 0);
-
- if (dat->last_mount_size) {
- int pg = dat->last_mount_offset >> 12, sz;
- int off = dat->last_mount_offset & 0xfff;
- struct page *last_mount = read_bdev_page(bdev, pg);
- unsigned char *last_mount_data;
- char *ptr;
-
- if (!last_mount) {
- fs_info = ERR_PTR(-ENOMEM);
- break;
- }
- last_mount_data = kmap(last_mount);
- sz = dat->last_mount_size;
- ptr = kmalloc(sz, GFP_KERNEL);
-
- if (!ptr) {
- printk(KERN_EMERG "fs_info_from_block_dev "
- "failed to get memory for last mount "
- "timestamp.");
- free_fs_info(fs_info);
- fs_info = ERR_PTR(-ENOMEM);
- } else {
- fs_info->last_mount = ptr;
- fs_info->last_mount_size = sz;
- memcpy(ptr, &last_mount_data[off], sz);
- }
-
- kunmap(last_mount);
- __free_page(last_mount);
- }
- break;
- }
-
- if (data_page) {
- kunmap(data_page);
- __free_page(data_page);
- }
-
- if (uuid_data_page) {
- kunmap(uuid_data_page);
- __free_page(uuid_data_page);
- }
-
- return fs_info;
-}
-
-static int __init uuid_debug_setup(char *str)
-{
- int value;
-
- if (sscanf(str, "=%d", &value))
- debug_enabled = value;
-
- return 1;
-}
-
-__setup("uuid_debug", uuid_debug_setup);