summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/Kconfig13
-rw-r--r--lib/Kconfig.debug48
-rw-r--r--lib/Kconfig.kgdb2
-rw-r--r--lib/Makefile8
-rw-r--r--lib/asn1_decoder.c3
-rw-r--r--lib/debugobjects.c92
-rw-r--r--lib/gcd.c77
-rw-r--r--lib/iov_iter.c104
-rw-r--r--lib/mpi/mpicoder.c122
-rw-r--r--lib/nlattr.c103
-rw-r--r--lib/nmi_backtrace.c89
-rw-r--r--lib/nodemask.c30
-rw-r--r--lib/percpu_counter.c6
-rw-r--r--lib/proportions.c407
-rw-r--r--lib/radix-tree.c947
-rw-r--r--lib/rhashtable.c6
-rw-r--r--lib/sg_pool.c172
-rw-r--r--lib/string_helpers.c92
-rw-r--r--lib/strncpy_from_user.c2
-rw-r--r--lib/test_bpf.c5
-rw-r--r--lib/test_hash.c250
-rw-r--r--lib/test_kasan.c69
-rw-r--r--lib/test_rhashtable.c2
-rw-r--r--lib/test_uuid.c133
-rw-r--r--lib/uuid.c91
-rw-r--r--lib/vsprintf.c21
-rw-r--r--lib/wbt.c569
27 files changed, 2270 insertions, 1193 deletions
diff --git a/lib/Kconfig b/lib/Kconfig
index 3cca12225..c585e4c40 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -362,6 +362,9 @@ config INTERVAL_TREE
for more information.
+config RADIX_TREE_MULTIORDER
+ bool
+
config ASSOCIATIVE_ARRAY
bool
help
@@ -523,6 +526,13 @@ config SG_SPLIT
a scatterlist. This should be selected by a driver or an API which
whishes to split a scatterlist amongst multiple DMA channels.
+config SG_POOL
+ def_bool n
+ help
+ Provides a helper to allocate chained scatterlists. This should be
+ selected by a driver or an API which whishes to allocate chained
+ scatterlist.
+
#
# sg chaining option
#
@@ -540,4 +550,7 @@ config STACKDEPOT
bool
select STACKTRACE
+config WBT
+ bool
+
endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b4e287cde..42d3d798c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -257,6 +257,7 @@ config PAGE_OWNER
config DEBUG_FS
bool "Debug Filesystem"
+ select SRCU
help
debugfs is a virtual file system that kernel developers use to put
debugging files into. Enable this option to be able to read and
@@ -1289,6 +1290,39 @@ config TORTURE_TEST
tristate
default n
+config RCU_PERF_TEST
+ tristate "performance tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ default n
+ help
+ This option provides a kernel module that runs performance
+ tests on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU performance tests to be built into
+ the kernel.
+ Say M if you want the RCU performance tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_PERF_TEST_RUNNABLE
+ bool "performance tests for RCU runnable by default"
+ depends on RCU_PERF_TEST = y
+ default n
+ help
+ This option provides a way to build the RCU performance tests
+ directly into the kernel without them starting up at boot time.
+ You can use /sys/module to manually override this setting.
+ This /proc file is available only when the RCU performance
+ tests have been built into the kernel.
+
+ Say Y here if you want the RCU performance tests to start during
+ boot (you probably don't).
+ Say N here if you want the RCU performance tests to start only
+ after being manually enabled via /sys/module.
+
config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL && !SCHED_BFS
@@ -1808,6 +1842,9 @@ config TEST_BITMAP
If unsure, say N.
+config TEST_UUID
+ tristate "Test functions located in the uuid module at runtime"
+
config TEST_RHASHTABLE
tristate "Perform selftest on resizable hash table"
default n
@@ -1816,6 +1853,17 @@ config TEST_RHASHTABLE
If unsure, say N.
+config TEST_HASH
+ tristate "Perform selftest on hash functions"
+ default n
+ help
+ Enable this option to test the kernel's integer (<linux/hash,h>)
+ and string (<linux/stringhash.h>) hash functions on boot
+ (or module load).
+
+ This is intended to help people writing architecture-specific
+ optimized versions. If unsure, say N.
+
endmenu # runtime tests
config PROVIDE_OHCI1394_DMA_INIT
diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb
index c635a107a..533f91263 100644
--- a/lib/Kconfig.kgdb
+++ b/lib/Kconfig.kgdb
@@ -22,7 +22,7 @@ config KGDB_SERIAL_CONSOLE
tristate "KGDB: use kgdb over the serial console"
select CONSOLE_POLL
select MAGIC_SYSRQ
- depends on TTY
+ depends on TTY && HW_CONSOLE
default y
help
Share a serial console with kgdb. Sysrq-g must be used
diff --git a/lib/Makefile b/lib/Makefile
index 7bd6fd436..5b5506e3b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -23,9 +23,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o \
sha1.o md5.o irq_regs.o argv_split.o \
- proportions.o flex_proportions.o ratelimit.o show_mem.o \
+ flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
- earlycpio.o seq_buf.o nmi_backtrace.o
+ earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
lib-$(CONFIG_MMU) += ioremap.o
@@ -48,6 +48,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
obj-y += kstrtox.o
obj-$(CONFIG_TEST_BPF) += test_bpf.o
obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
+obj-$(CONFIG_TEST_HASH) += test_hash.o
obj-$(CONFIG_TEST_KASAN) += test_kasan.o
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
obj-$(CONFIG_TEST_LKM) += test_module.o
@@ -57,6 +58,7 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o
obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
obj-$(CONFIG_TEST_PRINTF) += test_printf.o
obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
+obj-$(CONFIG_TEST_UUID) += test_uuid.o
ifeq ($(CONFIG_DEBUG_KOBJECT),y)
CFLAGS_kobject.o += -DDEBUG
@@ -178,8 +180,10 @@ obj-$(CONFIG_GENERIC_STRNLEN_USER) += strnlen_user.o
obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o
obj-$(CONFIG_SG_SPLIT) += sg_split.o
+obj-$(CONFIG_SG_POOL) += sg_pool.o
obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
obj-$(CONFIG_IRQ_POLL) += irq_poll.o
+obj-$(CONFIG_WBT) += wbt.o
obj-$(CONFIG_STACKDEPOT) += stackdepot.o
KASAN_SANITIZE_stackdepot.o := n
diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c
index 554522934..0bd8a611e 100644
--- a/lib/asn1_decoder.c
+++ b/lib/asn1_decoder.c
@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
+#include <linux/module.h>
#include <linux/asn1_decoder.h>
#include <linux/asn1_ber_bytecode.h>
@@ -506,3 +507,5 @@ error:
return -EBADMSG;
}
EXPORT_SYMBOL_GPL(asn1_ber_decoder);
+
+MODULE_LICENSE("GPL");
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 519b5a10f..a8e12601e 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg)
* Try to repair the damage, so we have a better chance to get useful
* debug output.
*/
-static int
-debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state),
+static bool
+debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state),
void * addr, enum debug_obj_state state)
{
- int fixed = 0;
-
- if (fixup)
- fixed = fixup(addr, state);
- debug_objects_fixups += fixed;
- return fixed;
+ if (fixup && fixup(addr, state)) {
+ debug_objects_fixups++;
+ return true;
+ }
+ return false;
}
static void debug_object_is_on_stack(void *addr, int onstack)
@@ -416,7 +415,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
ret = debug_object_fixup(descr->fixup_activate, addr, state);
- return ret ? -EINVAL : 0;
+ return ret ? 0 : -EINVAL;
case ODEBUG_STATE_DESTROYED:
debug_print_object(obj, "activate");
@@ -432,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
/*
- * This happens when a static object is activated. We
- * let the type specific code decide whether this is
- * true or not.
+ * We are here when a static object is activated. We
+ * let the type specific code confirm whether this is
+ * true or not. if true, we just make sure that the
+ * static object is tracked in the object tracker. If
+ * not, this must be a bug, so we try to fix it up.
*/
- if (debug_object_fixup(descr->fixup_activate, addr,
- ODEBUG_STATE_NOTAVAILABLE)) {
+ if (descr->is_static_object && descr->is_static_object(addr)) {
+ /* track this static object */
+ debug_object_init(addr, descr);
+ debug_object_activate(addr, descr);
+ } else {
debug_print_object(&o, "activate");
- return -EINVAL;
+ ret = debug_object_fixup(descr->fixup_activate, addr,
+ ODEBUG_STATE_NOTAVAILABLE);
+ return ret ? 0 : -EINVAL;
}
return 0;
}
@@ -603,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
/*
- * Maybe the object is static. Let the type specific
- * code decide what to do.
+ * Maybe the object is static, and we let the type specific
+ * code confirm. Track this static object if true, else invoke
+ * fixup.
*/
- if (debug_object_fixup(descr->fixup_assert_init, addr,
- ODEBUG_STATE_NOTAVAILABLE))
+ if (descr->is_static_object && descr->is_static_object(addr)) {
+ /* Track this static object */
+ debug_object_init(addr, descr);
+ } else {
debug_print_object(&o, "assert_init");
+ debug_object_fixup(descr->fixup_assert_init, addr,
+ ODEBUG_STATE_NOTAVAILABLE);
+ }
return;
}
@@ -793,11 +805,18 @@ struct self_test {
static __initdata struct debug_obj_descr descr_type_test;
+static bool __init is_static_object(void *addr)
+{
+ struct self_test *obj = addr;
+
+ return obj->static_init;
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
*/
-static int __init fixup_init(void *addr, enum debug_obj_state state)
+static bool __init fixup_init(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
@@ -805,37 +824,31 @@ static int __init fixup_init(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_init(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
/*
* fixup_activate is called when:
* - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
*/
-static int __init fixup_activate(void *addr, enum debug_obj_state state)
+static bool __init fixup_activate(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- if (obj->static_init == 1) {
- debug_object_init(obj, &descr_type_test);
- debug_object_activate(obj, &descr_type_test);
- return 0;
- }
- return 1;
-
+ return true;
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_activate(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -843,7 +856,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state)
* fixup_destroy is called when:
* - an active object is destroyed
*/
-static int __init fixup_destroy(void *addr, enum debug_obj_state state)
+static bool __init fixup_destroy(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
@@ -851,9 +864,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_destroy(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -861,7 +874,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state)
* fixup_free is called when:
* - an active object is freed
*/
-static int __init fixup_free(void *addr, enum debug_obj_state state)
+static bool __init fixup_free(void *addr, enum debug_obj_state state)
{
struct self_test *obj = addr;
@@ -869,9 +882,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
debug_object_deactivate(obj, &descr_type_test);
debug_object_free(obj, &descr_type_test);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
@@ -917,6 +930,7 @@ out:
static __initdata struct debug_obj_descr descr_type_test = {
.name = "selftest",
+ .is_static_object = is_static_object,
.fixup_init = fixup_init,
.fixup_activate = fixup_activate,
.fixup_destroy = fixup_destroy,
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f129d..135ee6407 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,77 @@
#include <linux/gcd.h>
#include <linux/export.h>
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
unsigned long gcd(unsigned long a, unsigned long b)
{
- unsigned long r;
+ unsigned long r = a | b;
+
+ if (!a || !b)
+ return r;
- if (a < b)
- swap(a, b);
+ b >>= __ffs(b);
+ if (b == 1)
+ return r & -r;
- if (!b)
- return a;
- while ((r = a % b) != 0) {
- a = b;
- b = r;
+ for (;;) {
+ a >>= __ffs(a);
+ if (a == 1)
+ return r & -r;
+ if (a == b)
+ return a << __ffs(r);
+
+ if (a < b)
+ swap(a, b);
+ a -= b;
}
- return b;
}
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+ unsigned long r = a | b;
+
+ if (!a || !b)
+ return r;
+
+ /* Isolate lsbit of r */
+ r &= -r;
+
+ while (!(b & r))
+ b >>= 1;
+ if (b == r)
+ return r;
+
+ for (;;) {
+ while (!(a & r))
+ a >>= 1;
+ if (a == r)
+ return r;
+ if (a == b)
+ return a;
+
+ if (a < b)
+ swap(a, b);
+ a -= b;
+ a >>= 1;
+ if (a & r)
+ a += b;
+ a >>= 1;
+ }
+}
+
+#endif
+
EXPORT_SYMBOL_GPL(gcd);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index ca5316e00..0cd522753 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -99,40 +99,44 @@
}
#define iterate_and_advance(i, n, v, I, B, K) { \
- size_t skip = i->iov_offset; \
- if (unlikely(i->type & ITER_BVEC)) { \
- const struct bio_vec *bvec; \
- struct bio_vec v; \
- iterate_bvec(i, n, v, bvec, skip, (B)) \
- if (skip == bvec->bv_len) { \
- bvec++; \
- skip = 0; \
- } \
- i->nr_segs -= bvec - i->bvec; \
- i->bvec = bvec; \
- } else if (unlikely(i->type & ITER_KVEC)) { \
- const struct kvec *kvec; \
- struct kvec v; \
- iterate_kvec(i, n, v, kvec, skip, (K)) \
- if (skip == kvec->iov_len) { \
- kvec++; \
- skip = 0; \
- } \
- i->nr_segs -= kvec - i->kvec; \
- i->kvec = kvec; \
- } else { \
- const struct iovec *iov; \
- struct iovec v; \
- iterate_iovec(i, n, v, iov, skip, (I)) \
- if (skip == iov->iov_len) { \
- iov++; \
- skip = 0; \
+ if (unlikely(i->count < n)) \
+ n = i->count; \
+ if (i->count) { \
+ size_t skip = i->iov_offset; \
+ if (unlikely(i->type & ITER_BVEC)) { \
+ const struct bio_vec *bvec; \
+ struct bio_vec v; \
+ iterate_bvec(i, n, v, bvec, skip, (B)) \
+ if (skip == bvec->bv_len) { \
+ bvec++; \
+ skip = 0; \
+ } \
+ i->nr_segs -= bvec - i->bvec; \
+ i->bvec = bvec; \
+ } else if (unlikely(i->type & ITER_KVEC)) { \
+ const struct kvec *kvec; \
+ struct kvec v; \
+ iterate_kvec(i, n, v, kvec, skip, (K)) \
+ if (skip == kvec->iov_len) { \
+ kvec++; \
+ skip = 0; \
+ } \
+ i->nr_segs -= kvec - i->kvec; \
+ i->kvec = kvec; \
+ } else { \
+ const struct iovec *iov; \
+ struct iovec v; \
+ iterate_iovec(i, n, v, iov, skip, (I)) \
+ if (skip == iov->iov_len) { \
+ iov++; \
+ skip = 0; \
+ } \
+ i->nr_segs -= iov - i->iov; \
+ i->iov = iov; \
} \
- i->nr_segs -= iov - i->iov; \
- i->iov = iov; \
+ i->count -= n; \
+ i->iov_offset = skip; \
} \
- i->count -= n; \
- i->iov_offset = skip; \
}
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
@@ -386,12 +390,6 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
const char *from = addr;
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
iterate_and_advance(i, bytes, v,
__copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
v.iov_len),
@@ -407,12 +405,6 @@ EXPORT_SYMBOL(copy_to_iter);
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
char *to = addr;
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
iterate_and_advance(i, bytes, v,
__copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
v.iov_len),
@@ -428,12 +420,6 @@ EXPORT_SYMBOL(copy_from_iter);
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
char *to = addr;
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
iterate_and_advance(i, bytes, v,
__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
v.iov_base, v.iov_len),
@@ -474,12 +460,6 @@ EXPORT_SYMBOL(copy_page_from_iter);
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
iterate_and_advance(i, bytes, v,
__clear_user(v.iov_base, v.iov_len),
memzero_page(v.bv_page, v.bv_offset, v.bv_len),
@@ -685,12 +665,6 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
char *to = addr;
__wsum sum, next;
size_t off = 0;
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
sum = *csum;
iterate_and_advance(i, bytes, v, ({
int err = 0;
@@ -729,12 +703,6 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum,
const char *from = addr;
__wsum sum, next;
size_t off = 0;
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
sum = *csum;
iterate_and_advance(i, bytes, v, ({
int err = 0;
diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c
index eb15e7dc7..747606f9e 100644
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
@@ -20,6 +20,8 @@
#include <linux/bitops.h>
#include <linux/count_zeros.h>
+#include <linux/byteorder/generic.h>
+#include <linux/string.h>
#include "mpi-internal.h"
#define MAX_EXTERN_MPI_BITS 16384
@@ -163,7 +165,13 @@ int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
int *sign)
{
uint8_t *p;
- mpi_limb_t alimb;
+#if BYTES_PER_MPI_LIMB == 4
+ __be32 alimb;
+#elif BYTES_PER_MPI_LIMB == 8
+ __be64 alimb;
+#else
+#error please implement for this limb size.
+#endif
unsigned int n = mpi_get_size(a);
int i, lzeros;
@@ -183,38 +191,19 @@ int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
p = buf;
*nbytes = n - lzeros;
- for (i = a->nlimbs - 1; i >= 0; i--) {
- alimb = a->d[i];
+ for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB,
+ lzeros %= BYTES_PER_MPI_LIMB;
+ i >= 0; i--) {
#if BYTES_PER_MPI_LIMB == 4
- *p++ = alimb >> 24;
- *p++ = alimb >> 16;
- *p++ = alimb >> 8;
- *p++ = alimb;
+ alimb = cpu_to_be32(a->d[i]);
#elif BYTES_PER_MPI_LIMB == 8
- *p++ = alimb >> 56;
- *p++ = alimb >> 48;
- *p++ = alimb >> 40;
- *p++ = alimb >> 32;
- *p++ = alimb >> 24;
- *p++ = alimb >> 16;
- *p++ = alimb >> 8;
- *p++ = alimb;
+ alimb = cpu_to_be64(a->d[i]);
#else
#error please implement for this limb size.
#endif
-
- if (lzeros > 0) {
- if (lzeros >= sizeof(alimb)) {
- p -= sizeof(alimb);
- } else {
- mpi_limb_t *limb1 = (void *)p - sizeof(alimb);
- mpi_limb_t *limb2 = (void *)p - sizeof(alimb)
- + lzeros;
- *limb1 = *limb2;
- p -= lzeros;
- }
- lzeros -= sizeof(alimb);
- }
+ memcpy(p, (u8 *)&alimb + lzeros, BYTES_PER_MPI_LIMB - lzeros);
+ p += BYTES_PER_MPI_LIMB - lzeros;
+ lzeros = 0;
}
return 0;
}
@@ -359,7 +348,13 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes,
int *sign)
{
u8 *p, *p2;
- mpi_limb_t alimb, alimb2;
+#if BYTES_PER_MPI_LIMB == 4
+ __be32 alimb;
+#elif BYTES_PER_MPI_LIMB == 8
+ __be64 alimb;
+#else
+#error please implement for this limb size.
+#endif
unsigned int n = mpi_get_size(a);
int i, x, y = 0, lzeros, buf_len;
@@ -380,42 +375,22 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes,
buf_len = sgl->length;
p2 = sg_virt(sgl);
- for (i = a->nlimbs - 1; i >= 0; i--) {
- alimb = a->d[i];
- p = (u8 *)&alimb2;
+ for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB,
+ lzeros %= BYTES_PER_MPI_LIMB;
+ i >= 0; i--) {
#if BYTES_PER_MPI_LIMB == 4
- *p++ = alimb >> 24;
- *p++ = alimb >> 16;
- *p++ = alimb >> 8;
- *p++ = alimb;
+ alimb = cpu_to_be32(a->d[i]);
#elif BYTES_PER_MPI_LIMB == 8
- *p++ = alimb >> 56;
- *p++ = alimb >> 48;
- *p++ = alimb >> 40;
- *p++ = alimb >> 32;
- *p++ = alimb >> 24;
- *p++ = alimb >> 16;
- *p++ = alimb >> 8;
- *p++ = alimb;
+ alimb = cpu_to_be64(a->d[i]);
#else
#error please implement for this limb size.
#endif
- if (lzeros > 0) {
- if (lzeros >= sizeof(alimb)) {
- p -= sizeof(alimb);
- continue;
- } else {
- mpi_limb_t *limb1 = (void *)p - sizeof(alimb);
- mpi_limb_t *limb2 = (void *)p - sizeof(alimb)
- + lzeros;
- *limb1 = *limb2;
- p -= lzeros;
- y = lzeros;
- }
- lzeros -= sizeof(alimb);
+ if (lzeros) {
+ y = lzeros;
+ lzeros = 0;
}
- p = p - (sizeof(alimb) - y);
+ p = (u8 *)&alimb + y;
for (x = 0; x < sizeof(alimb) - y; x++) {
if (!buf_len) {
@@ -443,15 +418,15 @@ EXPORT_SYMBOL_GPL(mpi_write_to_sgl);
* a new MPI and reads the content of the sgl to the MPI.
*
* @sgl: scatterlist to read from
- * @len: number of bytes to read
+ * @nbytes: number of bytes to read
*
* Return: Pointer to a new MPI or NULL on error
*/
-MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
+MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes)
{
struct scatterlist *sg;
int x, i, j, z, lzeros, ents;
- unsigned int nbits, nlimbs, nbytes;
+ unsigned int nbits, nlimbs;
mpi_limb_t a;
MPI val = NULL;
@@ -472,16 +447,12 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
break;
ents--;
+ nbytes -= lzeros;
lzeros = 0;
}
sgl = sg;
-
- if (!ents)
- nbytes = 0;
- else
- nbytes = len - lzeros;
-
+ nbytes -= lzeros;
nbits = nbytes * 8;
if (nbits > MAX_EXTERN_MPI_BITS) {
pr_info("MPI: mpi too large (%u bits)\n", nbits);
@@ -489,9 +460,8 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
}
if (nbytes > 0)
- nbits -= count_leading_zeros(*(u8 *)(sg_virt(sgl) + lzeros));
- else
- nbits = 0;
+ nbits -= count_leading_zeros(*(u8 *)(sg_virt(sgl) + lzeros)) -
+ (BITS_PER_LONG - 8);
nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB);
val = mpi_alloc(nlimbs);
@@ -507,19 +477,14 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
j = nlimbs - 1;
a = 0;
- z = 0;
- x = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
- x %= BYTES_PER_MPI_LIMB;
+ z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
+ z %= BYTES_PER_MPI_LIMB;
for_each_sg(sgl, sg, ents, i) {
const u8 *buffer = sg_virt(sg) + lzeros;
int len = sg->length - lzeros;
- int buf_shift = x;
-
- if (sg_is_last(sg) && (len % BYTES_PER_MPI_LIMB))
- len += BYTES_PER_MPI_LIMB - (len % BYTES_PER_MPI_LIMB);
- for (; x < len + buf_shift; x++) {
+ for (x = 0; x < len; x++) {
a <<= 8;
a |= *buffer++;
if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) {
@@ -528,7 +493,6 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len)
}
}
z += x;
- x = 0;
lzeros = 0;
}
return val;
diff --git a/lib/nlattr.c b/lib/nlattr.c
index f5907d232..fce1e9afc 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -355,6 +355,30 @@ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
EXPORT_SYMBOL(__nla_reserve);
/**
+ * __nla_reserve_64bit - reserve room for attribute on the skb and align it
+ * @skb: socket buffer to reserve room on
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @padattr: attribute type for the padding
+ *
+ * Adds a netlink attribute header to a socket buffer and reserves
+ * room for the payload but does not copy it. It also ensure that this
+ * attribute will have a 64-bit aligned nla_data() area.
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the attribute header and payload.
+ */
+struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
+ int attrlen, int padattr)
+{
+ if (nla_need_padding_for_64bit(skb))
+ nla_align_64bit(skb, padattr);
+
+ return __nla_reserve(skb, attrtype, attrlen);
+}
+EXPORT_SYMBOL(__nla_reserve_64bit);
+
+/**
* __nla_reserve_nohdr - reserve room for attribute without header
* @skb: socket buffer to reserve room on
* @attrlen: length of attribute payload
@@ -397,6 +421,36 @@ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
EXPORT_SYMBOL(nla_reserve);
/**
+ * nla_reserve_64bit - reserve room for attribute on the skb and align it
+ * @skb: socket buffer to reserve room on
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @padattr: attribute type for the padding
+ *
+ * Adds a netlink attribute header to a socket buffer and reserves
+ * room for the payload but does not copy it. It also ensure that this
+ * attribute will have a 64-bit aligned nla_data() area.
+ *
+ * Returns NULL if the tailroom of the skb is insufficient to store
+ * the attribute header and payload.
+ */
+struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen,
+ int padattr)
+{
+ size_t len;
+
+ if (nla_need_padding_for_64bit(skb))
+ len = nla_total_size_64bit(attrlen);
+ else
+ len = nla_total_size(attrlen);
+ if (unlikely(skb_tailroom(skb) < len))
+ return NULL;
+
+ return __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
+}
+EXPORT_SYMBOL(nla_reserve_64bit);
+
+/**
* nla_reserve_nohdr - reserve room for attribute without header
* @skb: socket buffer to reserve room on
* @attrlen: length of attribute payload
@@ -436,6 +490,27 @@ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
EXPORT_SYMBOL(__nla_put);
/**
+ * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ * @padattr: attribute type for the padding
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the attribute header and payload.
+ */
+void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
+ const void *data, int padattr)
+{
+ struct nlattr *nla;
+
+ nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
+ memcpy(nla_data(nla), data, attrlen);
+}
+EXPORT_SYMBOL(__nla_put_64bit);
+
+/**
* __nla_put_nohdr - Add a netlink attribute without header
* @skb: socket buffer to add attribute to
* @attrlen: length of attribute payload
@@ -474,6 +549,34 @@ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
EXPORT_SYMBOL(nla_put);
/**
+ * nla_put_64bit - Add a netlink attribute to a socket buffer and align it
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ * @padattr: attribute type for the padding
+ *
+ * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
+ * the attribute header and payload.
+ */
+int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
+ const void *data, int padattr)
+{
+ size_t len;
+
+ if (nla_need_padding_for_64bit(skb))
+ len = nla_total_size_64bit(attrlen);
+ else
+ len = nla_total_size(attrlen);
+ if (unlikely(skb_tailroom(skb) < len))
+ return -EMSGSIZE;
+
+ __nla_put_64bit(skb, attrtype, attrlen, data, padattr);
+ return 0;
+}
+EXPORT_SYMBOL(nla_put_64bit);
+
+/**
* nla_put_nohdr - Add a netlink attribute without header
* @skb: socket buffer to add attribute to
* @attrlen: length of attribute payload
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 6019c53c6..26caf51cc 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -16,33 +16,14 @@
#include <linux/delay.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
-#include <linux/seq_buf.h>
#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
-}
-
/*
* When raise() is called it will be is passed a pointer to the
* backtrace_mask. Architectures that call nmi_cpu_backtrace()
@@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
void nmi_trigger_all_cpu_backtrace(bool include_self,
void (*raise)(cpumask_t *mask))
{
- struct nmi_seq_buf *s;
- int i, cpu, this_cpu = get_cpu();
+ int i, this_cpu = get_cpu();
if (test_and_set_bit(0, &backtrace_flag)) {
/*
@@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
if (!include_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
pr_info("Sending NMI to %s CPUs:\n",
(include_self ? "all" : "other"));
@@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
}
/*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
+ * Force flush any remote buffers that might be stuck in IRQ context
+ * and therefore could not run their irq_work.
*/
- for_each_cpu(cpu, &printtrace_mask) {
- int len, last_i = 0;
+ printk_nmi_flush();
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
-
- clear_bit(0, &backtrace_flag);
- smp_mb__after_atomic();
+ clear_bit_unlock(0, &backtrace_flag);
put_cpu();
}
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
-}
-
bool nmi_cpu_backtrace(struct pt_regs *regs)
{
int cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
pr_warn("NMI backtrace for cpu %d\n", cpu);
if (regs)
show_regs(regs);
else
dump_stack();
- this_cpu_write(printk_func, printk_func_save);
-
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return true;
}
diff --git a/lib/nodemask.c b/lib/nodemask.c
new file mode 100644
index 000000000..e42a5bf44
--- /dev/null
+++ b/lib/nodemask.c
@@ -0,0 +1,30 @@
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+int __next_node_in(int node, const nodemask_t *srcp)
+{
+ int ret = __next_node(node, srcp);
+
+ if (ret == MAX_NUMNODES)
+ ret = __first_node(srcp);
+ return ret;
+}
+EXPORT_SYMBOL(__next_node_in);
+
+#ifdef CONFIG_NUMA
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns NUMA_NO_NODE if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+ int w, bit = NUMA_NO_NODE;
+
+ w = nodes_weight(*maskp);
+ if (w)
+ bit = bitmap_ord_to_pos(maskp->bits,
+ get_random_int() % w, MAX_NUMNODES);
+ return bit;
+}
+#endif
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index f051d69f0..72d36113c 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -19,7 +19,7 @@ static DEFINE_SPINLOCK(percpu_counters_lock);
static struct debug_obj_descr percpu_counter_debug_descr;
-static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
+static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
struct percpu_counter *fbc = addr;
@@ -27,9 +27,9 @@ static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
percpu_counter_destroy(fbc);
debug_object_free(fbc, &percpu_counter_debug_descr);
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
}
diff --git a/lib/proportions.c b/lib/proportions.c
deleted file mode 100644
index efa54f259..000000000
--- a/lib/proportions.c
+++ /dev/null
@@ -1,407 +0,0 @@
-/*
- * Floating proportions
- *
- * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
- *
- * Description:
- *
- * The floating proportion is a time derivative with an exponentially decaying
- * history:
- *
- * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
- *
- * Where j is an element from {prop_local}, x_{j} is j's number of events,
- * and i the time period over which the differential is taken. So d/dt_{-i} is
- * the differential over the i-th last period.
- *
- * The decaying history gives smooth transitions. The time differential carries
- * the notion of speed.
- *
- * The denominator is 2^(1+i) because we want the series to be normalised, ie.
- *
- * \Sum_{i=0} 1/2^(1+i) = 1
- *
- * Further more, if we measure time (t) in the same events as x; so that:
- *
- * t = \Sum_{j} x_{j}
- *
- * we get that:
- *
- * \Sum_{j} p_{j} = 1
- *
- * Writing this in an iterative fashion we get (dropping the 'd's):
- *
- * if (++x_{j}, ++t > period)
- * t /= 2;
- * for_each (j)
- * x_{j} /= 2;
- *
- * so that:
- *
- * p_{j} = x_{j} / t;
- *
- * We optimize away the '/= 2' for the global time delta by noting that:
- *
- * if (++t > period) t /= 2:
- *
- * Can be approximated by:
- *
- * period/2 + (++t % period/2)
- *
- * [ Furthermore, when we choose period to be 2^n it can be written in terms of
- * binary operations and wraparound artefacts disappear. ]
- *
- * Also note that this yields a natural counter of the elapsed periods:
- *
- * c = t / (period/2)
- *
- * [ Its monotonic increasing property can be applied to mitigate the wrap-
- * around issue. ]
- *
- * This allows us to do away with the loop over all prop_locals on each period
- * expiration. By remembering the period count under which it was last accessed
- * as c_{j}, we can obtain the number of 'missed' cycles from:
- *
- * c - c_{j}
- *
- * We can then lazily catch up to the global period count every time we are
- * going to use x_{j}, by doing:
- *
- * x_{j} /= 2^(c - c_{j}), c_{j} = c
- */
-
-#include <linux/proportions.h>
-#include <linux/rcupdate.h>
-
-int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp)
-{
- int err;
-
- if (shift > PROP_MAX_SHIFT)
- shift = PROP_MAX_SHIFT;
-
- pd->index = 0;
- pd->pg[0].shift = shift;
- mutex_init(&pd->mutex);
- err = percpu_counter_init(&pd->pg[0].events, 0, gfp);
- if (err)
- goto out;
-
- err = percpu_counter_init(&pd->pg[1].events, 0, gfp);
- if (err)
- percpu_counter_destroy(&pd->pg[0].events);
-
-out:
- return err;
-}
-
-/*
- * We have two copies, and flip between them to make it seem like an atomic
- * update. The update is not really atomic wrt the events counter, but
- * it is internally consistent with the bit layout depending on shift.
- *
- * We copy the events count, move the bits around and flip the index.
- */
-void prop_change_shift(struct prop_descriptor *pd, int shift)
-{
- int index;
- int offset;
- u64 events;
- unsigned long flags;
-
- if (shift > PROP_MAX_SHIFT)
- shift = PROP_MAX_SHIFT;
-
- mutex_lock(&pd->mutex);
-
- index = pd->index ^ 1;
- offset = pd->pg[pd->index].shift - shift;
- if (!offset)
- goto out;
-
- pd->pg[index].shift = shift;
-
- local_irq_save(flags);
- events = percpu_counter_sum(&pd->pg[pd->index].events);
- if (offset < 0)
- events <<= -offset;
- else
- events >>= offset;
- percpu_counter_set(&pd->pg[index].events, events);
-
- /*
- * ensure the new pg is fully written before the switch
- */
- smp_wmb();
- pd->index = index;
- local_irq_restore(flags);
-
- synchronize_rcu();
-
-out:
- mutex_unlock(&pd->mutex);
-}
-
-/*
- * wrap the access to the data in an rcu_read_lock() section;
- * this is used to track the active references.
- */
-static struct prop_global *prop_get_global(struct prop_descriptor *pd)
-__acquires(RCU)
-{
- int index;
-
- rcu_read_lock();
- index = pd->index;
- /*
- * match the wmb from vcd_flip()
- */
- smp_rmb();
- return &pd->pg[index];
-}
-
-static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
-__releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static void
-prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
-{
- int offset = *pl_shift - new_shift;
-
- if (!offset)
- return;
-
- if (offset < 0)
- *pl_period <<= -offset;
- else
- *pl_period >>= offset;
-
- *pl_shift = new_shift;
-}
-
-/*
- * PERCPU
- */
-
-#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
-
-int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp)
-{
- raw_spin_lock_init(&pl->lock);
- pl->shift = 0;
- pl->period = 0;
- return percpu_counter_init(&pl->events, 0, gfp);
-}
-
-void prop_local_destroy_percpu(struct prop_local_percpu *pl)
-{
- percpu_counter_destroy(&pl->events);
-}
-
-/*
- * Catch up with missed period expirations.
- *
- * until (c_{j} == c)
- * x_{j} -= x_{j}/2;
- * c_{j}++;
- */
-static
-void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
-{
- unsigned long period = 1UL << (pg->shift - 1);
- unsigned long period_mask = ~(period - 1);
- unsigned long global_period;
- unsigned long flags;
-
- global_period = percpu_counter_read(&pg->events);
- global_period &= period_mask;
-
- /*
- * Fast path - check if the local and global period count still match
- * outside of the lock.
- */
- if (pl->period == global_period)
- return;
-
- raw_spin_lock_irqsave(&pl->lock, flags);
- prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
-
- /*
- * For each missed period, we half the local counter.
- * basically:
- * pl->events >> (global_period - pl->period);
- */
- period = (global_period - pl->period) >> (pg->shift - 1);
- if (period < BITS_PER_LONG) {
- s64 val = percpu_counter_read(&pl->events);
-
- if (val < (nr_cpu_ids * PROP_BATCH))
- val = percpu_counter_sum(&pl->events);
-
- __percpu_counter_add(&pl->events, -val + (val >> period),
- PROP_BATCH);
- } else
- percpu_counter_set(&pl->events, 0);
-
- pl->period = global_period;
- raw_spin_unlock_irqrestore(&pl->lock, flags);
-}
-
-/*
- * ++x_{j}, ++t
- */
-void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
-{
- struct prop_global *pg = prop_get_global(pd);
-
- prop_norm_percpu(pg, pl);
- __percpu_counter_add(&pl->events, 1, PROP_BATCH);
- percpu_counter_add(&pg->events, 1);
- prop_put_global(pd, pg);
-}
-
-/*
- * identical to __prop_inc_percpu, except that it limits this pl's fraction to
- * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded.
- */
-void __prop_inc_percpu_max(struct prop_descriptor *pd,
- struct prop_local_percpu *pl, long frac)
-{
- struct prop_global *pg = prop_get_global(pd);
-
- prop_norm_percpu(pg, pl);
-
- if (unlikely(frac != PROP_FRAC_BASE)) {
- unsigned long period_2 = 1UL << (pg->shift - 1);
- unsigned long counter_mask = period_2 - 1;
- unsigned long global_count;
- long numerator, denominator;
-
- numerator = percpu_counter_read_positive(&pl->events);
- global_count = percpu_counter_read(&pg->events);
- denominator = period_2 + (global_count & counter_mask);
-
- if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT))
- goto out_put;
- }
-
- percpu_counter_add(&pl->events, 1);
- percpu_counter_add(&pg->events, 1);
-
-out_put:
- prop_put_global(pd, pg);
-}
-
-/*
- * Obtain a fraction of this proportion
- *
- * p_{j} = x_{j} / (period/2 + t % period/2)
- */
-void prop_fraction_percpu(struct prop_descriptor *pd,
- struct prop_local_percpu *pl,
- long *numerator, long *denominator)
-{
- struct prop_global *pg = prop_get_global(pd);
- unsigned long period_2 = 1UL << (pg->shift - 1);
- unsigned long counter_mask = period_2 - 1;
- unsigned long global_count;
-
- prop_norm_percpu(pg, pl);
- *numerator = percpu_counter_read_positive(&pl->events);
-
- global_count = percpu_counter_read(&pg->events);
- *denominator = period_2 + (global_count & counter_mask);
-
- prop_put_global(pd, pg);
-}
-
-/*
- * SINGLE
- */
-
-int prop_local_init_single(struct prop_local_single *pl)
-{
- raw_spin_lock_init(&pl->lock);
- pl->shift = 0;
- pl->period = 0;
- pl->events = 0;
- return 0;
-}
-
-void prop_local_destroy_single(struct prop_local_single *pl)
-{
-}
-
-/*
- * Catch up with missed period expirations.
- */
-static
-void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl)
-{
- unsigned long period = 1UL << (pg->shift - 1);
- unsigned long period_mask = ~(period - 1);
- unsigned long global_period;
- unsigned long flags;
-
- global_period = percpu_counter_read(&pg->events);
- global_period &= period_mask;
-
- /*
- * Fast path - check if the local and global period count still match
- * outside of the lock.
- */
- if (pl->period == global_period)
- return;
-
- raw_spin_lock_irqsave(&pl->lock, flags);
- prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
- /*
- * For each missed period, we half the local counter.
- */
- period = (global_period - pl->period) >> (pg->shift - 1);
- if (likely(period < BITS_PER_LONG))
- pl->events >>= period;
- else
- pl->events = 0;
- pl->period = global_period;
- raw_spin_unlock_irqrestore(&pl->lock, flags);
-}
-
-/*
- * ++x_{j}, ++t
- */
-void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
-{
- struct prop_global *pg = prop_get_global(pd);
-
- prop_norm_single(pg, pl);
- pl->events++;
- percpu_counter_add(&pg->events, 1);
- prop_put_global(pd, pg);
-}
-
-/*
- * Obtain a fraction of this proportion
- *
- * p_{j} = x_{j} / (period/2 + t % period/2)
- */
-void prop_fraction_single(struct prop_descriptor *pd,
- struct prop_local_single *pl,
- long *numerator, long *denominator)
-{
- struct prop_global *pg = prop_get_global(pd);
- unsigned long period_2 = 1UL << (pg->shift - 1);
- unsigned long counter_mask = period_2 - 1;
- unsigned long global_count;
-
- prop_norm_single(pg, pl);
- *numerator = pl->events;
-
- global_count = percpu_counter_read(&pg->events);
- *denominator = period_2 + (global_count & counter_mask);
-
- prop_put_global(pd, pg);
-}
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 1624c4117..bc7852f95 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -4,6 +4,8 @@
* Copyright (C) 2005 SGI, Christoph Lameter
* Copyright (C) 2006 Nick Piggin
* Copyright (C) 2012 Konstantin Khlebnikov
+ * Copyright (C) 2016 Intel, Matthew Wilcox
+ * Copyright (C) 2016 Intel, Ross Zwisler
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
@@ -37,12 +39,6 @@
/*
- * The height_to_maxindex array needs to be one deeper than the maximum
- * path as height 0 holds only 1 entry.
- */
-static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
-
-/*
* Radix tree node cache.
*/
static struct kmem_cache *radix_tree_node_cachep;
@@ -64,20 +60,58 @@ static struct kmem_cache *radix_tree_node_cachep;
* Per-cpu pool of preloaded nodes
*/
struct radix_tree_preload {
- int nr;
+ unsigned nr;
/* nodes->private_data points to next preallocated node */
struct radix_tree_node *nodes;
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
-static inline void *ptr_to_indirect(void *ptr)
+static inline void *node_to_entry(void *ptr)
+{
+ return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
+}
+
+#define RADIX_TREE_RETRY node_to_entry(NULL)
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+/* Sibling slots point directly to another slot in the same node */
+static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
+{
+ void **ptr = node;
+ return (parent->slots <= ptr) &&
+ (ptr < parent->slots + RADIX_TREE_MAP_SIZE);
+}
+#else
+static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node)
+{
+ return false;
+}
+#endif
+
+static inline unsigned long get_slot_offset(struct radix_tree_node *parent,
+ void **slot)
{
- return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
+ return slot - parent->slots;
}
-static inline void *indirect_to_ptr(void *ptr)
+static unsigned int radix_tree_descend(struct radix_tree_node *parent,
+ struct radix_tree_node **nodep, unsigned long index)
{
- return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
+ unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
+ void **entry = rcu_dereference_raw(parent->slots[offset]);
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ if (radix_tree_is_internal_node(entry)) {
+ unsigned long siboff = get_slot_offset(parent, entry);
+ if (siboff < RADIX_TREE_MAP_SIZE) {
+ offset = siboff;
+ entry = rcu_dereference_raw(parent->slots[offset]);
+ }
+ }
+#endif
+
+ *nodep = (void *)entry;
+ return offset;
}
static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
@@ -108,7 +142,7 @@ static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
}
-static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
+static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
}
@@ -120,7 +154,12 @@ static inline void root_tag_clear_all(struct radix_tree_root *root)
static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
{
- return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+ return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+}
+
+static inline unsigned root_tags_get(struct radix_tree_root *root)
+{
+ return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT;
}
/*
@@ -129,7 +168,7 @@ static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
*/
static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
{
- int idx;
+ unsigned idx;
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (node->tags[tag][idx])
return 1;
@@ -173,38 +212,45 @@ radix_tree_find_next_bit(const unsigned long *addr,
return size;
}
-#if 0
-static void dump_node(void *slot, int height, int offset)
+#ifndef __KERNEL__
+static void dump_node(struct radix_tree_node *node, unsigned long index)
{
- struct radix_tree_node *node;
- int i;
+ unsigned long i;
- if (!slot)
- return;
+ pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
+ node, node->offset,
+ node->tags[0][0], node->tags[1][0], node->tags[2][0],
+ node->shift, node->count, node->parent);
- if (height == 0) {
- pr_debug("radix entry %p offset %d\n", slot, offset);
- return;
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ unsigned long first = index | (i << node->shift);
+ unsigned long last = first | ((1UL << node->shift) - 1);
+ void *entry = node->slots[i];
+ if (!entry)
+ continue;
+ if (is_sibling_entry(node, entry)) {
+ pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n",
+ entry, i,
+ *(void **)entry_to_node(entry),
+ first, last);
+ } else if (!radix_tree_is_internal_node(entry)) {
+ pr_debug("radix entry %p offset %ld indices %ld-%ld\n",
+ entry, i, first, last);
+ } else {
+ dump_node(entry_to_node(entry), first);
+ }
}
-
- node = indirect_to_ptr(slot);
- pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n",
- slot, offset, node->tags[0][0], node->tags[1][0],
- node->tags[2][0], node->path, node->count, node->parent);
-
- for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
- dump_node(node->slots[i], height - 1, i);
}
/* For debug */
static void radix_tree_dump(struct radix_tree_root *root)
{
- pr_debug("radix root: %p height %d rnode %p tags %x\n",
- root, root->height, root->rnode,
+ pr_debug("radix root: %p rnode %p tags %x\n",
+ root, root->rnode,
root->gfp_mask >> __GFP_BITS_SHIFT);
- if (!radix_tree_is_indirect_ptr(root->rnode))
+ if (!radix_tree_is_internal_node(root->rnode))
return;
- dump_node(root->rnode, root->height, 0);
+ dump_node(entry_to_node(root->rnode), 0);
}
#endif
@@ -219,19 +265,20 @@ radix_tree_node_alloc(struct radix_tree_root *root)
gfp_t gfp_mask = root_gfp_mask(root);
/*
- * Preload code isn't irq safe and it doesn't make sence to use
- * preloading in the interrupt anyway as all the allocations have to
- * be atomic. So just do normal allocation when in interrupt.
+ * Preload code isn't irq safe and it doesn't make sense to use
+ * preloading during an interrupt anyway as all the allocations have
+ * to be atomic. So just do normal allocation when in interrupt.
*/
if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
struct radix_tree_preload *rtp;
/*
* Even if the caller has preloaded, try to allocate from the
- * cache first for the new node to get accounted.
+ * cache first for the new node to get accounted to the memory
+ * cgroup.
*/
ret = kmem_cache_alloc(radix_tree_node_cachep,
- gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN);
+ gfp_mask | __GFP_NOWARN);
if (ret)
goto out;
@@ -254,10 +301,9 @@ radix_tree_node_alloc(struct radix_tree_root *root)
kmemleak_update_trace(ret);
goto out;
}
- ret = kmem_cache_alloc(radix_tree_node_cachep,
- gfp_mask | __GFP_ACCOUNT);
+ ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
out:
- BUG_ON(radix_tree_is_indirect_ptr(ret));
+ BUG_ON(radix_tree_is_internal_node(ret));
return ret;
}
@@ -302,6 +348,12 @@ static int __radix_tree_preload(gfp_t gfp_mask)
struct radix_tree_node *node;
int ret = -ENOMEM;
+ /*
+ * Nodes preloaded by one cgroup can be be used by another cgroup, so
+ * they should never be accounted to any particular memory cgroup.
+ */
+ gfp_mask &= ~__GFP_ACCOUNT;
+
preempt_disable();
rtp = this_cpu_ptr(&radix_tree_preloads);
while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
@@ -357,38 +409,58 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
EXPORT_SYMBOL(radix_tree_maybe_preload);
/*
- * Return the maximum key which can be store into a
- * radix tree with height HEIGHT.
+ * The maximum index which can be stored in a radix tree
*/
-static inline unsigned long radix_tree_maxindex(unsigned int height)
+static inline unsigned long shift_maxindex(unsigned int shift)
+{
+ return (RADIX_TREE_MAP_SIZE << shift) - 1;
+}
+
+static inline unsigned long node_maxindex(struct radix_tree_node *node)
+{
+ return shift_maxindex(node->shift);
+}
+
+static unsigned radix_tree_load_root(struct radix_tree_root *root,
+ struct radix_tree_node **nodep, unsigned long *maxindex)
{
- return height_to_maxindex[height];
+ struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
+
+ *nodep = node;
+
+ if (likely(radix_tree_is_internal_node(node))) {
+ node = entry_to_node(node);
+ *maxindex = node_maxindex(node);
+ return node->shift + RADIX_TREE_MAP_SHIFT;
+ }
+
+ *maxindex = 0;
+ return 0;
}
/*
* Extend a radix tree so it can store key @index.
*/
static int radix_tree_extend(struct radix_tree_root *root,
- unsigned long index, unsigned order)
+ unsigned long index, unsigned int shift)
{
- struct radix_tree_node *node;
struct radix_tree_node *slot;
- unsigned int height;
+ unsigned int maxshift;
int tag;
- /* Figure out what the height should be. */
- height = root->height + 1;
- while (index > radix_tree_maxindex(height))
- height++;
+ /* Figure out what the shift should be. */
+ maxshift = shift;
+ while (index > shift_maxindex(maxshift))
+ maxshift += RADIX_TREE_MAP_SHIFT;
- if ((root->rnode == NULL) && (order == 0)) {
- root->height = height;
+ slot = root->rnode;
+ if (!slot)
goto out;
- }
do {
- unsigned int newheight;
- if (!(node = radix_tree_node_alloc(root)))
+ struct radix_tree_node *node = radix_tree_node_alloc(root);
+
+ if (!node)
return -ENOMEM;
/* Propagate the aggregated tag info into the new root */
@@ -397,25 +469,20 @@ static int radix_tree_extend(struct radix_tree_root *root,
tag_set(node, tag, 0);
}
- /* Increase the height. */
- newheight = root->height+1;
- BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
- node->path = newheight;
+ BUG_ON(shift > BITS_PER_LONG);
+ node->shift = shift;
+ node->offset = 0;
node->count = 1;
node->parent = NULL;
- slot = root->rnode;
- if (radix_tree_is_indirect_ptr(slot) && newheight > 1) {
- slot = indirect_to_ptr(slot);
- slot->parent = node;
- slot = ptr_to_indirect(slot);
- }
+ if (radix_tree_is_internal_node(slot))
+ entry_to_node(slot)->parent = node;
node->slots[0] = slot;
- node = ptr_to_indirect(node);
- rcu_assign_pointer(root->rnode, node);
- root->height = newheight;
- } while (height > root->height);
+ slot = node_to_entry(node);
+ rcu_assign_pointer(root->rnode, slot);
+ shift += RADIX_TREE_MAP_SHIFT;
+ } while (shift <= maxshift);
out:
- return 0;
+ return maxshift + RADIX_TREE_MAP_SHIFT;
}
/**
@@ -439,71 +506,70 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
unsigned order, struct radix_tree_node **nodep,
void ***slotp)
{
- struct radix_tree_node *node = NULL, *slot;
- unsigned int height, shift, offset;
- int error;
+ struct radix_tree_node *node = NULL, *child;
+ void **slot = (void **)&root->rnode;
+ unsigned long maxindex;
+ unsigned int shift, offset = 0;
+ unsigned long max = index | ((1UL << order) - 1);
- BUG_ON((0 < order) && (order < RADIX_TREE_MAP_SHIFT));
+ shift = radix_tree_load_root(root, &child, &maxindex);
/* Make sure the tree is high enough. */
- if (index > radix_tree_maxindex(root->height)) {
- error = radix_tree_extend(root, index, order);
- if (error)
+ if (max > maxindex) {
+ int error = radix_tree_extend(root, max, shift);
+ if (error < 0)
return error;
+ shift = error;
+ child = root->rnode;
+ if (order == shift)
+ shift += RADIX_TREE_MAP_SHIFT;
}
- slot = root->rnode;
-
- height = root->height;
- shift = height * RADIX_TREE_MAP_SHIFT;
-
- offset = 0; /* uninitialised var warning */
while (shift > order) {
- if (slot == NULL) {
+ shift -= RADIX_TREE_MAP_SHIFT;
+ if (child == NULL) {
/* Have to add a child node. */
- if (!(slot = radix_tree_node_alloc(root)))
+ child = radix_tree_node_alloc(root);
+ if (!child)
return -ENOMEM;
- slot->path = height;
- slot->parent = node;
- if (node) {
- rcu_assign_pointer(node->slots[offset],
- ptr_to_indirect(slot));
+ child->shift = shift;
+ child->offset = offset;
+ child->parent = node;
+ rcu_assign_pointer(*slot, node_to_entry(child));
+ if (node)
node->count++;
- slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
- } else
- rcu_assign_pointer(root->rnode,
- ptr_to_indirect(slot));
- } else if (!radix_tree_is_indirect_ptr(slot))
+ } else if (!radix_tree_is_internal_node(child))
break;
/* Go a level down */
- height--;
- shift -= RADIX_TREE_MAP_SHIFT;
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- node = indirect_to_ptr(slot);
- slot = node->slots[offset];
+ node = entry_to_node(child);
+ offset = radix_tree_descend(node, &child, index);
+ slot = &node->slots[offset];
}
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
/* Insert pointers to the canonical entry */
- if ((shift - order) > 0) {
- int i, n = 1 << (shift - order);
+ if (order > shift) {
+ unsigned i, n = 1 << (order - shift);
offset = offset & ~(n - 1);
- slot = ptr_to_indirect(&node->slots[offset]);
+ slot = &node->slots[offset];
+ child = node_to_entry(slot);
for (i = 0; i < n; i++) {
- if (node->slots[offset + i])
+ if (slot[i])
return -EEXIST;
}
for (i = 1; i < n; i++) {
- rcu_assign_pointer(node->slots[offset + i], slot);
+ rcu_assign_pointer(slot[i], child);
node->count++;
}
}
+#endif
if (nodep)
*nodep = node;
if (slotp)
- *slotp = node ? node->slots + offset : (void **)&root->rnode;
+ *slotp = slot;
return 0;
}
@@ -523,7 +589,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
void **slot;
int error;
- BUG_ON(radix_tree_is_indirect_ptr(item));
+ BUG_ON(radix_tree_is_internal_node(item));
error = __radix_tree_create(root, index, order, &node, &slot);
if (error)
@@ -533,12 +599,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
rcu_assign_pointer(*slot, item);
if (node) {
+ unsigned offset = get_slot_offset(node, slot);
node->count++;
- BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK));
- BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK));
+ BUG_ON(tag_get(node, 0, offset));
+ BUG_ON(tag_get(node, 1, offset));
+ BUG_ON(tag_get(node, 2, offset));
} else {
- BUG_ON(root_tag_get(root, 0));
- BUG_ON(root_tag_get(root, 1));
+ BUG_ON(root_tags_get(root));
}
return 0;
@@ -563,44 +630,25 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
struct radix_tree_node **nodep, void ***slotp)
{
struct radix_tree_node *node, *parent;
- unsigned int height, shift;
+ unsigned long maxindex;
void **slot;
- node = rcu_dereference_raw(root->rnode);
- if (node == NULL)
+ restart:
+ parent = NULL;
+ slot = (void **)&root->rnode;
+ radix_tree_load_root(root, &node, &maxindex);
+ if (index > maxindex)
return NULL;
- if (!radix_tree_is_indirect_ptr(node)) {
- if (index > 0)
- return NULL;
+ while (radix_tree_is_internal_node(node)) {
+ unsigned offset;
- if (nodep)
- *nodep = NULL;
- if (slotp)
- *slotp = (void **)&root->rnode;
- return node;
+ if (node == RADIX_TREE_RETRY)
+ goto restart;
+ parent = entry_to_node(node);
+ offset = radix_tree_descend(parent, &node, index);
+ slot = parent->slots + offset;
}
- node = indirect_to_ptr(node);
-
- height = node->path & RADIX_TREE_HEIGHT_MASK;
- if (index > radix_tree_maxindex(height))
- return NULL;
-
- shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
- do {
- parent = node;
- slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK);
- node = rcu_dereference_raw(*slot);
- if (node == NULL)
- return NULL;
- if (!radix_tree_is_indirect_ptr(node))
- break;
- node = indirect_to_ptr(node);
-
- shift -= RADIX_TREE_MAP_SHIFT;
- height--;
- } while (height > 0);
if (nodep)
*nodep = parent;
@@ -654,59 +702,72 @@ EXPORT_SYMBOL(radix_tree_lookup);
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
* @index: index key
- * @tag: tag index
+ * @tag: tag index
*
* Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
* corresponding to @index in the radix tree. From
* the root all the way down to the leaf node.
*
- * Returns the address of the tagged item. Setting a tag on a not-present
+ * Returns the address of the tagged item. Setting a tag on a not-present
* item is a bug.
*/
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, unsigned int tag)
{
- unsigned int height, shift;
- struct radix_tree_node *slot;
+ struct radix_tree_node *node, *parent;
+ unsigned long maxindex;
- height = root->height;
- BUG_ON(index > radix_tree_maxindex(height));
+ radix_tree_load_root(root, &node, &maxindex);
+ BUG_ON(index > maxindex);
- slot = indirect_to_ptr(root->rnode);
- shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ while (radix_tree_is_internal_node(node)) {
+ unsigned offset;
- while (height > 0) {
- int offset;
+ parent = entry_to_node(node);
+ offset = radix_tree_descend(parent, &node, index);
+ BUG_ON(!node);
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- if (!tag_get(slot, tag, offset))
- tag_set(slot, tag, offset);
- slot = slot->slots[offset];
- BUG_ON(slot == NULL);
- if (!radix_tree_is_indirect_ptr(slot))
- break;
- slot = indirect_to_ptr(slot);
- shift -= RADIX_TREE_MAP_SHIFT;
- height--;
+ if (!tag_get(parent, tag, offset))
+ tag_set(parent, tag, offset);
}
/* set the root's tag bit */
- if (slot && !root_tag_get(root, tag))
+ if (!root_tag_get(root, tag))
root_tag_set(root, tag);
- return slot;
+ return node;
}
EXPORT_SYMBOL(radix_tree_tag_set);
+static void node_tag_clear(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ unsigned int tag, unsigned int offset)
+{
+ while (node) {
+ if (!tag_get(node, tag, offset))
+ return;
+ tag_clear(node, tag, offset);
+ if (any_tag_set(node, tag))
+ return;
+
+ offset = node->offset;
+ node = node->parent;
+ }
+
+ /* clear the root's tag bit */
+ if (root_tag_get(root, tag))
+ root_tag_clear(root, tag);
+}
+
/**
* radix_tree_tag_clear - clear a tag on a radix tree node
* @root: radix tree root
* @index: index key
- * @tag: tag index
+ * @tag: tag index
*
* Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
- * corresponding to @index in the radix tree. If
- * this causes the leaf node to have no tags set then clear the tag in the
+ * corresponding to @index in the radix tree. If this causes
+ * the leaf node to have no tags set then clear the tag in the
* next-to-leaf node, etc.
*
* Returns the address of the tagged item on success, else NULL. ie:
@@ -715,52 +776,25 @@ EXPORT_SYMBOL(radix_tree_tag_set);
void *radix_tree_tag_clear(struct radix_tree_root *root,
unsigned long index, unsigned int tag)
{
- struct radix_tree_node *node = NULL;
- struct radix_tree_node *slot = NULL;
- unsigned int height, shift;
+ struct radix_tree_node *node, *parent;
+ unsigned long maxindex;
int uninitialized_var(offset);
- height = root->height;
- if (index > radix_tree_maxindex(height))
- goto out;
-
- shift = height * RADIX_TREE_MAP_SHIFT;
- slot = root->rnode;
+ radix_tree_load_root(root, &node, &maxindex);
+ if (index > maxindex)
+ return NULL;
- while (shift) {
- if (slot == NULL)
- goto out;
- if (!radix_tree_is_indirect_ptr(slot))
- break;
- slot = indirect_to_ptr(slot);
+ parent = NULL;
- shift -= RADIX_TREE_MAP_SHIFT;
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- node = slot;
- slot = slot->slots[offset];
+ while (radix_tree_is_internal_node(node)) {
+ parent = entry_to_node(node);
+ offset = radix_tree_descend(parent, &node, index);
}
- if (slot == NULL)
- goto out;
+ if (node)
+ node_tag_clear(root, parent, tag, offset);
- while (node) {
- if (!tag_get(node, tag, offset))
- goto out;
- tag_clear(node, tag, offset);
- if (any_tag_set(node, tag))
- goto out;
-
- index >>= RADIX_TREE_MAP_SHIFT;
- offset = index & RADIX_TREE_MAP_MASK;
- node = node->parent;
- }
-
- /* clear the root's tag bit */
- if (root_tag_get(root, tag))
- root_tag_clear(root, tag);
-
-out:
- return slot;
+ return node;
}
EXPORT_SYMBOL(radix_tree_tag_clear);
@@ -768,7 +802,7 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
* radix_tree_tag_get - get a tag on a radix tree node
* @root: radix tree root
* @index: index key
- * @tag: tag index (< RADIX_TREE_MAX_TAGS)
+ * @tag: tag index (< RADIX_TREE_MAX_TAGS)
*
* Return values:
*
@@ -782,48 +816,44 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
int radix_tree_tag_get(struct radix_tree_root *root,
unsigned long index, unsigned int tag)
{
- unsigned int height, shift;
- struct radix_tree_node *node;
+ struct radix_tree_node *node, *parent;
+ unsigned long maxindex;
- /* check the root's tag bit */
if (!root_tag_get(root, tag))
return 0;
- node = rcu_dereference_raw(root->rnode);
- if (node == NULL)
+ radix_tree_load_root(root, &node, &maxindex);
+ if (index > maxindex)
return 0;
-
- if (!radix_tree_is_indirect_ptr(node))
- return (index == 0);
- node = indirect_to_ptr(node);
-
- height = node->path & RADIX_TREE_HEIGHT_MASK;
- if (index > radix_tree_maxindex(height))
+ if (node == NULL)
return 0;
- shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ while (radix_tree_is_internal_node(node)) {
+ unsigned offset;
- for ( ; ; ) {
- int offset;
+ parent = entry_to_node(node);
+ offset = radix_tree_descend(parent, &node, index);
- if (node == NULL)
+ if (!node)
return 0;
- node = indirect_to_ptr(node);
-
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- if (!tag_get(node, tag, offset))
+ if (!tag_get(parent, tag, offset))
return 0;
- if (height == 1)
- return 1;
- node = rcu_dereference_raw(node->slots[offset]);
- if (!radix_tree_is_indirect_ptr(node))
- return 1;
- shift -= RADIX_TREE_MAP_SHIFT;
- height--;
+ if (node == RADIX_TREE_RETRY)
+ break;
}
+
+ return 1;
}
EXPORT_SYMBOL(radix_tree_tag_get);
+static inline void __set_iter_shift(struct radix_tree_iter *iter,
+ unsigned int shift)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ iter->shift = shift;
+#endif
+}
+
/**
* radix_tree_next_chunk - find next chunk of slots for iteration
*
@@ -835,9 +865,9 @@ EXPORT_SYMBOL(radix_tree_tag_get);
void **radix_tree_next_chunk(struct radix_tree_root *root,
struct radix_tree_iter *iter, unsigned flags)
{
- unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
- struct radix_tree_node *rnode, *node;
- unsigned long index, offset, height;
+ unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
+ struct radix_tree_node *node, *child;
+ unsigned long index, offset, maxindex;
if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
return NULL;
@@ -855,33 +885,28 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
if (!index && iter->index)
return NULL;
- rnode = rcu_dereference_raw(root->rnode);
- if (radix_tree_is_indirect_ptr(rnode)) {
- rnode = indirect_to_ptr(rnode);
- } else if (rnode && !index) {
+ restart:
+ radix_tree_load_root(root, &child, &maxindex);
+ if (index > maxindex)
+ return NULL;
+ if (!child)
+ return NULL;
+
+ if (!radix_tree_is_internal_node(child)) {
/* Single-slot tree */
- iter->index = 0;
- iter->next_index = 1;
+ iter->index = index;
+ iter->next_index = maxindex + 1;
iter->tags = 1;
+ __set_iter_shift(iter, 0);
return (void **)&root->rnode;
- } else
- return NULL;
-
-restart:
- height = rnode->path & RADIX_TREE_HEIGHT_MASK;
- shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
- offset = index >> shift;
+ }
- /* Index outside of the tree */
- if (offset >= RADIX_TREE_MAP_SIZE)
- return NULL;
+ do {
+ node = entry_to_node(child);
+ offset = radix_tree_descend(node, &child, index);
- node = rnode;
- while (1) {
- struct radix_tree_node *slot;
if ((flags & RADIX_TREE_ITER_TAGGED) ?
- !test_bit(offset, node->tags[tag]) :
- !node->slots[offset]) {
+ !tag_get(node, tag, offset) : !child) {
/* Hole detected */
if (flags & RADIX_TREE_ITER_CONTIG)
return NULL;
@@ -893,35 +918,30 @@ restart:
offset + 1);
else
while (++offset < RADIX_TREE_MAP_SIZE) {
- if (node->slots[offset])
+ void *slot = node->slots[offset];
+ if (is_sibling_entry(node, slot))
+ continue;
+ if (slot)
break;
}
- index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1);
- index += offset << shift;
+ index &= ~node_maxindex(node);
+ index += offset << node->shift;
/* Overflow after ~0UL */
if (!index)
return NULL;
if (offset == RADIX_TREE_MAP_SIZE)
goto restart;
+ child = rcu_dereference_raw(node->slots[offset]);
}
- /* This is leaf-node */
- if (!shift)
- break;
-
- slot = rcu_dereference_raw(node->slots[offset]);
- if (slot == NULL)
+ if ((child == NULL) || (child == RADIX_TREE_RETRY))
goto restart;
- if (!radix_tree_is_indirect_ptr(slot))
- break;
- node = indirect_to_ptr(slot);
- shift -= RADIX_TREE_MAP_SHIFT;
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- }
+ } while (radix_tree_is_internal_node(child));
/* Update the iterator state */
- iter->index = index;
- iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1;
+ iter->index = (index &~ node_maxindex(node)) | (offset << node->shift);
+ iter->next_index = (index | node_maxindex(node)) + 1;
+ __set_iter_shift(iter, node->shift);
/* Construct iter->tags bit-mask from node->tags[tag] array */
if (flags & RADIX_TREE_ITER_TAGGED) {
@@ -967,7 +987,7 @@ EXPORT_SYMBOL(radix_tree_next_chunk);
* set is outside the range we are scanning. This reults in dangling tags and
* can lead to problems with later tag operations (e.g. livelocks on lookups).
*
- * The function returns number of leaves where the tag was set and sets
+ * The function returns the number of leaves where the tag was set and sets
* *first_indexp to the first unscanned index.
* WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
* be prepared to handle that.
@@ -977,14 +997,13 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
unsigned long nr_to_tag,
unsigned int iftag, unsigned int settag)
{
- unsigned int height = root->height;
- struct radix_tree_node *node = NULL;
- struct radix_tree_node *slot;
- unsigned int shift;
+ struct radix_tree_node *parent, *node, *child;
+ unsigned long maxindex;
unsigned long tagged = 0;
unsigned long index = *first_indexp;
- last_index = min(last_index, radix_tree_maxindex(height));
+ radix_tree_load_root(root, &child, &maxindex);
+ last_index = min(last_index, maxindex);
if (index > last_index)
return 0;
if (!nr_to_tag)
@@ -993,80 +1012,62 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
*first_indexp = last_index + 1;
return 0;
}
- if (height == 0) {
+ if (!radix_tree_is_internal_node(child)) {
*first_indexp = last_index + 1;
root_tag_set(root, settag);
return 1;
}
- shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
- slot = indirect_to_ptr(root->rnode);
+ node = entry_to_node(child);
for (;;) {
- unsigned long upindex;
- int offset;
-
- offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- if (!slot->slots[offset])
+ unsigned offset = radix_tree_descend(node, &child, index);
+ if (!child)
goto next;
- if (!tag_get(slot, iftag, offset))
+ if (!tag_get(node, iftag, offset))
goto next;
- if (shift) {
- node = slot;
- slot = slot->slots[offset];
- if (radix_tree_is_indirect_ptr(slot)) {
- slot = indirect_to_ptr(slot);
- shift -= RADIX_TREE_MAP_SHIFT;
- continue;
- } else {
- slot = node;
- node = node->parent;
- }
+ /* Sibling slots never have tags set on them */
+ if (radix_tree_is_internal_node(child)) {
+ node = entry_to_node(child);
+ continue;
}
/* tag the leaf */
- tagged += 1 << shift;
- tag_set(slot, settag, offset);
+ tagged++;
+ tag_set(node, settag, offset);
/* walk back up the path tagging interior nodes */
- upindex = index;
- while (node) {
- upindex >>= RADIX_TREE_MAP_SHIFT;
- offset = upindex & RADIX_TREE_MAP_MASK;
-
+ parent = node;
+ for (;;) {
+ offset = parent->offset;
+ parent = parent->parent;
+ if (!parent)
+ break;
/* stop if we find a node with the tag already set */
- if (tag_get(node, settag, offset))
+ if (tag_get(parent, settag, offset))
break;
- tag_set(node, settag, offset);
- node = node->parent;
+ tag_set(parent, settag, offset);
}
-
- /*
- * Small optimization: now clear that node pointer.
- * Since all of this slot's ancestors now have the tag set
- * from setting it above, we have no further need to walk
- * back up the tree setting tags, until we update slot to
- * point to another radix_tree_node.
- */
- node = NULL;
-
-next:
- /* Go to next item at level determined by 'shift' */
- index = ((index >> shift) + 1) << shift;
+ next:
+ /* Go to next entry in node */
+ index = ((index >> node->shift) + 1) << node->shift;
/* Overflow can happen when last_index is ~0UL... */
if (index > last_index || !index)
break;
- if (tagged >= nr_to_tag)
- break;
- while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) {
+ offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
+ while (offset == 0) {
/*
* We've fully scanned this node. Go up. Because
* last_index is guaranteed to be in the tree, what
* we do below cannot wander astray.
*/
- slot = slot->parent;
- shift += RADIX_TREE_MAP_SHIFT;
+ node = node->parent;
+ offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
}
+ if (is_sibling_entry(node, node->slots[offset]))
+ goto next;
+ if (tagged >= nr_to_tag)
+ break;
}
/*
* We need not to tag the root tag if there is no tag which is set with
@@ -1095,9 +1096,10 @@ EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
*
* Like radix_tree_lookup, radix_tree_gang_lookup may be called under
* rcu_read_lock. In this case, rather than the returned results being
- * an atomic snapshot of the tree at a single point in time, the semantics
- * of an RCU protected gang lookup are as though multiple radix_tree_lookups
- * have been issued in individual locks, and results stored in 'results'.
+ * an atomic snapshot of the tree at a single point in time, the
+ * semantics of an RCU protected gang lookup are as though multiple
+ * radix_tree_lookups have been issued in individual locks, and results
+ * stored in 'results'.
*/
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
@@ -1114,7 +1116,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
results[ret] = rcu_dereference_raw(*slot);
if (!results[ret])
continue;
- if (radix_tree_is_indirect_ptr(results[ret])) {
+ if (radix_tree_is_internal_node(results[ret])) {
slot = radix_tree_iter_retry(&iter);
continue;
}
@@ -1197,7 +1199,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
results[ret] = rcu_dereference_raw(*slot);
if (!results[ret])
continue;
- if (radix_tree_is_indirect_ptr(results[ret])) {
+ if (radix_tree_is_internal_node(results[ret])) {
slot = radix_tree_iter_retry(&iter);
continue;
}
@@ -1247,58 +1249,48 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
#include <linux/sched.h> /* for cond_resched() */
+struct locate_info {
+ unsigned long found_index;
+ bool stop;
+};
+
/*
* This linear search is at present only useful to shmem_unuse_inode().
*/
static unsigned long __locate(struct radix_tree_node *slot, void *item,
- unsigned long index, unsigned long *found_index)
+ unsigned long index, struct locate_info *info)
{
- unsigned int shift, height;
unsigned long i;
- height = slot->path & RADIX_TREE_HEIGHT_MASK;
- shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
- for ( ; height > 1; height--) {
- i = (index >> shift) & RADIX_TREE_MAP_MASK;
- for (;;) {
- if (slot->slots[i] != NULL)
- break;
- index &= ~((1UL << shift) - 1);
- index += 1UL << shift;
- if (index == 0)
- goto out; /* 32-bit wraparound */
- i++;
- if (i == RADIX_TREE_MAP_SIZE)
+ do {
+ unsigned int shift = slot->shift;
+
+ for (i = (index >> shift) & RADIX_TREE_MAP_MASK;
+ i < RADIX_TREE_MAP_SIZE;
+ i++, index += (1UL << shift)) {
+ struct radix_tree_node *node =
+ rcu_dereference_raw(slot->slots[i]);
+ if (node == RADIX_TREE_RETRY)
goto out;
- }
-
- slot = rcu_dereference_raw(slot->slots[i]);
- if (slot == NULL)
- goto out;
- if (!radix_tree_is_indirect_ptr(slot)) {
- if (slot == item) {
- *found_index = index + i;
- index = 0;
- } else {
- index += shift;
+ if (!radix_tree_is_internal_node(node)) {
+ if (node == item) {
+ info->found_index = index;
+ info->stop = true;
+ goto out;
+ }
+ continue;
}
- goto out;
+ node = entry_to_node(node);
+ if (is_sibling_entry(slot, node))
+ continue;
+ slot = node;
+ break;
}
- slot = indirect_to_ptr(slot);
- shift -= RADIX_TREE_MAP_SHIFT;
- }
+ } while (i < RADIX_TREE_MAP_SIZE);
- /* Bottom level: check items */
- for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
- if (slot->slots[i] == item) {
- *found_index = index + i;
- index = 0;
- goto out;
- }
- }
- index += RADIX_TREE_MAP_SIZE;
out:
+ if ((index == 0) && (i == RADIX_TREE_MAP_SIZE))
+ info->stop = true;
return index;
}
@@ -1316,32 +1308,35 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
struct radix_tree_node *node;
unsigned long max_index;
unsigned long cur_index = 0;
- unsigned long found_index = -1;
+ struct locate_info info = {
+ .found_index = -1,
+ .stop = false,
+ };
do {
rcu_read_lock();
node = rcu_dereference_raw(root->rnode);
- if (!radix_tree_is_indirect_ptr(node)) {
+ if (!radix_tree_is_internal_node(node)) {
rcu_read_unlock();
if (node == item)
- found_index = 0;
+ info.found_index = 0;
break;
}
- node = indirect_to_ptr(node);
- max_index = radix_tree_maxindex(node->path &
- RADIX_TREE_HEIGHT_MASK);
+ node = entry_to_node(node);
+
+ max_index = node_maxindex(node);
if (cur_index > max_index) {
rcu_read_unlock();
break;
}
- cur_index = __locate(node, item, cur_index, &found_index);
+ cur_index = __locate(node, item, cur_index, &info);
rcu_read_unlock();
cond_resched();
- } while (cur_index != 0 && cur_index <= max_index);
+ } while (!info.stop && cur_index <= max_index);
- return found_index;
+ return info.found_index;
}
#else
unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
@@ -1351,47 +1346,45 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
#endif /* CONFIG_SHMEM && CONFIG_SWAP */
/**
- * radix_tree_shrink - shrink height of a radix tree to minimal
+ * radix_tree_shrink - shrink radix tree to minimum height
* @root radix tree root
*/
-static inline void radix_tree_shrink(struct radix_tree_root *root)
+static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
- /* try to shrink tree height */
- while (root->height > 0) {
- struct radix_tree_node *to_free = root->rnode;
- struct radix_tree_node *slot;
+ bool shrunk = false;
- BUG_ON(!radix_tree_is_indirect_ptr(to_free));
- to_free = indirect_to_ptr(to_free);
+ for (;;) {
+ struct radix_tree_node *node = root->rnode;
+ struct radix_tree_node *child;
+
+ if (!radix_tree_is_internal_node(node))
+ break;
+ node = entry_to_node(node);
/*
* The candidate node has more than one child, or its child
- * is not at the leftmost slot, or it is a multiorder entry,
- * we cannot shrink.
+ * is not at the leftmost slot, or the child is a multiorder
+ * entry, we cannot shrink.
*/
- if (to_free->count != 1)
+ if (node->count != 1)
+ break;
+ child = node->slots[0];
+ if (!child)
break;
- slot = to_free->slots[0];
- if (!slot)
+ if (!radix_tree_is_internal_node(child) && node->shift)
break;
+ if (radix_tree_is_internal_node(child))
+ entry_to_node(child)->parent = NULL;
+
/*
* We don't need rcu_assign_pointer(), since we are simply
* moving the node from one part of the tree to another: if it
* was safe to dereference the old pointer to it
- * (to_free->slots[0]), it will be safe to dereference the new
+ * (node->slots[0]), it will be safe to dereference the new
* one (root->rnode) as far as dependent read barriers go.
*/
- if (root->height > 1) {
- if (!radix_tree_is_indirect_ptr(slot))
- break;
-
- slot = indirect_to_ptr(slot);
- slot->parent = NULL;
- slot = ptr_to_indirect(slot);
- }
- root->rnode = slot;
- root->height--;
+ root->rnode = child;
/*
* We have a dilemma here. The node's slot[0] must not be
@@ -1403,7 +1396,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
* their slot to become empty sooner or later.
*
* For example, lockless pagecache will look up a slot, deref
- * the page pointer, and if the page is 0 refcount it means it
+ * the page pointer, and if the page has 0 refcount it means it
* was concurrently deleted from pagecache so try the deref
* again. Fortunately there is already a requirement for logic
* to retry the entire slot lookup -- the indirect pointer
@@ -1411,12 +1404,14 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
* also results in a stale slot). So tag the slot as indirect
* to force callers to retry.
*/
- if (root->height == 0)
- *((unsigned long *)&to_free->slots[0]) |=
- RADIX_TREE_INDIRECT_PTR;
+ if (!radix_tree_is_internal_node(child))
+ node->slots[0] = RADIX_TREE_RETRY;
- radix_tree_node_free(to_free);
+ radix_tree_node_free(node);
+ shrunk = true;
}
+
+ return shrunk;
}
/**
@@ -1439,24 +1434,17 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *parent;
if (node->count) {
- if (node == indirect_to_ptr(root->rnode)) {
- radix_tree_shrink(root);
- if (root->height == 0)
- deleted = true;
- }
+ if (node == entry_to_node(root->rnode))
+ deleted |= radix_tree_shrink(root);
return deleted;
}
parent = node->parent;
if (parent) {
- unsigned int offset;
-
- offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
- parent->slots[offset] = NULL;
+ parent->slots[node->offset] = NULL;
parent->count--;
} else {
root_tag_clear_all(root);
- root->height = 0;
root->rnode = NULL;
}
@@ -1469,6 +1457,20 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
return deleted;
}
+static inline void delete_sibling_entries(struct radix_tree_node *node,
+ void *ptr, unsigned offset)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+ int i;
+ for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[offset + i] != ptr)
+ break;
+ node->slots[offset + i] = NULL;
+ node->count--;
+ }
+#endif
+}
+
/**
* radix_tree_delete_item - delete an item from a radix tree
* @root: radix tree root
@@ -1484,7 +1486,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
unsigned long index, void *item)
{
struct radix_tree_node *node;
- unsigned int offset, i;
+ unsigned int offset;
void **slot;
void *entry;
int tag;
@@ -1502,24 +1504,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
return entry;
}
- offset = index & RADIX_TREE_MAP_MASK;
+ offset = get_slot_offset(node, slot);
- /*
- * Clear all tags associated with the item to be deleted.
- * This way of doing it would be inefficient, but seldom is any set.
- */
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
- if (tag_get(node, tag, offset))
- radix_tree_tag_clear(root, index, tag);
- }
+ /* Clear all tags associated with the item to be deleted. */
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ node_tag_clear(root, node, tag, offset);
- /* Delete any sibling slots pointing to this slot */
- for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
- if (node->slots[offset + i] != ptr_to_indirect(slot))
- break;
- node->slots[offset + i] = NULL;
- node->count--;
- }
+ delete_sibling_entries(node, node_to_entry(slot), offset);
node->slots[offset] = NULL;
node->count--;
@@ -1544,6 +1535,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_delete);
+struct radix_tree_node *radix_tree_replace_clear_tags(
+ struct radix_tree_root *root,
+ unsigned long index, void *entry)
+{
+ struct radix_tree_node *node;
+ void **slot;
+
+ __radix_tree_lookup(root, index, &node, &slot);
+
+ if (node) {
+ unsigned int tag, offset = get_slot_offset(node, slot);
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ node_tag_clear(root, node, tag, offset);
+ } else {
+ /* Clear root node tags */
+ root->gfp_mask &= __GFP_BITS_MASK;
+ }
+
+ radix_tree_replace_slot(slot, entry);
+ return node;
+}
+
/**
* radix_tree_tagged - test whether any items in the tree are tagged
* @root: radix tree root
@@ -1564,45 +1577,24 @@ radix_tree_node_ctor(void *arg)
INIT_LIST_HEAD(&node->private_list);
}
-static __init unsigned long __maxindex(unsigned int height)
-{
- unsigned int width = height * RADIX_TREE_MAP_SHIFT;
- int shift = RADIX_TREE_INDEX_BITS - width;
-
- if (shift < 0)
- return ~0UL;
- if (shift >= BITS_PER_LONG)
- return 0UL;
- return ~0UL >> shift;
-}
-
-static __init void radix_tree_init_maxindex(void)
-{
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
- height_to_maxindex[i] = __maxindex(i);
-}
-
static int radix_tree_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+ unsigned long action, void *hcpu)
{
- int cpu = (long)hcpu;
- struct radix_tree_preload *rtp;
- struct radix_tree_node *node;
-
- /* Free per-cpu pool of perloaded nodes */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- rtp = &per_cpu(radix_tree_preloads, cpu);
- while (rtp->nr) {
+ int cpu = (long)hcpu;
+ struct radix_tree_preload *rtp;
+ struct radix_tree_node *node;
+
+ /* Free per-cpu pool of preloaded nodes */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ rtp = &per_cpu(radix_tree_preloads, cpu);
+ while (rtp->nr) {
node = rtp->nodes;
rtp->nodes = node->private_data;
kmem_cache_free(radix_tree_node_cachep, node);
rtp->nr--;
- }
- }
- return NOTIFY_OK;
+ }
+ }
+ return NOTIFY_OK;
}
void __init radix_tree_init(void)
@@ -1611,6 +1603,5 @@ void __init radix_tree_init(void)
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
radix_tree_node_ctor);
- radix_tree_init_maxindex();
hotcpu_notifier(radix_tree_callback, 0);
}
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index cc808707d..5d845ffd7 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -487,6 +487,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
* rhashtable_walk_init - Initialise an iterator
* @ht: Table to walk over
* @iter: Hash table Iterator
+ * @gfp: GFP flags for allocations
*
* This function prepares a hash table walk.
*
@@ -504,14 +505,15 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
* You must call rhashtable_walk_exit if this function returns
* successfully.
*/
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter)
+int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
+ gfp_t gfp)
{
iter->ht = ht;
iter->p = NULL;
iter->slot = 0;
iter->skip = 0;
- iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL);
+ iter->walker = kmalloc(sizeof(*iter->walker), gfp);
if (!iter->walker)
return -ENOMEM;
diff --git a/lib/sg_pool.c b/lib/sg_pool.c
new file mode 100644
index 000000000..6dd30615a
--- /dev/null
+++ b/lib/sg_pool.c
@@ -0,0 +1,172 @@
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/mempool.h>
+#include <linux/slab.h>
+
+#define SG_MEMPOOL_NR ARRAY_SIZE(sg_pools)
+#define SG_MEMPOOL_SIZE 2
+
+struct sg_pool {
+ size_t size;
+ char *name;
+ struct kmem_cache *slab;
+ mempool_t *pool;
+};
+
+#define SP(x) { .size = x, "sgpool-" __stringify(x) }
+#if (SG_CHUNK_SIZE < 32)
+#error SG_CHUNK_SIZE is too small (must be 32 or greater)
+#endif
+static struct sg_pool sg_pools[] = {
+ SP(8),
+ SP(16),
+#if (SG_CHUNK_SIZE > 32)
+ SP(32),
+#if (SG_CHUNK_SIZE > 64)
+ SP(64),
+#if (SG_CHUNK_SIZE > 128)
+ SP(128),
+#if (SG_CHUNK_SIZE > 256)
+#error SG_CHUNK_SIZE is too large (256 MAX)
+#endif
+#endif
+#endif
+#endif
+ SP(SG_CHUNK_SIZE)
+};
+#undef SP
+
+static inline unsigned int sg_pool_index(unsigned short nents)
+{
+ unsigned int index;
+
+ BUG_ON(nents > SG_CHUNK_SIZE);
+
+ if (nents <= 8)
+ index = 0;
+ else
+ index = get_count_order(nents) - 3;
+
+ return index;
+}
+
+static void sg_pool_free(struct scatterlist *sgl, unsigned int nents)
+{
+ struct sg_pool *sgp;
+
+ sgp = sg_pools + sg_pool_index(nents);
+ mempool_free(sgl, sgp->pool);
+}
+
+static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask)
+{
+ struct sg_pool *sgp;
+
+ sgp = sg_pools + sg_pool_index(nents);
+ return mempool_alloc(sgp->pool, gfp_mask);
+}
+
+/**
+ * sg_free_table_chained - Free a previously mapped sg table
+ * @table: The sg table header to use
+ * @first_chunk: was first_chunk not NULL in sg_alloc_table_chained?
+ *
+ * Description:
+ * Free an sg table previously allocated and setup with
+ * sg_alloc_table_chained().
+ *
+ **/
+void sg_free_table_chained(struct sg_table *table, bool first_chunk)
+{
+ if (first_chunk && table->orig_nents <= SG_CHUNK_SIZE)
+ return;
+ __sg_free_table(table, SG_CHUNK_SIZE, first_chunk, sg_pool_free);
+}
+EXPORT_SYMBOL_GPL(sg_free_table_chained);
+
+/**
+ * sg_alloc_table_chained - Allocate and chain SGLs in an sg table
+ * @table: The sg table header to use
+ * @nents: Number of entries in sg list
+ * @first_chunk: first SGL
+ *
+ * Description:
+ * Allocate and chain SGLs in an sg table. If @nents@ is larger than
+ * SG_CHUNK_SIZE a chained sg table will be setup.
+ *
+ **/
+int sg_alloc_table_chained(struct sg_table *table, int nents,
+ struct scatterlist *first_chunk)
+{
+ int ret;
+
+ BUG_ON(!nents);
+
+ if (first_chunk) {
+ if (nents <= SG_CHUNK_SIZE) {
+ table->nents = table->orig_nents = nents;
+ sg_init_table(table->sgl, nents);
+ return 0;
+ }
+ }
+
+ ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE,
+ first_chunk, GFP_ATOMIC, sg_pool_alloc);
+ if (unlikely(ret))
+ sg_free_table_chained(table, (bool)first_chunk);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sg_alloc_table_chained);
+
+static __init int sg_pool_init(void)
+{
+ int i;
+
+ for (i = 0; i < SG_MEMPOOL_NR; i++) {
+ struct sg_pool *sgp = sg_pools + i;
+ int size = sgp->size * sizeof(struct scatterlist);
+
+ sgp->slab = kmem_cache_create(sgp->name, size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!sgp->slab) {
+ printk(KERN_ERR "SG_POOL: can't init sg slab %s\n",
+ sgp->name);
+ goto cleanup_sdb;
+ }
+
+ sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
+ sgp->slab);
+ if (!sgp->pool) {
+ printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n",
+ sgp->name);
+ goto cleanup_sdb;
+ }
+ }
+
+ return 0;
+
+cleanup_sdb:
+ for (i = 0; i < SG_MEMPOOL_NR; i++) {
+ struct sg_pool *sgp = sg_pools + i;
+ if (sgp->pool)
+ mempool_destroy(sgp->pool);
+ if (sgp->slab)
+ kmem_cache_destroy(sgp->slab);
+ }
+
+ return -ENOMEM;
+}
+
+static __exit void sg_pool_exit(void)
+{
+ int i;
+
+ for (i = 0; i < SG_MEMPOOL_NR; i++) {
+ struct sg_pool *sgp = sg_pools + i;
+ mempool_destroy(sgp->pool);
+ kmem_cache_destroy(sgp->slab);
+ }
+}
+
+module_init(sg_pool_init);
+module_exit(sg_pool_exit);
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 5c88204b6..ecaac2c05 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -10,6 +10,10 @@
#include <linux/export.h>
#include <linux/ctype.h>
#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/string.h>
#include <linux/string_helpers.h>
@@ -534,3 +538,91 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
return p - dst;
}
EXPORT_SYMBOL(string_escape_mem);
+
+/*
+ * Return an allocated string that has been escaped of special characters
+ * and double quotes, making it safe to log in quotes.
+ */
+char *kstrdup_quotable(const char *src, gfp_t gfp)
+{
+ size_t slen, dlen;
+ char *dst;
+ const int flags = ESCAPE_HEX;
+ const char esc[] = "\f\n\r\t\v\a\e\\\"";
+
+ if (!src)
+ return NULL;
+ slen = strlen(src);
+
+ dlen = string_escape_mem(src, slen, NULL, 0, flags, esc);
+ dst = kmalloc(dlen + 1, gfp);
+ if (!dst)
+ return NULL;
+
+ WARN_ON(string_escape_mem(src, slen, dst, dlen, flags, esc) != dlen);
+ dst[dlen] = '\0';
+
+ return dst;
+}
+EXPORT_SYMBOL_GPL(kstrdup_quotable);
+
+/*
+ * Returns allocated NULL-terminated string containing process
+ * command line, with inter-argument NULLs replaced with spaces,
+ * and other special characters escaped.
+ */
+char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp)
+{
+ char *buffer, *quoted;
+ int i, res;
+
+ buffer = kmalloc(PAGE_SIZE, GFP_TEMPORARY);
+ if (!buffer)
+ return NULL;
+
+ res = get_cmdline(task, buffer, PAGE_SIZE - 1);
+ buffer[res] = '\0';
+
+ /* Collapse trailing NULLs, leave res pointing to last non-NULL. */
+ while (--res >= 0 && buffer[res] == '\0')
+ ;
+
+ /* Replace inter-argument NULLs. */
+ for (i = 0; i <= res; i++)
+ if (buffer[i] == '\0')
+ buffer[i] = ' ';
+
+ /* Make sure result is printable. */
+ quoted = kstrdup_quotable(buffer, gfp);
+ kfree(buffer);
+ return quoted;
+}
+EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline);
+
+/*
+ * Returns allocated NULL-terminated string containing pathname,
+ * with special characters escaped, able to be safely logged. If
+ * there is an error, the leading character will be "<".
+ */
+char *kstrdup_quotable_file(struct file *file, gfp_t gfp)
+{
+ char *temp, *pathname;
+
+ if (!file)
+ return kstrdup("<unknown>", gfp);
+
+ /* We add 11 spaces for ' (deleted)' to be appended */
+ temp = kmalloc(PATH_MAX + 11, GFP_TEMPORARY);
+ if (!temp)
+ return kstrdup("<no_memory>", gfp);
+
+ pathname = file_path(file, temp, PATH_MAX + 11);
+ if (IS_ERR(pathname))
+ pathname = kstrdup("<too_long>", gfp);
+ else
+ pathname = kstrdup_quotable(pathname, gfp);
+
+ kfree(temp);
+ return pathname;
+}
+EXPORT_SYMBOL_GPL(kstrdup_quotable_file);
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 338403241..33f655ef4 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,5 +1,6 @@
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/kasan-checks.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
@@ -109,6 +110,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
unsigned long max = max_addr - src_addr;
long retval;
+ kasan_check_write(dst, count);
user_access_begin();
retval = do_strncpy_from_user(dst, src, count, max);
user_access_end();
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 8f22fbedc..93f45011a 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5621,7 +5621,10 @@ static struct bpf_prog *generate_filter(int which, int *err)
fp->type = BPF_PROG_TYPE_SOCKET_FILTER;
memcpy(fp->insnsi, fptr, fp->len * sizeof(struct bpf_insn));
- bpf_prog_select_runtime(fp);
+ /* We cannot error here as we don't need type compatibility
+ * checks.
+ */
+ fp = bpf_prog_select_runtime(fp, err);
break;
}
diff --git a/lib/test_hash.c b/lib/test_hash.c
new file mode 100644
index 000000000..c9549c8b4
--- /dev/null
+++ b/lib/test_hash.c
@@ -0,0 +1,250 @@
+/*
+ * Test cases for <linux/hash.h> and <linux/stringhash.h>
+ * This just verifies that various ways of computing a hash
+ * produce the same thing and, for cases where a k-bit hash
+ * value is requested, is of the requested size.
+ *
+ * We fill a buffer with a 255-byte null-terminated string,
+ * and use both full_name_hash() and hashlen_string() to hash the
+ * substrings from i to j, where 0 <= i < j < 256.
+ *
+ * The returned values are used to check that __hash_32() and
+ * __hash_32_generic() compute the same thing. Likewise hash_32()
+ * and hash_64().
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt "\n"
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/stringhash.h>
+#include <linux/printk.h>
+
+/* 32-bit XORSHIFT generator. Seed must not be zero. */
+static u32 __init __attribute_const__
+xorshift(u32 seed)
+{
+ seed ^= seed << 13;
+ seed ^= seed >> 17;
+ seed ^= seed << 5;
+ return seed;
+}
+
+/* Given a non-zero x, returns a non-zero byte. */
+static u8 __init __attribute_const__
+mod255(u32 x)
+{
+ x = (x & 0xffff) + (x >> 16); /* 1 <= x <= 0x1fffe */
+ x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x2fd */
+ x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x100 */
+ x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0xff */
+ return x;
+}
+
+/* Fill the buffer with non-zero bytes. */
+static void __init
+fill_buf(char *buf, size_t len, u32 seed)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ seed = xorshift(seed);
+ buf[i] = mod255(seed);
+ }
+}
+
+/*
+ * Test the various integer hash functions. h64 (or its low-order bits)
+ * is the integer to hash. hash_or accumulates the OR of the hash values,
+ * which are later checked to see that they cover all the requested bits.
+ *
+ * Because these functions (as opposed to the string hashes) are all
+ * inline, the code being tested is actually in the module, and you can
+ * recompile and re-test the module without rebooting.
+ */
+static bool __init
+test_int_hash(unsigned long long h64, u32 hash_or[2][33])
+{
+ int k;
+ u32 h0 = (u32)h64, h1, h2;
+
+ /* Test __hash32 */
+ hash_or[0][0] |= h1 = __hash_32(h0);
+#ifdef HAVE_ARCH__HASH_32
+ hash_or[1][0] |= h2 = __hash_32_generic(h0);
+#if HAVE_ARCH__HASH_32 == 1
+ if (h1 != h2) {
+ pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x",
+ h0, h1, h2);
+ return false;
+ }
+#endif
+#endif
+
+ /* Test k = 1..32 bits */
+ for (k = 1; k <= 32; k++) {
+ u32 const m = ((u32)2 << (k-1)) - 1; /* Low k bits set */
+
+ /* Test hash_32 */
+ hash_or[0][k] |= h1 = hash_32(h0, k);
+ if (h1 > m) {
+ pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m);
+ return false;
+ }
+#ifdef HAVE_ARCH_HASH_32
+ h2 = hash_32_generic(h0, k);
+#if HAVE_ARCH_HASH_32 == 1
+ if (h1 != h2) {
+ pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() "
+ " = %#x", h0, k, h1, h2);
+ return false;
+ }
+#else
+ if (h2 > m) {
+ pr_err("hash_32_generic(%#x, %d) = %#x > %#x",
+ h0, k, h1, m);
+ return false;
+ }
+#endif
+#endif
+ /* Test hash_64 */
+ hash_or[1][k] |= h1 = hash_64(h64, k);
+ if (h1 > m) {
+ pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, h1, m);
+ return false;
+ }
+#ifdef HAVE_ARCH_HASH_64
+ h2 = hash_64_generic(h64, k);
+#if HAVE_ARCH_HASH_64 == 1
+ if (h1 != h2) {
+ pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() "
+ "= %#x", h64, k, h1, h2);
+ return false;
+ }
+#else
+ if (h2 > m) {
+ pr_err("hash_64_generic(%#llx, %d) = %#x > %#x",
+ h64, k, h1, m);
+ return false;
+ }
+#endif
+#endif
+ }
+
+ (void)h2; /* Suppress unused variable warning */
+ return true;
+}
+
+#define SIZE 256 /* Run time is cubic in SIZE */
+
+static int __init
+test_hash_init(void)
+{
+ char buf[SIZE+1];
+ u32 string_or = 0, hash_or[2][33] = { 0 };
+ unsigned tests = 0;
+ unsigned long long h64 = 0;
+ int i, j;
+
+ fill_buf(buf, SIZE, 1);
+
+ /* Test every possible non-empty substring in the buffer. */
+ for (j = SIZE; j > 0; --j) {
+ buf[j] = '\0';
+
+ for (i = 0; i <= j; i++) {
+ u64 hashlen = hashlen_string(buf+i);
+ u32 h0 = full_name_hash(buf+i, j-i);
+
+ /* Check that hashlen_string gets the length right */
+ if (hashlen_len(hashlen) != j-i) {
+ pr_err("hashlen_string(%d..%d) returned length"
+ " %u, expected %d",
+ i, j, hashlen_len(hashlen), j-i);
+ return -EINVAL;
+ }
+ /* Check that the hashes match */
+ if (hashlen_hash(hashlen) != h0) {
+ pr_err("hashlen_string(%d..%d) = %08x != "
+ "full_name_hash() = %08x",
+ i, j, hashlen_hash(hashlen), h0);
+ return -EINVAL;
+ }
+
+ string_or |= h0;
+ h64 = h64 << 32 | h0; /* For use with hash_64 */
+ if (!test_int_hash(h64, hash_or))
+ return -EINVAL;
+ tests++;
+ } /* i */
+ } /* j */
+
+ /* The OR of all the hash values should cover all the bits */
+ if (~string_or) {
+ pr_err("OR of all string hash results = %#x != %#x",
+ string_or, -1u);
+ return -EINVAL;
+ }
+ if (~hash_or[0][0]) {
+ pr_err("OR of all __hash_32 results = %#x != %#x",
+ hash_or[0][0], -1u);
+ return -EINVAL;
+ }
+#ifdef HAVE_ARCH__HASH_32
+#if HAVE_ARCH__HASH_32 != 1 /* Test is pointless if results match */
+ if (~hash_or[1][0]) {
+ pr_err("OR of all __hash_32_generic results = %#x != %#x",
+ hash_or[1][0], -1u);
+ return -EINVAL;
+ }
+#endif
+#endif
+
+ /* Likewise for all the i-bit hash values */
+ for (i = 1; i <= 32; i++) {
+ u32 const m = ((u32)2 << (i-1)) - 1; /* Low i bits set */
+
+ if (hash_or[0][i] != m) {
+ pr_err("OR of all hash_32(%d) results = %#x "
+ "(%#x expected)", i, hash_or[0][i], m);
+ return -EINVAL;
+ }
+ if (hash_or[1][i] != m) {
+ pr_err("OR of all hash_64(%d) results = %#x "
+ "(%#x expected)", i, hash_or[1][i], m);
+ return -EINVAL;
+ }
+ }
+
+ /* Issue notices about skipped tests. */
+#ifndef HAVE_ARCH__HASH_32
+ pr_info("__hash_32() has no arch implementation to test.");
+#elif HAVE_ARCH__HASH_32 != 1
+ pr_info("__hash_32() is arch-specific; not compared to generic.");
+#endif
+#ifndef HAVE_ARCH_HASH_32
+ pr_info("hash_32() has no arch implementation to test.");
+#elif HAVE_ARCH_HASH_32 != 1
+ pr_info("hash_32() is arch-specific; not compared to generic.");
+#endif
+#ifndef HAVE_ARCH_HASH_64
+ pr_info("hash_64() has no arch implementation to test.");
+#elif HAVE_ARCH_HASH_64 != 1
+ pr_info("hash_64() is arch-specific; not compared to generic.");
+#endif
+
+ pr_notice("%u tests passed.", tests);
+
+ return 0;
+}
+
+static void __exit test_hash_exit(void)
+{
+}
+
+module_init(test_hash_init); /* Does everything */
+module_exit(test_hash_exit); /* Does nothing */
+
+MODULE_LICENSE("GPL");
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 82169fbf2..5e51872b3 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -12,9 +12,12 @@
#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
#include <linux/kernel.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <linux/module.h>
static noinline void __init kmalloc_oob_right(void)
@@ -344,6 +347,70 @@ static noinline void __init kasan_stack_oob(void)
*(volatile char *)p;
}
+static noinline void __init ksize_unpoisons_memory(void)
+{
+ char *ptr;
+ size_t size = 123, real_size = size;
+
+ pr_info("ksize() unpoisons the whole allocated chunk\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+ real_size = ksize(ptr);
+ /* This access doesn't trigger an error. */
+ ptr[size] = 'x';
+ /* This one does. */
+ ptr[real_size] = 'y';
+ kfree(ptr);
+}
+
+static noinline void __init copy_user_test(void)
+{
+ char *kmem;
+ char __user *usermem;
+ size_t size = 10;
+ int unused;
+
+ kmem = kmalloc(size, GFP_KERNEL);
+ if (!kmem)
+ return;
+
+ usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+ if (IS_ERR(usermem)) {
+ pr_err("Failed to allocate user memory\n");
+ kfree(kmem);
+ return;
+ }
+
+ pr_info("out-of-bounds in copy_from_user()\n");
+ unused = copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in copy_to_user()\n");
+ unused = copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user()\n");
+ unused = __copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user()\n");
+ unused = __copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in strncpy_from_user()\n");
+ unused = strncpy_from_user(kmem, usermem, size + 1);
+
+ vm_munmap((unsigned long)usermem, PAGE_SIZE);
+ kfree(kmem);
+}
+
static int __init kmalloc_tests_init(void)
{
kmalloc_oob_right();
@@ -367,6 +434,8 @@ static int __init kmalloc_tests_init(void)
kmem_cache_oob();
kasan_stack_oob();
kasan_global_oob();
+ ksize_unpoisons_memory();
+ copy_user_test();
return -EAGAIN;
}
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 270bf7289..297fdb5e7 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -143,7 +143,7 @@ static void test_bucket_stats(struct rhashtable *ht)
struct rhashtable_iter hti;
struct rhash_head *pos;
- err = rhashtable_walk_init(ht, &hti);
+ err = rhashtable_walk_init(ht, &hti, GFP_KERNEL);
if (err) {
pr_warn("Test failed: allocation error");
return;
diff --git a/lib/test_uuid.c b/lib/test_uuid.c
new file mode 100644
index 000000000..547d3127a
--- /dev/null
+++ b/lib/test_uuid.c
@@ -0,0 +1,133 @@
+/*
+ * Test cases for lib/uuid.c module.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/uuid.h>
+
+struct test_uuid_data {
+ const char *uuid;
+ uuid_le le;
+ uuid_be be;
+};
+
+static const struct test_uuid_data test_uuid_test_data[] = {
+ {
+ .uuid = "c33f4995-3701-450e-9fbf-206a2e98e576",
+ .le = UUID_LE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76),
+ .be = UUID_BE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76),
+ },
+ {
+ .uuid = "64b4371c-77c1-48f9-8221-29f054fc023b",
+ .le = UUID_LE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b),
+ .be = UUID_BE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b),
+ },
+ {
+ .uuid = "0cb4ddff-a545-4401-9d06-688af53e7f84",
+ .le = UUID_LE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84),
+ .be = UUID_BE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84),
+ },
+};
+
+static const char * const test_uuid_wrong_data[] = {
+ "c33f4995-3701-450e-9fbf206a2e98e576 ", /* no hyphen(s) */
+ "64b4371c-77c1-48f9-8221-29f054XX023b", /* invalid character(s) */
+ "0cb4ddff-a545-4401-9d06-688af53e", /* not enough data */
+};
+
+static unsigned total_tests __initdata;
+static unsigned failed_tests __initdata;
+
+static void __init test_uuid_failed(const char *prefix, bool wrong, bool be,
+ const char *data, const char *actual)
+{
+ pr_err("%s test #%u %s %s data: '%s'\n",
+ prefix,
+ total_tests,
+ wrong ? "passed on wrong" : "failed on",
+ be ? "BE" : "LE",
+ data);
+ if (actual && *actual)
+ pr_err("%s test #%u actual data: '%s'\n",
+ prefix,
+ total_tests,
+ actual);
+ failed_tests++;
+}
+
+static void __init test_uuid_test(const struct test_uuid_data *data)
+{
+ uuid_le le;
+ uuid_be be;
+ char buf[48];
+
+ /* LE */
+ total_tests++;
+ if (uuid_le_to_bin(data->uuid, &le))
+ test_uuid_failed("conversion", false, false, data->uuid, NULL);
+
+ total_tests++;
+ if (uuid_le_cmp(data->le, le)) {
+ sprintf(buf, "%pUl", &le);
+ test_uuid_failed("cmp", false, false, data->uuid, buf);
+ }
+
+ /* BE */
+ total_tests++;
+ if (uuid_be_to_bin(data->uuid, &be))
+ test_uuid_failed("conversion", false, true, data->uuid, NULL);
+
+ total_tests++;
+ if (uuid_be_cmp(data->be, be)) {
+ sprintf(buf, "%pUb", &be);
+ test_uuid_failed("cmp", false, true, data->uuid, buf);
+ }
+}
+
+static void __init test_uuid_wrong(const char *data)
+{
+ uuid_le le;
+ uuid_be be;
+
+ /* LE */
+ total_tests++;
+ if (!uuid_le_to_bin(data, &le))
+ test_uuid_failed("negative", true, false, data, NULL);
+
+ /* BE */
+ total_tests++;
+ if (!uuid_be_to_bin(data, &be))
+ test_uuid_failed("negative", true, true, data, NULL);
+}
+
+static int __init test_uuid_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_uuid_test_data); i++)
+ test_uuid_test(&test_uuid_test_data[i]);
+
+ for (i = 0; i < ARRAY_SIZE(test_uuid_wrong_data); i++)
+ test_uuid_wrong(test_uuid_wrong_data[i]);
+
+ if (failed_tests == 0)
+ pr_info("all %u tests passed\n", total_tests);
+ else
+ pr_err("failed %u out of %u tests\n", failed_tests, total_tests);
+
+ return failed_tests ? -EINVAL : 0;
+}
+module_init(test_uuid_init);
+
+static void __exit test_uuid_exit(void)
+{
+ /* do nothing */
+}
+module_exit(test_uuid_exit);
+
+MODULE_AUTHOR("Andy Shevchenko <andriy.shevchenko@linux.intel.com>");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/uuid.c b/lib/uuid.c
index 398821e4d..37687af77 100644
--- a/lib/uuid.c
+++ b/lib/uuid.c
@@ -1,7 +1,7 @@
/*
* Unified UUID/GUID definition
*
- * Copyright (C) 2009, Intel Corp.
+ * Copyright (C) 2009, 2016 Intel Corp.
* Huang Ying <ying.huang@intel.com>
*
* This program is free software; you can redistribute it and/or
@@ -12,17 +12,40 @@
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
#include <linux/export.h>
#include <linux/uuid.h>
#include <linux/random.h>
+const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
+EXPORT_SYMBOL(uuid_le_index);
+const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+EXPORT_SYMBOL(uuid_be_index);
+
+/***************************************************************
+ * Random UUID interface
+ *
+ * Used here for a Boot ID, but can be useful for other kernel
+ * drivers.
+ ***************************************************************/
+
+/*
+ * Generate random UUID
+ */
+void generate_random_uuid(unsigned char uuid[16])
+{
+ get_random_bytes(uuid, 16);
+ /* Set UUID version to 4 --- truly random generation */
+ uuid[6] = (uuid[6] & 0x0F) | 0x40;
+ /* Set the UUID variant to DCE */
+ uuid[8] = (uuid[8] & 0x3F) | 0x80;
+}
+EXPORT_SYMBOL(generate_random_uuid);
+
static void __uuid_gen_common(__u8 b[16])
{
prandom_bytes(b, 16);
@@ -45,3 +68,61 @@ void uuid_be_gen(uuid_be *bu)
bu->b[6] = (bu->b[6] & 0x0F) | 0x40;
}
EXPORT_SYMBOL_GPL(uuid_be_gen);
+
+/**
+ * uuid_is_valid - checks if UUID string valid
+ * @uuid: UUID string to check
+ *
+ * Description:
+ * It checks if the UUID string is following the format:
+ * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ * where x is a hex digit.
+ *
+ * Return: true if input is valid UUID string.
+ */
+bool uuid_is_valid(const char *uuid)
+{
+ unsigned int i;
+
+ for (i = 0; i < UUID_STRING_LEN; i++) {
+ if (i == 8 || i == 13 || i == 18 || i == 23) {
+ if (uuid[i] != '-')
+ return false;
+ } else if (!isxdigit(uuid[i])) {
+ return false;
+ }
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(uuid_is_valid);
+
+static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16])
+{
+ static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34};
+ unsigned int i;
+
+ if (!uuid_is_valid(uuid))
+ return -EINVAL;
+
+ for (i = 0; i < 16; i++) {
+ int hi = hex_to_bin(uuid[si[i] + 0]);
+ int lo = hex_to_bin(uuid[si[i] + 1]);
+
+ b[ei[i]] = (hi << 4) | lo;
+ }
+
+ return 0;
+}
+
+int uuid_le_to_bin(const char *uuid, uuid_le *u)
+{
+ return __uuid_to_bin(uuid, u->b, uuid_le_index);
+}
+EXPORT_SYMBOL(uuid_le_to_bin);
+
+int uuid_be_to_bin(const char *uuid, uuid_be *u)
+{
+ return __uuid_to_bin(uuid, u->b, uuid_be_index);
+}
+EXPORT_SYMBOL(uuid_be_to_bin);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index ccb664b54..0967771d8 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -30,6 +30,7 @@
#include <linux/ioport.h>
#include <linux/dcache.h>
#include <linux/cred.h>
+#include <linux/uuid.h>
#include <net/addrconf.h>
#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>
@@ -1304,19 +1305,17 @@ static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
struct printf_spec spec, const char *fmt)
{
- char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
+ char uuid[UUID_STRING_LEN + 1];
char *p = uuid;
int i;
- static const u8 be[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
- static const u8 le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
- const u8 *index = be;
+ const u8 *index = uuid_be_index;
bool uc = false;
switch (*(++fmt)) {
case 'L':
uc = true; /* fall-through */
case 'l':
- index = le;
+ index = uuid_le_index;
break;
case 'B':
uc = true;
@@ -1324,7 +1323,10 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
}
for (i = 0; i < 16; i++) {
- p = hex_byte_pack(p, addr[index[i]]);
+ if (uc)
+ p = hex_byte_pack_upper(p, addr[index[i]]);
+ else
+ p = hex_byte_pack(p, addr[index[i]]);
switch (i) {
case 3:
case 5:
@@ -1337,13 +1339,6 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
*p = 0;
- if (uc) {
- p = uuid;
- do {
- *p = toupper(*p);
- } while (*(++p));
- }
-
return string(buf, end, uuid, spec);
}
diff --git a/lib/wbt.c b/lib/wbt.c
new file mode 100644
index 000000000..cc5a24270
--- /dev/null
+++ b/lib/wbt.c
@@ -0,0 +1,569 @@
+/*
+ * buffered writeback throttling. losely based on CoDel. We can't drop
+ * packets for IO scheduling, so the logic is something like this:
+ *
+ * - Monitor latencies in a defined window of time.
+ * - If the minimum latency in the above window exceeds some target, increment
+ * scaling step and scale down queue depth by a factor of 2x. The monitoring
+ * window is then shrunk to 100 / sqrt(scaling step + 1).
+ * - For any window where we don't have solid data on what the latencies
+ * look like, retain status quo.
+ * - If latencies look good, decrement scaling step.
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ * Things that (may) need changing:
+ *
+ * - Different scaling of background/normal/high priority writeback.
+ * We may have to violate guarantees for max.
+ * - We can have mismatches between the stat window and our window.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/wbt.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/wbt.h>
+
+enum {
+ /*
+ * Might need to be higher
+ */
+ RWB_MAX_DEPTH = 64,
+
+ /*
+ * 100msec window
+ */
+ RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
+
+ /*
+ * Disregard stats, if we don't meet these minimums
+ */
+ RWB_MIN_WRITE_SAMPLES = 3,
+ RWB_MIN_READ_SAMPLES = 1,
+
+ /*
+ * If we have this number of consecutive windows with not enough
+ * information to scale up or down, scale up.
+ */
+ RWB_UNKNOWN_BUMP = 5,
+};
+
+static inline bool rwb_enabled(struct rq_wb *rwb)
+{
+ return rwb && rwb->wb_normal != 0;
+}
+
+/*
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
+ * false if 'v' + 1 would be bigger than 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+ int cur = atomic_read(v);
+
+ for (;;) {
+ int old;
+
+ if (cur >= below)
+ return false;
+ old = atomic_cmpxchg(v, cur, cur + 1);
+ if (old == cur)
+ break;
+ cur = old;
+ }
+
+ return true;
+}
+
+static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
+{
+ if (rwb_enabled(rwb)) {
+ const unsigned long cur = jiffies;
+
+ if (cur != *var)
+ *var = cur;
+ }
+}
+
+void __wbt_done(struct rq_wb *rwb)
+{
+ int inflight, limit;
+
+ inflight = atomic_dec_return(&rwb->inflight);
+
+ /*
+ * wbt got disabled with IO in flight. Wake up any potential
+ * waiters, we don't have to do more than that.
+ */
+ if (unlikely(!rwb_enabled(rwb))) {
+ wake_up_all(&rwb->wait);
+ return;
+ }
+
+ /*
+ * If the device does write back caching, drop further down
+ * before we wake people up.
+ */
+ if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
+ limit = 0;
+ else
+ limit = rwb->wb_normal;
+
+ /*
+ * Don't wake anyone up if we are above the normal limit.
+ */
+ if (inflight && inflight >= limit)
+ return;
+
+ if (waitqueue_active(&rwb->wait)) {
+ int diff = limit - inflight;
+
+ if (!inflight || diff >= rwb->wb_background / 2)
+ wake_up_nr(&rwb->wait, 1);
+ }
+}
+
+/*
+ * Called on completion of a request. Note that it's also called when
+ * a request is merged, when the request gets freed.
+ */
+void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+ if (!rwb)
+ return;
+
+ if (!wbt_tracked(stat)) {
+ if (rwb->sync_cookie == stat) {
+ rwb->sync_issue = 0;
+ rwb->sync_cookie = NULL;
+ }
+
+ wb_timestamp(rwb, &rwb->last_comp);
+ } else {
+ WARN_ON_ONCE(stat == rwb->sync_cookie);
+ __wbt_done(rwb);
+ wbt_clear_tracked(stat);
+ }
+}
+
+static void calc_wb_limits(struct rq_wb *rwb)
+{
+ unsigned int depth;
+
+ if (!rwb->min_lat_nsec) {
+ rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
+ return;
+ }
+
+ /*
+ * For QD=1 devices, this is a special case. It's important for those
+ * to have one request ready when one completes, so force a depth of
+ * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
+ * since the device can't have more than that in flight. If we're
+ * scaling down, then keep a setting of 1/1/1.
+ */
+ if (rwb->queue_depth == 1) {
+ if (rwb->scale_step)
+ rwb->wb_max = rwb->wb_normal = 1;
+ else
+ rwb->wb_max = rwb->wb_normal = 2;
+ rwb->wb_background = 1;
+ } else {
+ depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
+
+ /*
+ * Set our max/normal/bg queue depths based on how far
+ * we have scaled down (->scale_step).
+ */
+ rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step));
+ rwb->wb_normal = (rwb->wb_max + 1) / 2;
+ rwb->wb_background = (rwb->wb_max + 3) / 4;
+ }
+}
+
+static bool inline stat_sample_valid(struct blk_rq_stat *stat)
+{
+ /*
+ * We need at least one read sample, and a minimum of
+ * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
+ * that it's writes impacting us, and not just some sole read on
+ * a device that is in a lower power state.
+ */
+ return stat[0].nr_samples >= 1 &&
+ stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+}
+
+static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
+{
+ u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
+
+ if (!issue || !rwb->sync_cookie)
+ return 0;
+
+ now = ktime_to_ns(ktime_get());
+ return now - issue;
+}
+
+enum {
+ LAT_OK,
+ LAT_UNKNOWN,
+ LAT_EXCEEDED,
+};
+
+static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+{
+ u64 thislat;
+
+ /*
+ * If our stored sync issue exceeds the window size, or it
+ * exceeds our min target AND we haven't logged any entries,
+ * flag the latency as exceeded. wbt works off completion latencies,
+ * but for a flooded device, a single sync IO can take a long time
+ * to complete after being issued. If this time exceeds our
+ * monitoring window AND we didn't see any other completions in that
+ * window, then count that sync IO as a violation of the latency.
+ */
+ thislat = rwb_sync_issue_lat(rwb);
+ if (thislat > rwb->cur_win_nsec ||
+ (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
+ trace_wbt_lat(rwb->bdi, thislat);
+ return LAT_EXCEEDED;
+ }
+
+ if (!stat_sample_valid(stat))
+ return LAT_UNKNOWN;
+
+ /*
+ * If the 'min' latency exceeds our target, step down.
+ */
+ if (stat[0].min > rwb->min_lat_nsec) {
+ trace_wbt_lat(rwb->bdi, stat[0].min);
+ trace_wbt_stat(rwb->bdi, stat);
+ return LAT_EXCEEDED;
+ }
+
+ if (rwb->scale_step)
+ trace_wbt_stat(rwb->bdi, stat);
+
+ return LAT_OK;
+}
+
+static int latency_exceeded(struct rq_wb *rwb)
+{
+ struct blk_rq_stat stat[2];
+
+ rwb->stat_ops->get(rwb->ops_data, stat);
+ return __latency_exceeded(rwb, stat);
+}
+
+static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
+{
+ trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
+ rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+}
+
+static void scale_up(struct rq_wb *rwb)
+{
+ /*
+ * If we're at 0, we can't go lower.
+ */
+ if (!rwb->scale_step)
+ return;
+
+ rwb->scale_step--;
+ rwb->unknown_cnt = 0;
+ rwb->stat_ops->clear(rwb->ops_data);
+ calc_wb_limits(rwb);
+
+ if (waitqueue_active(&rwb->wait))
+ wake_up_all(&rwb->wait);
+
+ rwb_trace_step(rwb, "step up");
+}
+
+static void scale_down(struct rq_wb *rwb)
+{
+ /*
+ * Stop scaling down when we've hit the limit. This also prevents
+ * ->scale_step from going to crazy values, if the device can't
+ * keep up.
+ */
+ if (rwb->wb_max == 1)
+ return;
+
+ rwb->scale_step++;
+ rwb->unknown_cnt = 0;
+ rwb->stat_ops->clear(rwb->ops_data);
+ calc_wb_limits(rwb);
+ rwb_trace_step(rwb, "step down");
+}
+
+static void rwb_arm_timer(struct rq_wb *rwb)
+{
+ unsigned long expires;
+
+ /*
+ * We should speed this up, using some variant of a fast integer
+ * inverse square root calculation. Since we only do this for
+ * every window expiration, it's not a huge deal, though.
+ */
+ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+ int_sqrt((rwb->scale_step + 1) << 8));
+ expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
+ mod_timer(&rwb->window_timer, expires);
+}
+
+static void wb_timer_fn(unsigned long data)
+{
+ struct rq_wb *rwb = (struct rq_wb *) data;
+ int status;
+
+ /*
+ * If we exceeded the latency target, step down. If we did not,
+ * step one level up. If we don't know enough to say either exceeded
+ * or ok, then don't do anything.
+ */
+ status = latency_exceeded(rwb);
+ switch (status) {
+ case LAT_EXCEEDED:
+ scale_down(rwb);
+ break;
+ case LAT_OK:
+ scale_up(rwb);
+ break;
+ case LAT_UNKNOWN:
+ /*
+ * We had no read samples, start bumping up the write
+ * depth slowly
+ */
+ if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP)
+ scale_up(rwb);
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Re-arm timer, if we have IO in flight
+ */
+ if (rwb->scale_step || atomic_read(&rwb->inflight))
+ rwb_arm_timer(rwb);
+}
+
+void wbt_update_limits(struct rq_wb *rwb)
+{
+ rwb->scale_step = 0;
+ calc_wb_limits(rwb);
+
+ if (waitqueue_active(&rwb->wait))
+ wake_up_all(&rwb->wait);
+}
+
+static bool close_io(struct rq_wb *rwb)
+{
+ const unsigned long now = jiffies;
+
+ return time_before(now, rwb->last_issue + HZ / 10) ||
+ time_before(now, rwb->last_comp + HZ / 10);
+}
+
+#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
+
+static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+{
+ unsigned int limit;
+
+ /*
+ * At this point we know it's a buffered write. If REQ_SYNC is
+ * set, then it's WB_SYNC_ALL writeback, and we'll use the max
+ * limit for that. If the write is marked as a background write,
+ * then use the idle limit, or go to normal if we haven't had
+ * competing IO for a bit.
+ */
+ if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping))
+ limit = rwb->wb_max;
+ else if ((rw & REQ_BG) || close_io(rwb)) {
+ /*
+ * If less than 100ms since we completed unrelated IO,
+ * limit us to half the depth for background writeback.
+ */
+ limit = rwb->wb_background;
+ } else
+ limit = rwb->wb_normal;
+
+ return limit;
+}
+
+static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
+{
+ /*
+ * inc it here even if disabled, since we'll dec it at completion.
+ * this only happens if the task was sleeping in __wbt_wait(),
+ * and someone turned it off at the same time.
+ */
+ if (!rwb_enabled(rwb)) {
+ atomic_inc(&rwb->inflight);
+ return true;
+ }
+
+ return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw));
+}
+
+/*
+ * Block if we will exceed our limit, or if we are currently waiting for
+ * the timer to kick off queuing again.
+ */
+static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+{
+ DEFINE_WAIT(wait);
+
+ if (may_queue(rwb, rw))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&rwb->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (may_queue(rwb, rw))
+ break;
+
+ if (lock)
+ spin_unlock_irq(lock);
+
+ io_schedule();
+
+ if (lock)
+ spin_lock_irq(lock);
+ } while (1);
+
+ finish_wait(&rwb->wait, &wait);
+}
+
+static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
+{
+ /*
+ * If not a WRITE (or a discard), do nothing
+ */
+ if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD))
+ return false;
+
+ /*
+ * Don't throttle WRITE_ODIRECT
+ */
+ if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
+ return false;
+
+ return true;
+}
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep, if we have exceeded the writeback limits. Caller can pass
+ * in an irq held spinlock, if it holds one when calling this function.
+ * If we do sleep, we'll release and re-grab it.
+ */
+bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
+{
+ if (!rwb_enabled(rwb))
+ return false;
+
+ if (!wbt_should_throttle(rwb, rw)) {
+ wb_timestamp(rwb, &rwb->last_issue);
+ return false;
+ }
+
+ __wbt_wait(rwb, rw, lock);
+
+ if (!timer_pending(&rwb->window_timer))
+ rwb_arm_timer(rwb);
+
+ return true;
+}
+
+void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+ if (!rwb_enabled(rwb))
+ return;
+
+ wbt_issue_stat_set_time(stat);
+
+ /*
+ * Track sync issue, in case it takes a long time to complete. Allows
+ * us to react quicker, if a sync IO takes a long time to complete.
+ * Note that this is just a hint. 'stat' can go away when the
+ * request completes, so it's important we never dereference it. We
+ * only use the address to compare with, which is why we store the
+ * sync_issue time locally.
+ */
+ if (!wbt_tracked(stat) && !rwb->sync_issue) {
+ rwb->sync_cookie = stat;
+ rwb->sync_issue = wbt_issue_stat_get_time(stat);
+ }
+}
+
+void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+ if (!rwb_enabled(rwb))
+ return;
+ if (stat == rwb->sync_cookie) {
+ rwb->sync_issue = 0;
+ rwb->sync_cookie = NULL;
+ }
+}
+
+void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+ if (rwb) {
+ rwb->queue_depth = depth;
+ wbt_update_limits(rwb);
+ }
+}
+
+void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
+{
+ if (rwb)
+ rwb->wc = write_cache_on;
+}
+
+void wbt_disable(struct rq_wb *rwb)
+{
+ del_timer_sync(&rwb->window_timer);
+ rwb->win_nsec = rwb->min_lat_nsec = 0;
+ wbt_update_limits(rwb);
+}
+EXPORT_SYMBOL_GPL(wbt_disable);
+
+struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
+ void *ops_data)
+{
+ struct rq_wb *rwb;
+
+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+ if (!rwb)
+ return ERR_PTR(-ENOMEM);
+
+ atomic_set(&rwb->inflight, 0);
+ init_waitqueue_head(&rwb->wait);
+ setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
+ rwb->wc = 1;
+ rwb->queue_depth = RWB_MAX_DEPTH;
+ rwb->last_comp = rwb->last_issue = jiffies;
+ rwb->bdi = bdi;
+ rwb->win_nsec = RWB_WINDOW_NSEC;
+ rwb->stat_ops = ops,
+ rwb->ops_data = ops_data;
+ wbt_update_limits(rwb);
+ return rwb;
+}
+
+void wbt_exit(struct rq_wb *rwb)
+{
+ if (rwb) {
+ del_timer_sync(&rwb->window_timer);
+ kfree(rwb);
+ }
+}