diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Kconfig | 13 | ||||
-rw-r--r-- | lib/Kconfig.debug | 48 | ||||
-rw-r--r-- | lib/Kconfig.kgdb | 2 | ||||
-rw-r--r-- | lib/Makefile | 8 | ||||
-rw-r--r-- | lib/asn1_decoder.c | 3 | ||||
-rw-r--r-- | lib/debugobjects.c | 92 | ||||
-rw-r--r-- | lib/gcd.c | 77 | ||||
-rw-r--r-- | lib/iov_iter.c | 104 | ||||
-rw-r--r-- | lib/mpi/mpicoder.c | 122 | ||||
-rw-r--r-- | lib/nlattr.c | 103 | ||||
-rw-r--r-- | lib/nmi_backtrace.c | 89 | ||||
-rw-r--r-- | lib/nodemask.c | 30 | ||||
-rw-r--r-- | lib/percpu_counter.c | 6 | ||||
-rw-r--r-- | lib/proportions.c | 407 | ||||
-rw-r--r-- | lib/radix-tree.c | 947 | ||||
-rw-r--r-- | lib/rhashtable.c | 6 | ||||
-rw-r--r-- | lib/sg_pool.c | 172 | ||||
-rw-r--r-- | lib/string_helpers.c | 92 | ||||
-rw-r--r-- | lib/strncpy_from_user.c | 2 | ||||
-rw-r--r-- | lib/test_bpf.c | 5 | ||||
-rw-r--r-- | lib/test_hash.c | 250 | ||||
-rw-r--r-- | lib/test_kasan.c | 69 | ||||
-rw-r--r-- | lib/test_rhashtable.c | 2 | ||||
-rw-r--r-- | lib/test_uuid.c | 133 | ||||
-rw-r--r-- | lib/uuid.c | 91 | ||||
-rw-r--r-- | lib/vsprintf.c | 21 | ||||
-rw-r--r-- | lib/wbt.c | 569 |
27 files changed, 2270 insertions, 1193 deletions
diff --git a/lib/Kconfig b/lib/Kconfig index 3cca12225..c585e4c40 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -362,6 +362,9 @@ config INTERVAL_TREE for more information. +config RADIX_TREE_MULTIORDER + bool + config ASSOCIATIVE_ARRAY bool help @@ -523,6 +526,13 @@ config SG_SPLIT a scatterlist. This should be selected by a driver or an API which whishes to split a scatterlist amongst multiple DMA channels. +config SG_POOL + def_bool n + help + Provides a helper to allocate chained scatterlists. This should be + selected by a driver or an API which whishes to allocate chained + scatterlist. + # # sg chaining option # @@ -540,4 +550,7 @@ config STACKDEPOT bool select STACKTRACE +config WBT + bool + endmenu diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b4e287cde..42d3d798c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -257,6 +257,7 @@ config PAGE_OWNER config DEBUG_FS bool "Debug Filesystem" + select SRCU help debugfs is a virtual file system that kernel developers use to put debugging files into. Enable this option to be able to read and @@ -1289,6 +1290,39 @@ config TORTURE_TEST tristate default n +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_PERF_TEST_RUNNABLE + bool "performance tests for RCU runnable by default" + depends on RCU_PERF_TEST = y + default n + help + This option provides a way to build the RCU performance tests + directly into the kernel without them starting up at boot time. + You can use /sys/module to manually override this setting. + This /proc file is available only when the RCU performance + tests have been built into the kernel. + + Say Y here if you want the RCU performance tests to start during + boot (you probably don't). + Say N here if you want the RCU performance tests to start only + after being manually enabled via /sys/module. + config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL && !SCHED_BFS @@ -1808,6 +1842,9 @@ config TEST_BITMAP If unsure, say N. +config TEST_UUID + tristate "Test functions located in the uuid module at runtime" + config TEST_RHASHTABLE tristate "Perform selftest on resizable hash table" default n @@ -1816,6 +1853,17 @@ config TEST_RHASHTABLE If unsure, say N. +config TEST_HASH + tristate "Perform selftest on hash functions" + default n + help + Enable this option to test the kernel's integer (<linux/hash,h>) + and string (<linux/stringhash.h>) hash functions on boot + (or module load). + + This is intended to help people writing architecture-specific + optimized versions. If unsure, say N. + endmenu # runtime tests config PROVIDE_OHCI1394_DMA_INIT diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index c635a107a..533f91263 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -22,7 +22,7 @@ config KGDB_SERIAL_CONSOLE tristate "KGDB: use kgdb over the serial console" select CONSOLE_POLL select MAGIC_SYSRQ - depends on TTY + depends on TTY && HW_CONSOLE default y help Share a serial console with kgdb. Sysrq-g must be used diff --git a/lib/Makefile b/lib/Makefile index 7bd6fd436..5b5506e3b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -23,9 +23,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o argv_split.o \ - proportions.o flex_proportions.o ratelimit.o show_mem.o \ + flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o @@ -48,6 +48,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o +obj-$(CONFIG_TEST_HASH) += test_hash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LKM) += test_module.o @@ -57,6 +58,7 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o +obj-$(CONFIG_TEST_UUID) += test_uuid.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@ -178,8 +180,10 @@ obj-$(CONFIG_GENERIC_STRNLEN_USER) += strnlen_user.o obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o obj-$(CONFIG_SG_SPLIT) += sg_split.o +obj-$(CONFIG_SG_POOL) += sg_pool.o obj-$(CONFIG_STMP_DEVICE) += stmp_device.o obj-$(CONFIG_IRQ_POLL) += irq_poll.o +obj-$(CONFIG_WBT) += wbt.o obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 554522934..0bd8a611e 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> +#include <linux/module.h> #include <linux/asn1_decoder.h> #include <linux/asn1_ber_bytecode.h> @@ -506,3 +507,5 @@ error: return -EBADMSG; } EXPORT_SYMBOL_GPL(asn1_ber_decoder); + +MODULE_LICENSE("GPL"); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 519b5a10f..a8e12601e 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -269,16 +269,15 @@ static void debug_print_object(struct debug_obj *obj, char *msg) * Try to repair the damage, so we have a better chance to get useful * debug output. */ -static int -debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), +static bool +debug_object_fixup(bool (*fixup)(void *addr, enum debug_obj_state state), void * addr, enum debug_obj_state state) { - int fixed = 0; - - if (fixup) - fixed = fixup(addr, state); - debug_objects_fixups += fixed; - return fixed; + if (fixup && fixup(addr, state)) { + debug_objects_fixups++; + return true; + } + return false; } static void debug_object_is_on_stack(void *addr, int onstack) @@ -416,7 +415,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); ret = debug_object_fixup(descr->fixup_activate, addr, state); - return ret ? -EINVAL : 0; + return ret ? 0 : -EINVAL; case ODEBUG_STATE_DESTROYED: debug_print_object(obj, "activate"); @@ -432,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * This happens when a static object is activated. We - * let the type specific code decide whether this is - * true or not. + * We are here when a static object is activated. We + * let the type specific code confirm whether this is + * true or not. if true, we just make sure that the + * static object is tracked in the object tracker. If + * not, this must be a bug, so we try to fix it up. */ - if (debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE)) { + if (descr->is_static_object && descr->is_static_object(addr)) { + /* track this static object */ + debug_object_init(addr, descr); + debug_object_activate(addr, descr); + } else { debug_print_object(&o, "activate"); - return -EINVAL; + ret = debug_object_fixup(descr->fixup_activate, addr, + ODEBUG_STATE_NOTAVAILABLE); + return ret ? 0 : -EINVAL; } return 0; } @@ -603,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * Maybe the object is static. Let the type specific - * code decide what to do. + * Maybe the object is static, and we let the type specific + * code confirm. Track this static object if true, else invoke + * fixup. */ - if (debug_object_fixup(descr->fixup_assert_init, addr, - ODEBUG_STATE_NOTAVAILABLE)) + if (descr->is_static_object && descr->is_static_object(addr)) { + /* Track this static object */ + debug_object_init(addr, descr); + } else { debug_print_object(&o, "assert_init"); + debug_object_fixup(descr->fixup_assert_init, addr, + ODEBUG_STATE_NOTAVAILABLE); + } return; } @@ -793,11 +805,18 @@ struct self_test { static __initdata struct debug_obj_descr descr_type_test; +static bool __init is_static_object(void *addr) +{ + struct self_test *obj = addr; + + return obj->static_init; +} + /* * fixup_init is called when: * - an active object is initialized */ -static int __init fixup_init(void *addr, enum debug_obj_state state) +static bool __init fixup_init(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -805,37 +824,31 @@ static int __init fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_init(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ -static int __init fixup_activate(void *addr, enum debug_obj_state state) +static bool __init fixup_activate(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (obj->static_init == 1) { - debug_object_init(obj, &descr_type_test); - debug_object_activate(obj, &descr_type_test); - return 0; - } - return 1; - + return true; case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -843,7 +856,7 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state) * fixup_destroy is called when: * - an active object is destroyed */ -static int __init fixup_destroy(void *addr, enum debug_obj_state state) +static bool __init fixup_destroy(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -851,9 +864,9 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_destroy(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -861,7 +874,7 @@ static int __init fixup_destroy(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int __init fixup_free(void *addr, enum debug_obj_state state) +static bool __init fixup_free(void *addr, enum debug_obj_state state) { struct self_test *obj = addr; @@ -869,9 +882,9 @@ static int __init fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_free(obj, &descr_type_test); - return 1; + return true; default: - return 0; + return false; } } @@ -917,6 +930,7 @@ out: static __initdata struct debug_obj_descr descr_type_test = { .name = "selftest", + .is_static_object = is_static_object, .fixup_init = fixup_init, .fixup_activate = fixup_activate, .fixup_destroy = fixup_destroy, @@ -2,20 +2,77 @@ #include <linux/gcd.h> #include <linux/export.h> -/* Greatest common divisor */ +/* + * This implements the binary GCD algorithm. (Often attributed to Stein, + * but as Knuth has noted, appears in a first-century Chinese math text.) + * + * This is faster than the division-based algorithm even on x86, which + * has decent hardware division. + */ + +#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS) + +/* If __ffs is available, the even/odd algorithm benchmarks slower. */ unsigned long gcd(unsigned long a, unsigned long b) { - unsigned long r; + unsigned long r = a | b; + + if (!a || !b) + return r; - if (a < b) - swap(a, b); + b >>= __ffs(b); + if (b == 1) + return r & -r; - if (!b) - return a; - while ((r = a % b) != 0) { - a = b; - b = r; + for (;;) { + a >>= __ffs(a); + if (a == 1) + return r & -r; + if (a == b) + return a << __ffs(r); + + if (a < b) + swap(a, b); + a -= b; } - return b; } + +#else + +/* If normalization is done by loops, the even/odd algorithm is a win. */ +unsigned long gcd(unsigned long a, unsigned long b) +{ + unsigned long r = a | b; + + if (!a || !b) + return r; + + /* Isolate lsbit of r */ + r &= -r; + + while (!(b & r)) + b >>= 1; + if (b == r) + return r; + + for (;;) { + while (!(a & r)) + a >>= 1; + if (a == r) + return r; + if (a == b) + return a; + + if (a < b) + swap(a, b); + a -= b; + a >>= 1; + if (a & r) + a += b; + a >>= 1; + } +} + +#endif + EXPORT_SYMBOL_GPL(gcd); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index ca5316e00..0cd522753 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -99,40 +99,44 @@ } #define iterate_and_advance(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ - struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - if (skip == bvec->bv_len) { \ - bvec++; \ - skip = 0; \ - } \ - i->nr_segs -= bvec - i->bvec; \ - i->bvec = bvec; \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - if (skip == kvec->iov_len) { \ - kvec++; \ - skip = 0; \ - } \ - i->nr_segs -= kvec - i->kvec; \ - i->kvec = kvec; \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ - if (skip == iov->iov_len) { \ - iov++; \ - skip = 0; \ + if (unlikely(i->count < n)) \ + n = i->count; \ + if (i->count) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec; \ + struct bio_vec v; \ + iterate_bvec(i, n, v, bvec, skip, (B)) \ + if (skip == bvec->bv_len) { \ + bvec++; \ + skip = 0; \ + } \ + i->nr_segs -= bvec - i->bvec; \ + i->bvec = bvec; \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + if (skip == kvec->iov_len) { \ + kvec++; \ + skip = 0; \ + } \ + i->nr_segs -= kvec - i->kvec; \ + i->kvec = kvec; \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + if (skip == iov->iov_len) { \ + iov++; \ + skip = 0; \ + } \ + i->nr_segs -= iov - i->iov; \ + i->iov = iov; \ } \ - i->nr_segs -= iov - i->iov; \ - i->iov = iov; \ + i->count -= n; \ + i->iov_offset = skip; \ } \ - i->count -= n; \ - i->iov_offset = skip; \ } static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, @@ -386,12 +390,6 @@ static void memzero_page(struct page *page, size_t offset, size_t len) size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - iterate_and_advance(i, bytes, v, __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), @@ -407,12 +405,6 @@ EXPORT_SYMBOL(copy_to_iter); size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - iterate_and_advance(i, bytes, v, __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -428,12 +420,6 @@ EXPORT_SYMBOL(copy_from_iter); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - iterate_and_advance(i, bytes, v, __copy_from_user_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -474,12 +460,6 @@ EXPORT_SYMBOL(copy_page_from_iter); size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - iterate_and_advance(i, bytes, v, __clear_user(v.iov_base, v.iov_len), memzero_page(v.bv_page, v.bv_offset, v.bv_len), @@ -685,12 +665,6 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, char *to = addr; __wsum sum, next; size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - sum = *csum; iterate_and_advance(i, bytes, v, ({ int err = 0; @@ -729,12 +703,6 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, const char *from = addr; __wsum sum, next; size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - sum = *csum; iterate_and_advance(i, bytes, v, ({ int err = 0; diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index eb15e7dc7..747606f9e 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -20,6 +20,8 @@ #include <linux/bitops.h> #include <linux/count_zeros.h> +#include <linux/byteorder/generic.h> +#include <linux/string.h> #include "mpi-internal.h" #define MAX_EXTERN_MPI_BITS 16384 @@ -163,7 +165,13 @@ int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, int *sign) { uint8_t *p; - mpi_limb_t alimb; +#if BYTES_PER_MPI_LIMB == 4 + __be32 alimb; +#elif BYTES_PER_MPI_LIMB == 8 + __be64 alimb; +#else +#error please implement for this limb size. +#endif unsigned int n = mpi_get_size(a); int i, lzeros; @@ -183,38 +191,19 @@ int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, p = buf; *nbytes = n - lzeros; - for (i = a->nlimbs - 1; i >= 0; i--) { - alimb = a->d[i]; + for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB, + lzeros %= BYTES_PER_MPI_LIMB; + i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 - *p++ = alimb >> 24; - *p++ = alimb >> 16; - *p++ = alimb >> 8; - *p++ = alimb; + alimb = cpu_to_be32(a->d[i]); #elif BYTES_PER_MPI_LIMB == 8 - *p++ = alimb >> 56; - *p++ = alimb >> 48; - *p++ = alimb >> 40; - *p++ = alimb >> 32; - *p++ = alimb >> 24; - *p++ = alimb >> 16; - *p++ = alimb >> 8; - *p++ = alimb; + alimb = cpu_to_be64(a->d[i]); #else #error please implement for this limb size. #endif - - if (lzeros > 0) { - if (lzeros >= sizeof(alimb)) { - p -= sizeof(alimb); - } else { - mpi_limb_t *limb1 = (void *)p - sizeof(alimb); - mpi_limb_t *limb2 = (void *)p - sizeof(alimb) - + lzeros; - *limb1 = *limb2; - p -= lzeros; - } - lzeros -= sizeof(alimb); - } + memcpy(p, (u8 *)&alimb + lzeros, BYTES_PER_MPI_LIMB - lzeros); + p += BYTES_PER_MPI_LIMB - lzeros; + lzeros = 0; } return 0; } @@ -359,7 +348,13 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, int *sign) { u8 *p, *p2; - mpi_limb_t alimb, alimb2; +#if BYTES_PER_MPI_LIMB == 4 + __be32 alimb; +#elif BYTES_PER_MPI_LIMB == 8 + __be64 alimb; +#else +#error please implement for this limb size. +#endif unsigned int n = mpi_get_size(a); int i, x, y = 0, lzeros, buf_len; @@ -380,42 +375,22 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, buf_len = sgl->length; p2 = sg_virt(sgl); - for (i = a->nlimbs - 1; i >= 0; i--) { - alimb = a->d[i]; - p = (u8 *)&alimb2; + for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB, + lzeros %= BYTES_PER_MPI_LIMB; + i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 - *p++ = alimb >> 24; - *p++ = alimb >> 16; - *p++ = alimb >> 8; - *p++ = alimb; + alimb = cpu_to_be32(a->d[i]); #elif BYTES_PER_MPI_LIMB == 8 - *p++ = alimb >> 56; - *p++ = alimb >> 48; - *p++ = alimb >> 40; - *p++ = alimb >> 32; - *p++ = alimb >> 24; - *p++ = alimb >> 16; - *p++ = alimb >> 8; - *p++ = alimb; + alimb = cpu_to_be64(a->d[i]); #else #error please implement for this limb size. #endif - if (lzeros > 0) { - if (lzeros >= sizeof(alimb)) { - p -= sizeof(alimb); - continue; - } else { - mpi_limb_t *limb1 = (void *)p - sizeof(alimb); - mpi_limb_t *limb2 = (void *)p - sizeof(alimb) - + lzeros; - *limb1 = *limb2; - p -= lzeros; - y = lzeros; - } - lzeros -= sizeof(alimb); + if (lzeros) { + y = lzeros; + lzeros = 0; } - p = p - (sizeof(alimb) - y); + p = (u8 *)&alimb + y; for (x = 0; x < sizeof(alimb) - y; x++) { if (!buf_len) { @@ -443,15 +418,15 @@ EXPORT_SYMBOL_GPL(mpi_write_to_sgl); * a new MPI and reads the content of the sgl to the MPI. * * @sgl: scatterlist to read from - * @len: number of bytes to read + * @nbytes: number of bytes to read * * Return: Pointer to a new MPI or NULL on error */ -MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len) +MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) { struct scatterlist *sg; int x, i, j, z, lzeros, ents; - unsigned int nbits, nlimbs, nbytes; + unsigned int nbits, nlimbs; mpi_limb_t a; MPI val = NULL; @@ -472,16 +447,12 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len) break; ents--; + nbytes -= lzeros; lzeros = 0; } sgl = sg; - - if (!ents) - nbytes = 0; - else - nbytes = len - lzeros; - + nbytes -= lzeros; nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); @@ -489,9 +460,8 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len) } if (nbytes > 0) - nbits -= count_leading_zeros(*(u8 *)(sg_virt(sgl) + lzeros)); - else - nbits = 0; + nbits -= count_leading_zeros(*(u8 *)(sg_virt(sgl) + lzeros)) - + (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); @@ -507,19 +477,14 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len) j = nlimbs - 1; a = 0; - z = 0; - x = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; - x %= BYTES_PER_MPI_LIMB; + z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; + z %= BYTES_PER_MPI_LIMB; for_each_sg(sgl, sg, ents, i) { const u8 *buffer = sg_virt(sg) + lzeros; int len = sg->length - lzeros; - int buf_shift = x; - - if (sg_is_last(sg) && (len % BYTES_PER_MPI_LIMB)) - len += BYTES_PER_MPI_LIMB - (len % BYTES_PER_MPI_LIMB); - for (; x < len + buf_shift; x++) { + for (x = 0; x < len; x++) { a <<= 8; a |= *buffer++; if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) { @@ -528,7 +493,6 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len) } } z += x; - x = 0; lzeros = 0; } return val; diff --git a/lib/nlattr.c b/lib/nlattr.c index f5907d232..fce1e9afc 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -355,6 +355,30 @@ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) EXPORT_SYMBOL(__nla_reserve); /** + * __nla_reserve_64bit - reserve room for attribute on the skb and align it + * @skb: socket buffer to reserve room on + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @padattr: attribute type for the padding + * + * Adds a netlink attribute header to a socket buffer and reserves + * room for the payload but does not copy it. It also ensure that this + * attribute will have a 64-bit aligned nla_data() area. + * + * The caller is responsible to ensure that the skb provides enough + * tailroom for the attribute header and payload. + */ +struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, + int attrlen, int padattr) +{ + if (nla_need_padding_for_64bit(skb)) + nla_align_64bit(skb, padattr); + + return __nla_reserve(skb, attrtype, attrlen); +} +EXPORT_SYMBOL(__nla_reserve_64bit); + +/** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload @@ -397,6 +421,36 @@ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) EXPORT_SYMBOL(nla_reserve); /** + * nla_reserve_64bit - reserve room for attribute on the skb and align it + * @skb: socket buffer to reserve room on + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @padattr: attribute type for the padding + * + * Adds a netlink attribute header to a socket buffer and reserves + * room for the payload but does not copy it. It also ensure that this + * attribute will have a 64-bit aligned nla_data() area. + * + * Returns NULL if the tailroom of the skb is insufficient to store + * the attribute header and payload. + */ +struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, + int padattr) +{ + size_t len; + + if (nla_need_padding_for_64bit(skb)) + len = nla_total_size_64bit(attrlen); + else + len = nla_total_size(attrlen); + if (unlikely(skb_tailroom(skb) < len)) + return NULL; + + return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); +} +EXPORT_SYMBOL(nla_reserve_64bit); + +/** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload @@ -436,6 +490,27 @@ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, EXPORT_SYMBOL(__nla_put); /** + * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @data: head of attribute payload + * @padattr: attribute type for the padding + * + * The caller is responsible to ensure that the skb provides enough + * tailroom for the attribute header and payload. + */ +void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, + const void *data, int padattr) +{ + struct nlattr *nla; + + nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); + memcpy(nla_data(nla), data, attrlen); +} +EXPORT_SYMBOL(__nla_put_64bit); + +/** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload @@ -474,6 +549,34 @@ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) EXPORT_SYMBOL(nla_put); /** + * nla_put_64bit - Add a netlink attribute to a socket buffer and align it + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @data: head of attribute payload + * @padattr: attribute type for the padding + * + * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store + * the attribute header and payload. + */ +int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, + const void *data, int padattr) +{ + size_t len; + + if (nla_need_padding_for_64bit(skb)) + len = nla_total_size_64bit(attrlen); + else + len = nla_total_size(attrlen); + if (unlikely(skb_tailroom(skb) < len)) + return -EMSGSIZE; + + __nla_put_64bit(skb, attrtype, attrlen, data, padattr); + return 0; +} +EXPORT_SYMBOL(nla_put_64bit); + +/** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 6019c53c6..26caf51cc 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,33 +16,14 @@ #include <linux/delay.h> #include <linux/kprobes.h> #include <linux/nmi.h> -#include <linux/seq_buf.h> #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) -{ - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); -} - /* * When raise() is called it will be is passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() @@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end) void nmi_trigger_all_cpu_backtrace(bool include_self, void (*raise)(cpumask_t *mask)) { - struct nmi_seq_buf *s; - int i, cpu, this_cpu = get_cpu(); + int i, this_cpu = get_cpu(); if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, if (!include_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("Sending NMI to %s CPUs:\n", (include_self ? "all" : "other")); @@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, } /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. */ - for_each_cpu(cpu, &printtrace_mask) { - int len, last_i = 0; + printk_nmi_flush(); - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); + clear_bit_unlock(0, &backtrace_flag); put_cpu(); } -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. - */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; -} - bool nmi_cpu_backtrace(struct pt_regs *regs) { int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); pr_warn("NMI backtrace for cpu %d\n", cpu); if (regs) show_regs(regs); else dump_stack(); - this_cpu_write(printk_func, printk_func_save); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } diff --git a/lib/nodemask.c b/lib/nodemask.c new file mode 100644 index 000000000..e42a5bf44 --- /dev/null +++ b/lib/nodemask.c @@ -0,0 +1,30 @@ +#include <linux/nodemask.h> +#include <linux/module.h> +#include <linux/random.h> + +int __next_node_in(int node, const nodemask_t *srcp) +{ + int ret = __next_node(node, srcp); + + if (ret == MAX_NUMNODES) + ret = __first_node(srcp); + return ret; +} +EXPORT_SYMBOL(__next_node_in); + +#ifdef CONFIG_NUMA +/* + * Return the bit number of a random bit set in the nodemask. + * (returns NUMA_NO_NODE if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = NUMA_NO_NODE; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} +#endif diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index f051d69f0..72d36113c 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -19,7 +19,7 @@ static DEFINE_SPINLOCK(percpu_counters_lock); static struct debug_obj_descr percpu_counter_debug_descr; -static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) +static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; @@ -27,9 +27,9 @@ static int percpu_counter_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); - return 1; + return true; default: - return 0; + return false; } } diff --git a/lib/proportions.c b/lib/proportions.c deleted file mode 100644 index efa54f259..000000000 --- a/lib/proportions.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Floating proportions - * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * Description: - * - * The floating proportion is a time derivative with an exponentially decaying - * history: - * - * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i) - * - * Where j is an element from {prop_local}, x_{j} is j's number of events, - * and i the time period over which the differential is taken. So d/dt_{-i} is - * the differential over the i-th last period. - * - * The decaying history gives smooth transitions. The time differential carries - * the notion of speed. - * - * The denominator is 2^(1+i) because we want the series to be normalised, ie. - * - * \Sum_{i=0} 1/2^(1+i) = 1 - * - * Further more, if we measure time (t) in the same events as x; so that: - * - * t = \Sum_{j} x_{j} - * - * we get that: - * - * \Sum_{j} p_{j} = 1 - * - * Writing this in an iterative fashion we get (dropping the 'd's): - * - * if (++x_{j}, ++t > period) - * t /= 2; - * for_each (j) - * x_{j} /= 2; - * - * so that: - * - * p_{j} = x_{j} / t; - * - * We optimize away the '/= 2' for the global time delta by noting that: - * - * if (++t > period) t /= 2: - * - * Can be approximated by: - * - * period/2 + (++t % period/2) - * - * [ Furthermore, when we choose period to be 2^n it can be written in terms of - * binary operations and wraparound artefacts disappear. ] - * - * Also note that this yields a natural counter of the elapsed periods: - * - * c = t / (period/2) - * - * [ Its monotonic increasing property can be applied to mitigate the wrap- - * around issue. ] - * - * This allows us to do away with the loop over all prop_locals on each period - * expiration. By remembering the period count under which it was last accessed - * as c_{j}, we can obtain the number of 'missed' cycles from: - * - * c - c_{j} - * - * We can then lazily catch up to the global period count every time we are - * going to use x_{j}, by doing: - * - * x_{j} /= 2^(c - c_{j}), c_{j} = c - */ - -#include <linux/proportions.h> -#include <linux/rcupdate.h> - -int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) -{ - int err; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - pd->index = 0; - pd->pg[0].shift = shift; - mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0, gfp); - if (err) - goto out; - - err = percpu_counter_init(&pd->pg[1].events, 0, gfp); - if (err) - percpu_counter_destroy(&pd->pg[0].events); - -out: - return err; -} - -/* - * We have two copies, and flip between them to make it seem like an atomic - * update. The update is not really atomic wrt the events counter, but - * it is internally consistent with the bit layout depending on shift. - * - * We copy the events count, move the bits around and flip the index. - */ -void prop_change_shift(struct prop_descriptor *pd, int shift) -{ - int index; - int offset; - u64 events; - unsigned long flags; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - mutex_lock(&pd->mutex); - - index = pd->index ^ 1; - offset = pd->pg[pd->index].shift - shift; - if (!offset) - goto out; - - pd->pg[index].shift = shift; - - local_irq_save(flags); - events = percpu_counter_sum(&pd->pg[pd->index].events); - if (offset < 0) - events <<= -offset; - else - events >>= offset; - percpu_counter_set(&pd->pg[index].events, events); - - /* - * ensure the new pg is fully written before the switch - */ - smp_wmb(); - pd->index = index; - local_irq_restore(flags); - - synchronize_rcu(); - -out: - mutex_unlock(&pd->mutex); -} - -/* - * wrap the access to the data in an rcu_read_lock() section; - * this is used to track the active references. - */ -static struct prop_global *prop_get_global(struct prop_descriptor *pd) -__acquires(RCU) -{ - int index; - - rcu_read_lock(); - index = pd->index; - /* - * match the wmb from vcd_flip() - */ - smp_rmb(); - return &pd->pg[index]; -} - -static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg) -__releases(RCU) -{ - rcu_read_unlock(); -} - -static void -prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) -{ - int offset = *pl_shift - new_shift; - - if (!offset) - return; - - if (offset < 0) - *pl_period <<= -offset; - else - *pl_period >>= offset; - - *pl_shift = new_shift; -} - -/* - * PERCPU - */ - -#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) - -int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - return percpu_counter_init(&pl->events, 0, gfp); -} - -void prop_local_destroy_percpu(struct prop_local_percpu *pl) -{ - percpu_counter_destroy(&pl->events); -} - -/* - * Catch up with missed period expirations. - * - * until (c_{j} == c) - * x_{j} -= x_{j}/2; - * c_{j}++; - */ -static -void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - - /* - * For each missed period, we half the local counter. - * basically: - * pl->events >> (global_period - pl->period); - */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (period < BITS_PER_LONG) { - s64 val = percpu_counter_read(&pl->events); - - if (val < (nr_cpu_ids * PROP_BATCH)) - val = percpu_counter_sum(&pl->events); - - __percpu_counter_add(&pl->events, -val + (val >> period), - PROP_BATCH); - } else - percpu_counter_set(&pl->events, 0); - - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - __percpu_counter_add(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * identical to __prop_inc_percpu, except that it limits this pl's fraction to - * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded. - */ -void __prop_inc_percpu_max(struct prop_descriptor *pd, - struct prop_local_percpu *pl, long frac) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - - if (unlikely(frac != PROP_FRAC_BASE)) { - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - long numerator, denominator; - - numerator = percpu_counter_read_positive(&pl->events); - global_count = percpu_counter_read(&pg->events); - denominator = period_2 + (global_count & counter_mask); - - if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT)) - goto out_put; - } - - percpu_counter_add(&pl->events, 1); - percpu_counter_add(&pg->events, 1); - -out_put: - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_percpu(struct prop_descriptor *pd, - struct prop_local_percpu *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_percpu(pg, pl); - *numerator = percpu_counter_read_positive(&pl->events); - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} - -/* - * SINGLE - */ - -int prop_local_init_single(struct prop_local_single *pl) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - pl->events = 0; - return 0; -} - -void prop_local_destroy_single(struct prop_local_single *pl) -{ -} - -/* - * Catch up with missed period expirations. - */ -static -void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - /* - * For each missed period, we half the local counter. - */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (likely(period < BITS_PER_LONG)) - pl->events >>= period; - else - pl->events = 0; - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_single(pg, pl); - pl->events++; - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_single(struct prop_descriptor *pd, - struct prop_local_single *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_single(pg, pl); - *numerator = pl->events; - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 1624c4117..bc7852f95 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -4,6 +4,8 @@ * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov + * Copyright (C) 2016 Intel, Matthew Wilcox + * Copyright (C) 2016 Intel, Ross Zwisler * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -37,12 +39,6 @@ /* - * The height_to_maxindex array needs to be one deeper than the maximum - * path as height 0 holds only 1 entry. - */ -static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; - -/* * Radix tree node cache. */ static struct kmem_cache *radix_tree_node_cachep; @@ -64,20 +60,58 @@ static struct kmem_cache *radix_tree_node_cachep; * Per-cpu pool of preloaded nodes */ struct radix_tree_preload { - int nr; + unsigned nr; /* nodes->private_data points to next preallocated node */ struct radix_tree_node *nodes; }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; -static inline void *ptr_to_indirect(void *ptr) +static inline void *node_to_entry(void *ptr) +{ + return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); +} + +#define RADIX_TREE_RETRY node_to_entry(NULL) + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/* Sibling slots point directly to another slot in the same node */ +static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node) +{ + void **ptr = node; + return (parent->slots <= ptr) && + (ptr < parent->slots + RADIX_TREE_MAP_SIZE); +} +#else +static inline bool is_sibling_entry(struct radix_tree_node *parent, void *node) +{ + return false; +} +#endif + +static inline unsigned long get_slot_offset(struct radix_tree_node *parent, + void **slot) { - return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); + return slot - parent->slots; } -static inline void *indirect_to_ptr(void *ptr) +static unsigned int radix_tree_descend(struct radix_tree_node *parent, + struct radix_tree_node **nodep, unsigned long index) { - return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); + unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; + void **entry = rcu_dereference_raw(parent->slots[offset]); + +#ifdef CONFIG_RADIX_TREE_MULTIORDER + if (radix_tree_is_internal_node(entry)) { + unsigned long siboff = get_slot_offset(parent, entry); + if (siboff < RADIX_TREE_MAP_SIZE) { + offset = siboff; + entry = rcu_dereference_raw(parent->slots[offset]); + } + } +#endif + + *nodep = (void *)entry; + return offset; } static inline gfp_t root_gfp_mask(struct radix_tree_root *root) @@ -108,7 +142,7 @@ static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); } -static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) +static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); } @@ -120,7 +154,12 @@ static inline void root_tag_clear_all(struct radix_tree_root *root) static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) { - return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); + return (__force int)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline unsigned root_tags_get(struct radix_tree_root *root) +{ + return (__force unsigned)root->gfp_mask >> __GFP_BITS_SHIFT; } /* @@ -129,7 +168,7 @@ static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) */ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) { - int idx; + unsigned idx; for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { if (node->tags[tag][idx]) return 1; @@ -173,38 +212,45 @@ radix_tree_find_next_bit(const unsigned long *addr, return size; } -#if 0 -static void dump_node(void *slot, int height, int offset) +#ifndef __KERNEL__ +static void dump_node(struct radix_tree_node *node, unsigned long index) { - struct radix_tree_node *node; - int i; + unsigned long i; - if (!slot) - return; + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", + node, node->offset, + node->tags[0][0], node->tags[1][0], node->tags[2][0], + node->shift, node->count, node->parent); - if (height == 0) { - pr_debug("radix entry %p offset %d\n", slot, offset); - return; + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + unsigned long first = index | (i << node->shift); + unsigned long last = first | ((1UL << node->shift) - 1); + void *entry = node->slots[i]; + if (!entry) + continue; + if (is_sibling_entry(node, entry)) { + pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", + entry, i, + *(void **)entry_to_node(entry), + first, last); + } else if (!radix_tree_is_internal_node(entry)) { + pr_debug("radix entry %p offset %ld indices %ld-%ld\n", + entry, i, first, last); + } else { + dump_node(entry_to_node(entry), first); + } } - - node = indirect_to_ptr(slot); - pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n", - slot, offset, node->tags[0][0], node->tags[1][0], - node->tags[2][0], node->path, node->count, node->parent); - - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) - dump_node(node->slots[i], height - 1, i); } /* For debug */ static void radix_tree_dump(struct radix_tree_root *root) { - pr_debug("radix root: %p height %d rnode %p tags %x\n", - root, root->height, root->rnode, + pr_debug("radix root: %p rnode %p tags %x\n", + root, root->rnode, root->gfp_mask >> __GFP_BITS_SHIFT); - if (!radix_tree_is_indirect_ptr(root->rnode)) + if (!radix_tree_is_internal_node(root->rnode)) return; - dump_node(root->rnode, root->height, 0); + dump_node(entry_to_node(root->rnode), 0); } #endif @@ -219,19 +265,20 @@ radix_tree_node_alloc(struct radix_tree_root *root) gfp_t gfp_mask = root_gfp_mask(root); /* - * Preload code isn't irq safe and it doesn't make sence to use - * preloading in the interrupt anyway as all the allocations have to - * be atomic. So just do normal allocation when in interrupt. + * Preload code isn't irq safe and it doesn't make sense to use + * preloading during an interrupt anyway as all the allocations have + * to be atomic. So just do normal allocation when in interrupt. */ if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* * Even if the caller has preloaded, try to allocate from the - * cache first for the new node to get accounted. + * cache first for the new node to get accounted to the memory + * cgroup. */ ret = kmem_cache_alloc(radix_tree_node_cachep, - gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN); + gfp_mask | __GFP_NOWARN); if (ret) goto out; @@ -254,10 +301,9 @@ radix_tree_node_alloc(struct radix_tree_root *root) kmemleak_update_trace(ret); goto out; } - ret = kmem_cache_alloc(radix_tree_node_cachep, - gfp_mask | __GFP_ACCOUNT); + ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: - BUG_ON(radix_tree_is_indirect_ptr(ret)); + BUG_ON(radix_tree_is_internal_node(ret)); return ret; } @@ -302,6 +348,12 @@ static int __radix_tree_preload(gfp_t gfp_mask) struct radix_tree_node *node; int ret = -ENOMEM; + /* + * Nodes preloaded by one cgroup can be be used by another cgroup, so + * they should never be accounted to any particular memory cgroup. + */ + gfp_mask &= ~__GFP_ACCOUNT; + preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { @@ -357,38 +409,58 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) EXPORT_SYMBOL(radix_tree_maybe_preload); /* - * Return the maximum key which can be store into a - * radix tree with height HEIGHT. + * The maximum index which can be stored in a radix tree */ -static inline unsigned long radix_tree_maxindex(unsigned int height) +static inline unsigned long shift_maxindex(unsigned int shift) +{ + return (RADIX_TREE_MAP_SIZE << shift) - 1; +} + +static inline unsigned long node_maxindex(struct radix_tree_node *node) +{ + return shift_maxindex(node->shift); +} + +static unsigned radix_tree_load_root(struct radix_tree_root *root, + struct radix_tree_node **nodep, unsigned long *maxindex) { - return height_to_maxindex[height]; + struct radix_tree_node *node = rcu_dereference_raw(root->rnode); + + *nodep = node; + + if (likely(radix_tree_is_internal_node(node))) { + node = entry_to_node(node); + *maxindex = node_maxindex(node); + return node->shift + RADIX_TREE_MAP_SHIFT; + } + + *maxindex = 0; + return 0; } /* * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, - unsigned long index, unsigned order) + unsigned long index, unsigned int shift) { - struct radix_tree_node *node; struct radix_tree_node *slot; - unsigned int height; + unsigned int maxshift; int tag; - /* Figure out what the height should be. */ - height = root->height + 1; - while (index > radix_tree_maxindex(height)) - height++; + /* Figure out what the shift should be. */ + maxshift = shift; + while (index > shift_maxindex(maxshift)) + maxshift += RADIX_TREE_MAP_SHIFT; - if ((root->rnode == NULL) && (order == 0)) { - root->height = height; + slot = root->rnode; + if (!slot) goto out; - } do { - unsigned int newheight; - if (!(node = radix_tree_node_alloc(root))) + struct radix_tree_node *node = radix_tree_node_alloc(root); + + if (!node) return -ENOMEM; /* Propagate the aggregated tag info into the new root */ @@ -397,25 +469,20 @@ static int radix_tree_extend(struct radix_tree_root *root, tag_set(node, tag, 0); } - /* Increase the height. */ - newheight = root->height+1; - BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK); - node->path = newheight; + BUG_ON(shift > BITS_PER_LONG); + node->shift = shift; + node->offset = 0; node->count = 1; node->parent = NULL; - slot = root->rnode; - if (radix_tree_is_indirect_ptr(slot) && newheight > 1) { - slot = indirect_to_ptr(slot); - slot->parent = node; - slot = ptr_to_indirect(slot); - } + if (radix_tree_is_internal_node(slot)) + entry_to_node(slot)->parent = node; node->slots[0] = slot; - node = ptr_to_indirect(node); - rcu_assign_pointer(root->rnode, node); - root->height = newheight; - } while (height > root->height); + slot = node_to_entry(node); + rcu_assign_pointer(root->rnode, slot); + shift += RADIX_TREE_MAP_SHIFT; + } while (shift <= maxshift); out: - return 0; + return maxshift + RADIX_TREE_MAP_SHIFT; } /** @@ -439,71 +506,70 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, unsigned order, struct radix_tree_node **nodep, void ***slotp) { - struct radix_tree_node *node = NULL, *slot; - unsigned int height, shift, offset; - int error; + struct radix_tree_node *node = NULL, *child; + void **slot = (void **)&root->rnode; + unsigned long maxindex; + unsigned int shift, offset = 0; + unsigned long max = index | ((1UL << order) - 1); - BUG_ON((0 < order) && (order < RADIX_TREE_MAP_SHIFT)); + shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ - if (index > radix_tree_maxindex(root->height)) { - error = radix_tree_extend(root, index, order); - if (error) + if (max > maxindex) { + int error = radix_tree_extend(root, max, shift); + if (error < 0) return error; + shift = error; + child = root->rnode; + if (order == shift) + shift += RADIX_TREE_MAP_SHIFT; } - slot = root->rnode; - - height = root->height; - shift = height * RADIX_TREE_MAP_SHIFT; - - offset = 0; /* uninitialised var warning */ while (shift > order) { - if (slot == NULL) { + shift -= RADIX_TREE_MAP_SHIFT; + if (child == NULL) { /* Have to add a child node. */ - if (!(slot = radix_tree_node_alloc(root))) + child = radix_tree_node_alloc(root); + if (!child) return -ENOMEM; - slot->path = height; - slot->parent = node; - if (node) { - rcu_assign_pointer(node->slots[offset], - ptr_to_indirect(slot)); + child->shift = shift; + child->offset = offset; + child->parent = node; + rcu_assign_pointer(*slot, node_to_entry(child)); + if (node) node->count++; - slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT; - } else - rcu_assign_pointer(root->rnode, - ptr_to_indirect(slot)); - } else if (!radix_tree_is_indirect_ptr(slot)) + } else if (!radix_tree_is_internal_node(child)) break; /* Go a level down */ - height--; - shift -= RADIX_TREE_MAP_SHIFT; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - node = indirect_to_ptr(slot); - slot = node->slots[offset]; + node = entry_to_node(child); + offset = radix_tree_descend(node, &child, index); + slot = &node->slots[offset]; } +#ifdef CONFIG_RADIX_TREE_MULTIORDER /* Insert pointers to the canonical entry */ - if ((shift - order) > 0) { - int i, n = 1 << (shift - order); + if (order > shift) { + unsigned i, n = 1 << (order - shift); offset = offset & ~(n - 1); - slot = ptr_to_indirect(&node->slots[offset]); + slot = &node->slots[offset]; + child = node_to_entry(slot); for (i = 0; i < n; i++) { - if (node->slots[offset + i]) + if (slot[i]) return -EEXIST; } for (i = 1; i < n; i++) { - rcu_assign_pointer(node->slots[offset + i], slot); + rcu_assign_pointer(slot[i], child); node->count++; } } +#endif if (nodep) *nodep = node; if (slotp) - *slotp = node ? node->slots + offset : (void **)&root->rnode; + *slotp = slot; return 0; } @@ -523,7 +589,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, void **slot; int error; - BUG_ON(radix_tree_is_indirect_ptr(item)); + BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, order, &node, &slot); if (error) @@ -533,12 +599,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, rcu_assign_pointer(*slot, item); if (node) { + unsigned offset = get_slot_offset(node, slot); node->count++; - BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK)); - BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK)); + BUG_ON(tag_get(node, 0, offset)); + BUG_ON(tag_get(node, 1, offset)); + BUG_ON(tag_get(node, 2, offset)); } else { - BUG_ON(root_tag_get(root, 0)); - BUG_ON(root_tag_get(root, 1)); + BUG_ON(root_tags_get(root)); } return 0; @@ -563,44 +630,25 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp) { struct radix_tree_node *node, *parent; - unsigned int height, shift; + unsigned long maxindex; void **slot; - node = rcu_dereference_raw(root->rnode); - if (node == NULL) + restart: + parent = NULL; + slot = (void **)&root->rnode; + radix_tree_load_root(root, &node, &maxindex); + if (index > maxindex) return NULL; - if (!radix_tree_is_indirect_ptr(node)) { - if (index > 0) - return NULL; + while (radix_tree_is_internal_node(node)) { + unsigned offset; - if (nodep) - *nodep = NULL; - if (slotp) - *slotp = (void **)&root->rnode; - return node; + if (node == RADIX_TREE_RETRY) + goto restart; + parent = entry_to_node(node); + offset = radix_tree_descend(parent, &node, index); + slot = parent->slots + offset; } - node = indirect_to_ptr(node); - - height = node->path & RADIX_TREE_HEIGHT_MASK; - if (index > radix_tree_maxindex(height)) - return NULL; - - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - do { - parent = node; - slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK); - node = rcu_dereference_raw(*slot); - if (node == NULL) - return NULL; - if (!radix_tree_is_indirect_ptr(node)) - break; - node = indirect_to_ptr(node); - - shift -= RADIX_TREE_MAP_SHIFT; - height--; - } while (height > 0); if (nodep) *nodep = parent; @@ -654,59 +702,72 @@ EXPORT_SYMBOL(radix_tree_lookup); * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root * @index: index key - * @tag: tag index + * @tag: tag index * * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. From * the root all the way down to the leaf node. * - * Returns the address of the tagged item. Setting a tag on a not-present + * Returns the address of the tagged item. Setting a tag on a not-present * item is a bug. */ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag) { - unsigned int height, shift; - struct radix_tree_node *slot; + struct radix_tree_node *node, *parent; + unsigned long maxindex; - height = root->height; - BUG_ON(index > radix_tree_maxindex(height)); + radix_tree_load_root(root, &node, &maxindex); + BUG_ON(index > maxindex); - slot = indirect_to_ptr(root->rnode); - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + while (radix_tree_is_internal_node(node)) { + unsigned offset; - while (height > 0) { - int offset; + parent = entry_to_node(node); + offset = radix_tree_descend(parent, &node, index); + BUG_ON(!node); - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - if (!tag_get(slot, tag, offset)) - tag_set(slot, tag, offset); - slot = slot->slots[offset]; - BUG_ON(slot == NULL); - if (!radix_tree_is_indirect_ptr(slot)) - break; - slot = indirect_to_ptr(slot); - shift -= RADIX_TREE_MAP_SHIFT; - height--; + if (!tag_get(parent, tag, offset)) + tag_set(parent, tag, offset); } /* set the root's tag bit */ - if (slot && !root_tag_get(root, tag)) + if (!root_tag_get(root, tag)) root_tag_set(root, tag); - return slot; + return node; } EXPORT_SYMBOL(radix_tree_tag_set); +static void node_tag_clear(struct radix_tree_root *root, + struct radix_tree_node *node, + unsigned int tag, unsigned int offset) +{ + while (node) { + if (!tag_get(node, tag, offset)) + return; + tag_clear(node, tag, offset); + if (any_tag_set(node, tag)) + return; + + offset = node->offset; + node = node->parent; + } + + /* clear the root's tag bit */ + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root * @index: index key - * @tag: tag index + * @tag: tag index * * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) - * corresponding to @index in the radix tree. If - * this causes the leaf node to have no tags set then clear the tag in the + * corresponding to @index in the radix tree. If this causes + * the leaf node to have no tags set then clear the tag in the * next-to-leaf node, etc. * * Returns the address of the tagged item on success, else NULL. ie: @@ -715,52 +776,25 @@ EXPORT_SYMBOL(radix_tree_tag_set); void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { - struct radix_tree_node *node = NULL; - struct radix_tree_node *slot = NULL; - unsigned int height, shift; + struct radix_tree_node *node, *parent; + unsigned long maxindex; int uninitialized_var(offset); - height = root->height; - if (index > radix_tree_maxindex(height)) - goto out; - - shift = height * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; + radix_tree_load_root(root, &node, &maxindex); + if (index > maxindex) + return NULL; - while (shift) { - if (slot == NULL) - goto out; - if (!radix_tree_is_indirect_ptr(slot)) - break; - slot = indirect_to_ptr(slot); + parent = NULL; - shift -= RADIX_TREE_MAP_SHIFT; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - node = slot; - slot = slot->slots[offset]; + while (radix_tree_is_internal_node(node)) { + parent = entry_to_node(node); + offset = radix_tree_descend(parent, &node, index); } - if (slot == NULL) - goto out; + if (node) + node_tag_clear(root, parent, tag, offset); - while (node) { - if (!tag_get(node, tag, offset)) - goto out; - tag_clear(node, tag, offset); - if (any_tag_set(node, tag)) - goto out; - - index >>= RADIX_TREE_MAP_SHIFT; - offset = index & RADIX_TREE_MAP_MASK; - node = node->parent; - } - - /* clear the root's tag bit */ - if (root_tag_get(root, tag)) - root_tag_clear(root, tag); - -out: - return slot; + return node; } EXPORT_SYMBOL(radix_tree_tag_clear); @@ -768,7 +802,7 @@ EXPORT_SYMBOL(radix_tree_tag_clear); * radix_tree_tag_get - get a tag on a radix tree node * @root: radix tree root * @index: index key - * @tag: tag index (< RADIX_TREE_MAX_TAGS) + * @tag: tag index (< RADIX_TREE_MAX_TAGS) * * Return values: * @@ -782,48 +816,44 @@ EXPORT_SYMBOL(radix_tree_tag_clear); int radix_tree_tag_get(struct radix_tree_root *root, unsigned long index, unsigned int tag) { - unsigned int height, shift; - struct radix_tree_node *node; + struct radix_tree_node *node, *parent; + unsigned long maxindex; - /* check the root's tag bit */ if (!root_tag_get(root, tag)) return 0; - node = rcu_dereference_raw(root->rnode); - if (node == NULL) + radix_tree_load_root(root, &node, &maxindex); + if (index > maxindex) return 0; - - if (!radix_tree_is_indirect_ptr(node)) - return (index == 0); - node = indirect_to_ptr(node); - - height = node->path & RADIX_TREE_HEIGHT_MASK; - if (index > radix_tree_maxindex(height)) + if (node == NULL) return 0; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + while (radix_tree_is_internal_node(node)) { + unsigned offset; - for ( ; ; ) { - int offset; + parent = entry_to_node(node); + offset = radix_tree_descend(parent, &node, index); - if (node == NULL) + if (!node) return 0; - node = indirect_to_ptr(node); - - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - if (!tag_get(node, tag, offset)) + if (!tag_get(parent, tag, offset)) return 0; - if (height == 1) - return 1; - node = rcu_dereference_raw(node->slots[offset]); - if (!radix_tree_is_indirect_ptr(node)) - return 1; - shift -= RADIX_TREE_MAP_SHIFT; - height--; + if (node == RADIX_TREE_RETRY) + break; } + + return 1; } EXPORT_SYMBOL(radix_tree_tag_get); +static inline void __set_iter_shift(struct radix_tree_iter *iter, + unsigned int shift) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + iter->shift = shift; +#endif +} + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -835,9 +865,9 @@ EXPORT_SYMBOL(radix_tree_tag_get); void **radix_tree_next_chunk(struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags) { - unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; - struct radix_tree_node *rnode, *node; - unsigned long index, offset, height; + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *node, *child; + unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; @@ -855,33 +885,28 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (!index && iter->index) return NULL; - rnode = rcu_dereference_raw(root->rnode); - if (radix_tree_is_indirect_ptr(rnode)) { - rnode = indirect_to_ptr(rnode); - } else if (rnode && !index) { + restart: + radix_tree_load_root(root, &child, &maxindex); + if (index > maxindex) + return NULL; + if (!child) + return NULL; + + if (!radix_tree_is_internal_node(child)) { /* Single-slot tree */ - iter->index = 0; - iter->next_index = 1; + iter->index = index; + iter->next_index = maxindex + 1; iter->tags = 1; + __set_iter_shift(iter, 0); return (void **)&root->rnode; - } else - return NULL; - -restart: - height = rnode->path & RADIX_TREE_HEIGHT_MASK; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - offset = index >> shift; + } - /* Index outside of the tree */ - if (offset >= RADIX_TREE_MAP_SIZE) - return NULL; + do { + node = entry_to_node(child); + offset = radix_tree_descend(node, &child, index); - node = rnode; - while (1) { - struct radix_tree_node *slot; if ((flags & RADIX_TREE_ITER_TAGGED) ? - !test_bit(offset, node->tags[tag]) : - !node->slots[offset]) { + !tag_get(node, tag, offset) : !child) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; @@ -893,35 +918,30 @@ restart: offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { - if (node->slots[offset]) + void *slot = node->slots[offset]; + if (is_sibling_entry(node, slot)) + continue; + if (slot) break; } - index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); - index += offset << shift; + index &= ~node_maxindex(node); + index += offset << node->shift; /* Overflow after ~0UL */ if (!index) return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; + child = rcu_dereference_raw(node->slots[offset]); } - /* This is leaf-node */ - if (!shift) - break; - - slot = rcu_dereference_raw(node->slots[offset]); - if (slot == NULL) + if ((child == NULL) || (child == RADIX_TREE_RETRY)) goto restart; - if (!radix_tree_is_indirect_ptr(slot)) - break; - node = indirect_to_ptr(slot); - shift -= RADIX_TREE_MAP_SHIFT; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - } + } while (radix_tree_is_internal_node(child)); /* Update the iterator state */ - iter->index = index; - iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1; + iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); + iter->next_index = (index | node_maxindex(node)) + 1; + __set_iter_shift(iter, node->shift); /* Construct iter->tags bit-mask from node->tags[tag] array */ if (flags & RADIX_TREE_ITER_TAGGED) { @@ -967,7 +987,7 @@ EXPORT_SYMBOL(radix_tree_next_chunk); * set is outside the range we are scanning. This reults in dangling tags and * can lead to problems with later tag operations (e.g. livelocks on lookups). * - * The function returns number of leaves where the tag was set and sets + * The function returns the number of leaves where the tag was set and sets * *first_indexp to the first unscanned index. * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must * be prepared to handle that. @@ -977,14 +997,13 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, unsigned long nr_to_tag, unsigned int iftag, unsigned int settag) { - unsigned int height = root->height; - struct radix_tree_node *node = NULL; - struct radix_tree_node *slot; - unsigned int shift; + struct radix_tree_node *parent, *node, *child; + unsigned long maxindex; unsigned long tagged = 0; unsigned long index = *first_indexp; - last_index = min(last_index, radix_tree_maxindex(height)); + radix_tree_load_root(root, &child, &maxindex); + last_index = min(last_index, maxindex); if (index > last_index) return 0; if (!nr_to_tag) @@ -993,80 +1012,62 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, *first_indexp = last_index + 1; return 0; } - if (height == 0) { + if (!radix_tree_is_internal_node(child)) { *first_indexp = last_index + 1; root_tag_set(root, settag); return 1; } - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = indirect_to_ptr(root->rnode); + node = entry_to_node(child); for (;;) { - unsigned long upindex; - int offset; - - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - if (!slot->slots[offset]) + unsigned offset = radix_tree_descend(node, &child, index); + if (!child) goto next; - if (!tag_get(slot, iftag, offset)) + if (!tag_get(node, iftag, offset)) goto next; - if (shift) { - node = slot; - slot = slot->slots[offset]; - if (radix_tree_is_indirect_ptr(slot)) { - slot = indirect_to_ptr(slot); - shift -= RADIX_TREE_MAP_SHIFT; - continue; - } else { - slot = node; - node = node->parent; - } + /* Sibling slots never have tags set on them */ + if (radix_tree_is_internal_node(child)) { + node = entry_to_node(child); + continue; } /* tag the leaf */ - tagged += 1 << shift; - tag_set(slot, settag, offset); + tagged++; + tag_set(node, settag, offset); /* walk back up the path tagging interior nodes */ - upindex = index; - while (node) { - upindex >>= RADIX_TREE_MAP_SHIFT; - offset = upindex & RADIX_TREE_MAP_MASK; - + parent = node; + for (;;) { + offset = parent->offset; + parent = parent->parent; + if (!parent) + break; /* stop if we find a node with the tag already set */ - if (tag_get(node, settag, offset)) + if (tag_get(parent, settag, offset)) break; - tag_set(node, settag, offset); - node = node->parent; + tag_set(parent, settag, offset); } - - /* - * Small optimization: now clear that node pointer. - * Since all of this slot's ancestors now have the tag set - * from setting it above, we have no further need to walk - * back up the tree setting tags, until we update slot to - * point to another radix_tree_node. - */ - node = NULL; - -next: - /* Go to next item at level determined by 'shift' */ - index = ((index >> shift) + 1) << shift; + next: + /* Go to next entry in node */ + index = ((index >> node->shift) + 1) << node->shift; /* Overflow can happen when last_index is ~0UL... */ if (index > last_index || !index) break; - if (tagged >= nr_to_tag) - break; - while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) { + offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; + while (offset == 0) { /* * We've fully scanned this node. Go up. Because * last_index is guaranteed to be in the tree, what * we do below cannot wander astray. */ - slot = slot->parent; - shift += RADIX_TREE_MAP_SHIFT; + node = node->parent; + offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; } + if (is_sibling_entry(node, node->slots[offset])) + goto next; + if (tagged >= nr_to_tag) + break; } /* * We need not to tag the root tag if there is no tag which is set with @@ -1095,9 +1096,10 @@ EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); * * Like radix_tree_lookup, radix_tree_gang_lookup may be called under * rcu_read_lock. In this case, rather than the returned results being - * an atomic snapshot of the tree at a single point in time, the semantics - * of an RCU protected gang lookup are as though multiple radix_tree_lookups - * have been issued in individual locks, and results stored in 'results'. + * an atomic snapshot of the tree at a single point in time, the + * semantics of an RCU protected gang lookup are as though multiple + * radix_tree_lookups have been issued in individual locks, and results + * stored in 'results'. */ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, @@ -1114,7 +1116,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1197,7 +1199,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1247,58 +1249,48 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); #if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) #include <linux/sched.h> /* for cond_resched() */ +struct locate_info { + unsigned long found_index; + bool stop; +}; + /* * This linear search is at present only useful to shmem_unuse_inode(). */ static unsigned long __locate(struct radix_tree_node *slot, void *item, - unsigned long index, unsigned long *found_index) + unsigned long index, struct locate_info *info) { - unsigned int shift, height; unsigned long i; - height = slot->path & RADIX_TREE_HEIGHT_MASK; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - for ( ; height > 1; height--) { - i = (index >> shift) & RADIX_TREE_MAP_MASK; - for (;;) { - if (slot->slots[i] != NULL) - break; - index &= ~((1UL << shift) - 1); - index += 1UL << shift; - if (index == 0) - goto out; /* 32-bit wraparound */ - i++; - if (i == RADIX_TREE_MAP_SIZE) + do { + unsigned int shift = slot->shift; + + for (i = (index >> shift) & RADIX_TREE_MAP_MASK; + i < RADIX_TREE_MAP_SIZE; + i++, index += (1UL << shift)) { + struct radix_tree_node *node = + rcu_dereference_raw(slot->slots[i]); + if (node == RADIX_TREE_RETRY) goto out; - } - - slot = rcu_dereference_raw(slot->slots[i]); - if (slot == NULL) - goto out; - if (!radix_tree_is_indirect_ptr(slot)) { - if (slot == item) { - *found_index = index + i; - index = 0; - } else { - index += shift; + if (!radix_tree_is_internal_node(node)) { + if (node == item) { + info->found_index = index; + info->stop = true; + goto out; + } + continue; } - goto out; + node = entry_to_node(node); + if (is_sibling_entry(slot, node)) + continue; + slot = node; + break; } - slot = indirect_to_ptr(slot); - shift -= RADIX_TREE_MAP_SHIFT; - } + } while (i < RADIX_TREE_MAP_SIZE); - /* Bottom level: check items */ - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (slot->slots[i] == item) { - *found_index = index + i; - index = 0; - goto out; - } - } - index += RADIX_TREE_MAP_SIZE; out: + if ((index == 0) && (i == RADIX_TREE_MAP_SIZE)) + info->stop = true; return index; } @@ -1316,32 +1308,35 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) struct radix_tree_node *node; unsigned long max_index; unsigned long cur_index = 0; - unsigned long found_index = -1; + struct locate_info info = { + .found_index = -1, + .stop = false, + }; do { rcu_read_lock(); node = rcu_dereference_raw(root->rnode); - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { rcu_read_unlock(); if (node == item) - found_index = 0; + info.found_index = 0; break; } - node = indirect_to_ptr(node); - max_index = radix_tree_maxindex(node->path & - RADIX_TREE_HEIGHT_MASK); + node = entry_to_node(node); + + max_index = node_maxindex(node); if (cur_index > max_index) { rcu_read_unlock(); break; } - cur_index = __locate(node, item, cur_index, &found_index); + cur_index = __locate(node, item, cur_index, &info); rcu_read_unlock(); cond_resched(); - } while (cur_index != 0 && cur_index <= max_index); + } while (!info.stop && cur_index <= max_index); - return found_index; + return info.found_index; } #else unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) @@ -1351,47 +1346,45 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) #endif /* CONFIG_SHMEM && CONFIG_SWAP */ /** - * radix_tree_shrink - shrink height of a radix tree to minimal + * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline void radix_tree_shrink(struct radix_tree_root *root) +static inline bool radix_tree_shrink(struct radix_tree_root *root) { - /* try to shrink tree height */ - while (root->height > 0) { - struct radix_tree_node *to_free = root->rnode; - struct radix_tree_node *slot; + bool shrunk = false; - BUG_ON(!radix_tree_is_indirect_ptr(to_free)); - to_free = indirect_to_ptr(to_free); + for (;;) { + struct radix_tree_node *node = root->rnode; + struct radix_tree_node *child; + + if (!radix_tree_is_internal_node(node)) + break; + node = entry_to_node(node); /* * The candidate node has more than one child, or its child - * is not at the leftmost slot, or it is a multiorder entry, - * we cannot shrink. + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. */ - if (to_free->count != 1) + if (node->count != 1) + break; + child = node->slots[0]; + if (!child) break; - slot = to_free->slots[0]; - if (!slot) + if (!radix_tree_is_internal_node(child) && node->shift) break; + if (radix_tree_is_internal_node(child)) + entry_to_node(child)->parent = NULL; + /* * We don't need rcu_assign_pointer(), since we are simply * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it - * (to_free->slots[0]), it will be safe to dereference the new + * (node->slots[0]), it will be safe to dereference the new * one (root->rnode) as far as dependent read barriers go. */ - if (root->height > 1) { - if (!radix_tree_is_indirect_ptr(slot)) - break; - - slot = indirect_to_ptr(slot); - slot->parent = NULL; - slot = ptr_to_indirect(slot); - } - root->rnode = slot; - root->height--; + root->rnode = child; /* * We have a dilemma here. The node's slot[0] must not be @@ -1403,7 +1396,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) * their slot to become empty sooner or later. * * For example, lockless pagecache will look up a slot, deref - * the page pointer, and if the page is 0 refcount it means it + * the page pointer, and if the page has 0 refcount it means it * was concurrently deleted from pagecache so try the deref * again. Fortunately there is already a requirement for logic * to retry the entire slot lookup -- the indirect pointer @@ -1411,12 +1404,14 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ - if (root->height == 0) - *((unsigned long *)&to_free->slots[0]) |= - RADIX_TREE_INDIRECT_PTR; + if (!radix_tree_is_internal_node(child)) + node->slots[0] = RADIX_TREE_RETRY; - radix_tree_node_free(to_free); + radix_tree_node_free(node); + shrunk = true; } + + return shrunk; } /** @@ -1439,24 +1434,17 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *parent; if (node->count) { - if (node == indirect_to_ptr(root->rnode)) { - radix_tree_shrink(root); - if (root->height == 0) - deleted = true; - } + if (node == entry_to_node(root->rnode)) + deleted |= radix_tree_shrink(root); return deleted; } parent = node->parent; if (parent) { - unsigned int offset; - - offset = node->path >> RADIX_TREE_HEIGHT_SHIFT; - parent->slots[offset] = NULL; + parent->slots[node->offset] = NULL; parent->count--; } else { root_tag_clear_all(root); - root->height = 0; root->rnode = NULL; } @@ -1469,6 +1457,20 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, return deleted; } +static inline void delete_sibling_entries(struct radix_tree_node *node, + void *ptr, unsigned offset) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + int i; + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[offset + i] != ptr) + break; + node->slots[offset + i] = NULL; + node->count--; + } +#endif +} + /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root @@ -1484,7 +1486,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; - unsigned int offset, i; + unsigned int offset; void **slot; void *entry; int tag; @@ -1502,24 +1504,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root, return entry; } - offset = index & RADIX_TREE_MAP_MASK; + offset = get_slot_offset(node, slot); - /* - * Clear all tags associated with the item to be deleted. - * This way of doing it would be inefficient, but seldom is any set. - */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(node, tag, offset)) - radix_tree_tag_clear(root, index, tag); - } + /* Clear all tags associated with the item to be deleted. */ + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); - /* Delete any sibling slots pointing to this slot */ - for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[offset + i] != ptr_to_indirect(slot)) - break; - node->slots[offset + i] = NULL; - node->count--; - } + delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; node->count--; @@ -1544,6 +1535,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); +struct radix_tree_node *radix_tree_replace_clear_tags( + struct radix_tree_root *root, + unsigned long index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + __radix_tree_lookup(root, index, &node, &slot); + + if (node) { + unsigned int tag, offset = get_slot_offset(node, slot); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); + } else { + /* Clear root node tags */ + root->gfp_mask &= __GFP_BITS_MASK; + } + + radix_tree_replace_slot(slot, entry); + return node; +} + /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root @@ -1564,45 +1577,24 @@ radix_tree_node_ctor(void *arg) INIT_LIST_HEAD(&node->private_list); } -static __init unsigned long __maxindex(unsigned int height) -{ - unsigned int width = height * RADIX_TREE_MAP_SHIFT; - int shift = RADIX_TREE_INDEX_BITS - width; - - if (shift < 0) - return ~0UL; - if (shift >= BITS_PER_LONG) - return 0UL; - return ~0UL >> shift; -} - -static __init void radix_tree_init_maxindex(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) - height_to_maxindex[i] = __maxindex(i); -} - static int radix_tree_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, void *hcpu) { - int cpu = (long)hcpu; - struct radix_tree_preload *rtp; - struct radix_tree_node *node; - - /* Free per-cpu pool of perloaded nodes */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - rtp = &per_cpu(radix_tree_preloads, cpu); - while (rtp->nr) { + int cpu = (long)hcpu; + struct radix_tree_preload *rtp; + struct radix_tree_node *node; + + /* Free per-cpu pool of preloaded nodes */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + rtp = &per_cpu(radix_tree_preloads, cpu); + while (rtp->nr) { node = rtp->nodes; rtp->nodes = node->private_data; kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; - } - } - return NOTIFY_OK; + } + } + return NOTIFY_OK; } void __init radix_tree_init(void) @@ -1611,6 +1603,5 @@ void __init radix_tree_init(void) sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); - radix_tree_init_maxindex(); hotcpu_notifier(radix_tree_callback, 0); } diff --git a/lib/rhashtable.c b/lib/rhashtable.c index cc808707d..5d845ffd7 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -487,6 +487,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow); * rhashtable_walk_init - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator + * @gfp: GFP flags for allocations * * This function prepares a hash table walk. * @@ -504,14 +505,15 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow); * You must call rhashtable_walk_exit if this function returns * successfully. */ -int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) +int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, + gfp_t gfp) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; - iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL); + iter->walker = kmalloc(sizeof(*iter->walker), gfp); if (!iter->walker) return -ENOMEM; diff --git a/lib/sg_pool.c b/lib/sg_pool.c new file mode 100644 index 000000000..6dd30615a --- /dev/null +++ b/lib/sg_pool.c @@ -0,0 +1,172 @@ +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <linux/mempool.h> +#include <linux/slab.h> + +#define SG_MEMPOOL_NR ARRAY_SIZE(sg_pools) +#define SG_MEMPOOL_SIZE 2 + +struct sg_pool { + size_t size; + char *name; + struct kmem_cache *slab; + mempool_t *pool; +}; + +#define SP(x) { .size = x, "sgpool-" __stringify(x) } +#if (SG_CHUNK_SIZE < 32) +#error SG_CHUNK_SIZE is too small (must be 32 or greater) +#endif +static struct sg_pool sg_pools[] = { + SP(8), + SP(16), +#if (SG_CHUNK_SIZE > 32) + SP(32), +#if (SG_CHUNK_SIZE > 64) + SP(64), +#if (SG_CHUNK_SIZE > 128) + SP(128), +#if (SG_CHUNK_SIZE > 256) +#error SG_CHUNK_SIZE is too large (256 MAX) +#endif +#endif +#endif +#endif + SP(SG_CHUNK_SIZE) +}; +#undef SP + +static inline unsigned int sg_pool_index(unsigned short nents) +{ + unsigned int index; + + BUG_ON(nents > SG_CHUNK_SIZE); + + if (nents <= 8) + index = 0; + else + index = get_count_order(nents) - 3; + + return index; +} + +static void sg_pool_free(struct scatterlist *sgl, unsigned int nents) +{ + struct sg_pool *sgp; + + sgp = sg_pools + sg_pool_index(nents); + mempool_free(sgl, sgp->pool); +} + +static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask) +{ + struct sg_pool *sgp; + + sgp = sg_pools + sg_pool_index(nents); + return mempool_alloc(sgp->pool, gfp_mask); +} + +/** + * sg_free_table_chained - Free a previously mapped sg table + * @table: The sg table header to use + * @first_chunk: was first_chunk not NULL in sg_alloc_table_chained? + * + * Description: + * Free an sg table previously allocated and setup with + * sg_alloc_table_chained(). + * + **/ +void sg_free_table_chained(struct sg_table *table, bool first_chunk) +{ + if (first_chunk && table->orig_nents <= SG_CHUNK_SIZE) + return; + __sg_free_table(table, SG_CHUNK_SIZE, first_chunk, sg_pool_free); +} +EXPORT_SYMBOL_GPL(sg_free_table_chained); + +/** + * sg_alloc_table_chained - Allocate and chain SGLs in an sg table + * @table: The sg table header to use + * @nents: Number of entries in sg list + * @first_chunk: first SGL + * + * Description: + * Allocate and chain SGLs in an sg table. If @nents@ is larger than + * SG_CHUNK_SIZE a chained sg table will be setup. + * + **/ +int sg_alloc_table_chained(struct sg_table *table, int nents, + struct scatterlist *first_chunk) +{ + int ret; + + BUG_ON(!nents); + + if (first_chunk) { + if (nents <= SG_CHUNK_SIZE) { + table->nents = table->orig_nents = nents; + sg_init_table(table->sgl, nents); + return 0; + } + } + + ret = __sg_alloc_table(table, nents, SG_CHUNK_SIZE, + first_chunk, GFP_ATOMIC, sg_pool_alloc); + if (unlikely(ret)) + sg_free_table_chained(table, (bool)first_chunk); + return ret; +} +EXPORT_SYMBOL_GPL(sg_alloc_table_chained); + +static __init int sg_pool_init(void) +{ + int i; + + for (i = 0; i < SG_MEMPOOL_NR; i++) { + struct sg_pool *sgp = sg_pools + i; + int size = sgp->size * sizeof(struct scatterlist); + + sgp->slab = kmem_cache_create(sgp->name, size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!sgp->slab) { + printk(KERN_ERR "SG_POOL: can't init sg slab %s\n", + sgp->name); + goto cleanup_sdb; + } + + sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, + sgp->slab); + if (!sgp->pool) { + printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n", + sgp->name); + goto cleanup_sdb; + } + } + + return 0; + +cleanup_sdb: + for (i = 0; i < SG_MEMPOOL_NR; i++) { + struct sg_pool *sgp = sg_pools + i; + if (sgp->pool) + mempool_destroy(sgp->pool); + if (sgp->slab) + kmem_cache_destroy(sgp->slab); + } + + return -ENOMEM; +} + +static __exit void sg_pool_exit(void) +{ + int i; + + for (i = 0; i < SG_MEMPOOL_NR; i++) { + struct sg_pool *sgp = sg_pools + i; + mempool_destroy(sgp->pool); + kmem_cache_destroy(sgp->slab); + } +} + +module_init(sg_pool_init); +module_exit(sg_pool_exit); diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 5c88204b6..ecaac2c05 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -10,6 +10,10 @@ #include <linux/export.h> #include <linux/ctype.h> #include <linux/errno.h> +#include <linux/fs.h> +#include <linux/limits.h> +#include <linux/mm.h> +#include <linux/slab.h> #include <linux/string.h> #include <linux/string_helpers.h> @@ -534,3 +538,91 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, return p - dst; } EXPORT_SYMBOL(string_escape_mem); + +/* + * Return an allocated string that has been escaped of special characters + * and double quotes, making it safe to log in quotes. + */ +char *kstrdup_quotable(const char *src, gfp_t gfp) +{ + size_t slen, dlen; + char *dst; + const int flags = ESCAPE_HEX; + const char esc[] = "\f\n\r\t\v\a\e\\\""; + + if (!src) + return NULL; + slen = strlen(src); + + dlen = string_escape_mem(src, slen, NULL, 0, flags, esc); + dst = kmalloc(dlen + 1, gfp); + if (!dst) + return NULL; + + WARN_ON(string_escape_mem(src, slen, dst, dlen, flags, esc) != dlen); + dst[dlen] = '\0'; + + return dst; +} +EXPORT_SYMBOL_GPL(kstrdup_quotable); + +/* + * Returns allocated NULL-terminated string containing process + * command line, with inter-argument NULLs replaced with spaces, + * and other special characters escaped. + */ +char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp) +{ + char *buffer, *quoted; + int i, res; + + buffer = kmalloc(PAGE_SIZE, GFP_TEMPORARY); + if (!buffer) + return NULL; + + res = get_cmdline(task, buffer, PAGE_SIZE - 1); + buffer[res] = '\0'; + + /* Collapse trailing NULLs, leave res pointing to last non-NULL. */ + while (--res >= 0 && buffer[res] == '\0') + ; + + /* Replace inter-argument NULLs. */ + for (i = 0; i <= res; i++) + if (buffer[i] == '\0') + buffer[i] = ' '; + + /* Make sure result is printable. */ + quoted = kstrdup_quotable(buffer, gfp); + kfree(buffer); + return quoted; +} +EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline); + +/* + * Returns allocated NULL-terminated string containing pathname, + * with special characters escaped, able to be safely logged. If + * there is an error, the leading character will be "<". + */ +char *kstrdup_quotable_file(struct file *file, gfp_t gfp) +{ + char *temp, *pathname; + + if (!file) + return kstrdup("<unknown>", gfp); + + /* We add 11 spaces for ' (deleted)' to be appended */ + temp = kmalloc(PATH_MAX + 11, GFP_TEMPORARY); + if (!temp) + return kstrdup("<no_memory>", gfp); + + pathname = file_path(file, temp, PATH_MAX + 11); + if (IS_ERR(pathname)) + pathname = kstrdup("<too_long>", gfp); + else + pathname = kstrdup_quotable(pathname, gfp); + + kfree(temp); + return pathname; +} +EXPORT_SYMBOL_GPL(kstrdup_quotable_file); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 338403241..33f655ef4 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,5 +1,6 @@ #include <linux/compiler.h> #include <linux/export.h> +#include <linux/kasan-checks.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -109,6 +110,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) unsigned long max = max_addr - src_addr; long retval; + kasan_check_write(dst, count); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 8f22fbedc..93f45011a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -5621,7 +5621,10 @@ static struct bpf_prog *generate_filter(int which, int *err) fp->type = BPF_PROG_TYPE_SOCKET_FILTER; memcpy(fp->insnsi, fptr, fp->len * sizeof(struct bpf_insn)); - bpf_prog_select_runtime(fp); + /* We cannot error here as we don't need type compatibility + * checks. + */ + fp = bpf_prog_select_runtime(fp, err); break; } diff --git a/lib/test_hash.c b/lib/test_hash.c new file mode 100644 index 000000000..c9549c8b4 --- /dev/null +++ b/lib/test_hash.c @@ -0,0 +1,250 @@ +/* + * Test cases for <linux/hash.h> and <linux/stringhash.h> + * This just verifies that various ways of computing a hash + * produce the same thing and, for cases where a k-bit hash + * value is requested, is of the requested size. + * + * We fill a buffer with a 255-byte null-terminated string, + * and use both full_name_hash() and hashlen_string() to hash the + * substrings from i to j, where 0 <= i < j < 256. + * + * The returned values are used to check that __hash_32() and + * __hash_32_generic() compute the same thing. Likewise hash_32() + * and hash_64(). + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt "\n" + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/hash.h> +#include <linux/stringhash.h> +#include <linux/printk.h> + +/* 32-bit XORSHIFT generator. Seed must not be zero. */ +static u32 __init __attribute_const__ +xorshift(u32 seed) +{ + seed ^= seed << 13; + seed ^= seed >> 17; + seed ^= seed << 5; + return seed; +} + +/* Given a non-zero x, returns a non-zero byte. */ +static u8 __init __attribute_const__ +mod255(u32 x) +{ + x = (x & 0xffff) + (x >> 16); /* 1 <= x <= 0x1fffe */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x2fd */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x100 */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0xff */ + return x; +} + +/* Fill the buffer with non-zero bytes. */ +static void __init +fill_buf(char *buf, size_t len, u32 seed) +{ + size_t i; + + for (i = 0; i < len; i++) { + seed = xorshift(seed); + buf[i] = mod255(seed); + } +} + +/* + * Test the various integer hash functions. h64 (or its low-order bits) + * is the integer to hash. hash_or accumulates the OR of the hash values, + * which are later checked to see that they cover all the requested bits. + * + * Because these functions (as opposed to the string hashes) are all + * inline, the code being tested is actually in the module, and you can + * recompile and re-test the module without rebooting. + */ +static bool __init +test_int_hash(unsigned long long h64, u32 hash_or[2][33]) +{ + int k; + u32 h0 = (u32)h64, h1, h2; + + /* Test __hash32 */ + hash_or[0][0] |= h1 = __hash_32(h0); +#ifdef HAVE_ARCH__HASH_32 + hash_or[1][0] |= h2 = __hash_32_generic(h0); +#if HAVE_ARCH__HASH_32 == 1 + if (h1 != h2) { + pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x", + h0, h1, h2); + return false; + } +#endif +#endif + + /* Test k = 1..32 bits */ + for (k = 1; k <= 32; k++) { + u32 const m = ((u32)2 << (k-1)) - 1; /* Low k bits set */ + + /* Test hash_32 */ + hash_or[0][k] |= h1 = hash_32(h0, k); + if (h1 > m) { + pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m); + return false; + } +#ifdef HAVE_ARCH_HASH_32 + h2 = hash_32_generic(h0, k); +#if HAVE_ARCH_HASH_32 == 1 + if (h1 != h2) { + pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() " + " = %#x", h0, k, h1, h2); + return false; + } +#else + if (h2 > m) { + pr_err("hash_32_generic(%#x, %d) = %#x > %#x", + h0, k, h1, m); + return false; + } +#endif +#endif + /* Test hash_64 */ + hash_or[1][k] |= h1 = hash_64(h64, k); + if (h1 > m) { + pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, h1, m); + return false; + } +#ifdef HAVE_ARCH_HASH_64 + h2 = hash_64_generic(h64, k); +#if HAVE_ARCH_HASH_64 == 1 + if (h1 != h2) { + pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() " + "= %#x", h64, k, h1, h2); + return false; + } +#else + if (h2 > m) { + pr_err("hash_64_generic(%#llx, %d) = %#x > %#x", + h64, k, h1, m); + return false; + } +#endif +#endif + } + + (void)h2; /* Suppress unused variable warning */ + return true; +} + +#define SIZE 256 /* Run time is cubic in SIZE */ + +static int __init +test_hash_init(void) +{ + char buf[SIZE+1]; + u32 string_or = 0, hash_or[2][33] = { 0 }; + unsigned tests = 0; + unsigned long long h64 = 0; + int i, j; + + fill_buf(buf, SIZE, 1); + + /* Test every possible non-empty substring in the buffer. */ + for (j = SIZE; j > 0; --j) { + buf[j] = '\0'; + + for (i = 0; i <= j; i++) { + u64 hashlen = hashlen_string(buf+i); + u32 h0 = full_name_hash(buf+i, j-i); + + /* Check that hashlen_string gets the length right */ + if (hashlen_len(hashlen) != j-i) { + pr_err("hashlen_string(%d..%d) returned length" + " %u, expected %d", + i, j, hashlen_len(hashlen), j-i); + return -EINVAL; + } + /* Check that the hashes match */ + if (hashlen_hash(hashlen) != h0) { + pr_err("hashlen_string(%d..%d) = %08x != " + "full_name_hash() = %08x", + i, j, hashlen_hash(hashlen), h0); + return -EINVAL; + } + + string_or |= h0; + h64 = h64 << 32 | h0; /* For use with hash_64 */ + if (!test_int_hash(h64, hash_or)) + return -EINVAL; + tests++; + } /* i */ + } /* j */ + + /* The OR of all the hash values should cover all the bits */ + if (~string_or) { + pr_err("OR of all string hash results = %#x != %#x", + string_or, -1u); + return -EINVAL; + } + if (~hash_or[0][0]) { + pr_err("OR of all __hash_32 results = %#x != %#x", + hash_or[0][0], -1u); + return -EINVAL; + } +#ifdef HAVE_ARCH__HASH_32 +#if HAVE_ARCH__HASH_32 != 1 /* Test is pointless if results match */ + if (~hash_or[1][0]) { + pr_err("OR of all __hash_32_generic results = %#x != %#x", + hash_or[1][0], -1u); + return -EINVAL; + } +#endif +#endif + + /* Likewise for all the i-bit hash values */ + for (i = 1; i <= 32; i++) { + u32 const m = ((u32)2 << (i-1)) - 1; /* Low i bits set */ + + if (hash_or[0][i] != m) { + pr_err("OR of all hash_32(%d) results = %#x " + "(%#x expected)", i, hash_or[0][i], m); + return -EINVAL; + } + if (hash_or[1][i] != m) { + pr_err("OR of all hash_64(%d) results = %#x " + "(%#x expected)", i, hash_or[1][i], m); + return -EINVAL; + } + } + + /* Issue notices about skipped tests. */ +#ifndef HAVE_ARCH__HASH_32 + pr_info("__hash_32() has no arch implementation to test."); +#elif HAVE_ARCH__HASH_32 != 1 + pr_info("__hash_32() is arch-specific; not compared to generic."); +#endif +#ifndef HAVE_ARCH_HASH_32 + pr_info("hash_32() has no arch implementation to test."); +#elif HAVE_ARCH_HASH_32 != 1 + pr_info("hash_32() is arch-specific; not compared to generic."); +#endif +#ifndef HAVE_ARCH_HASH_64 + pr_info("hash_64() has no arch implementation to test."); +#elif HAVE_ARCH_HASH_64 != 1 + pr_info("hash_64() is arch-specific; not compared to generic."); +#endif + + pr_notice("%u tests passed.", tests); + + return 0; +} + +static void __exit test_hash_exit(void) +{ +} + +module_init(test_hash_init); /* Does everything */ +module_exit(test_hash_exit); /* Does nothing */ + +MODULE_LICENSE("GPL"); diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 82169fbf2..5e51872b3 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -12,9 +12,12 @@ #define pr_fmt(fmt) "kasan test: %s " fmt, __func__ #include <linux/kernel.h> +#include <linux/mman.h> +#include <linux/mm.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/uaccess.h> #include <linux/module.h> static noinline void __init kmalloc_oob_right(void) @@ -344,6 +347,70 @@ static noinline void __init kasan_stack_oob(void) *(volatile char *)p; } +static noinline void __init ksize_unpoisons_memory(void) +{ + char *ptr; + size_t size = 123, real_size = size; + + pr_info("ksize() unpoisons the whole allocated chunk\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + real_size = ksize(ptr); + /* This access doesn't trigger an error. */ + ptr[size] = 'x'; + /* This one does. */ + ptr[real_size] = 'y'; + kfree(ptr); +} + +static noinline void __init copy_user_test(void) +{ + char *kmem; + char __user *usermem; + size_t size = 10; + int unused; + + kmem = kmalloc(size, GFP_KERNEL); + if (!kmem) + return; + + usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE, 0); + if (IS_ERR(usermem)) { + pr_err("Failed to allocate user memory\n"); + kfree(kmem); + return; + } + + pr_info("out-of-bounds in copy_from_user()\n"); + unused = copy_from_user(kmem, usermem, size + 1); + + pr_info("out-of-bounds in copy_to_user()\n"); + unused = copy_to_user(usermem, kmem, size + 1); + + pr_info("out-of-bounds in __copy_from_user()\n"); + unused = __copy_from_user(kmem, usermem, size + 1); + + pr_info("out-of-bounds in __copy_to_user()\n"); + unused = __copy_to_user(usermem, kmem, size + 1); + + pr_info("out-of-bounds in __copy_from_user_inatomic()\n"); + unused = __copy_from_user_inatomic(kmem, usermem, size + 1); + + pr_info("out-of-bounds in __copy_to_user_inatomic()\n"); + unused = __copy_to_user_inatomic(usermem, kmem, size + 1); + + pr_info("out-of-bounds in strncpy_from_user()\n"); + unused = strncpy_from_user(kmem, usermem, size + 1); + + vm_munmap((unsigned long)usermem, PAGE_SIZE); + kfree(kmem); +} + static int __init kmalloc_tests_init(void) { kmalloc_oob_right(); @@ -367,6 +434,8 @@ static int __init kmalloc_tests_init(void) kmem_cache_oob(); kasan_stack_oob(); kasan_global_oob(); + ksize_unpoisons_memory(); + copy_user_test(); return -EAGAIN; } diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 270bf7289..297fdb5e7 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -143,7 +143,7 @@ static void test_bucket_stats(struct rhashtable *ht) struct rhashtable_iter hti; struct rhash_head *pos; - err = rhashtable_walk_init(ht, &hti); + err = rhashtable_walk_init(ht, &hti, GFP_KERNEL); if (err) { pr_warn("Test failed: allocation error"); return; diff --git a/lib/test_uuid.c b/lib/test_uuid.c new file mode 100644 index 000000000..547d3127a --- /dev/null +++ b/lib/test_uuid.c @@ -0,0 +1,133 @@ +/* + * Test cases for lib/uuid.c module. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/uuid.h> + +struct test_uuid_data { + const char *uuid; + uuid_le le; + uuid_be be; +}; + +static const struct test_uuid_data test_uuid_test_data[] = { + { + .uuid = "c33f4995-3701-450e-9fbf-206a2e98e576", + .le = UUID_LE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), + .be = UUID_BE(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), + }, + { + .uuid = "64b4371c-77c1-48f9-8221-29f054fc023b", + .le = UUID_LE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), + .be = UUID_BE(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), + }, + { + .uuid = "0cb4ddff-a545-4401-9d06-688af53e7f84", + .le = UUID_LE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), + .be = UUID_BE(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), + }, +}; + +static const char * const test_uuid_wrong_data[] = { + "c33f4995-3701-450e-9fbf206a2e98e576 ", /* no hyphen(s) */ + "64b4371c-77c1-48f9-8221-29f054XX023b", /* invalid character(s) */ + "0cb4ddff-a545-4401-9d06-688af53e", /* not enough data */ +}; + +static unsigned total_tests __initdata; +static unsigned failed_tests __initdata; + +static void __init test_uuid_failed(const char *prefix, bool wrong, bool be, + const char *data, const char *actual) +{ + pr_err("%s test #%u %s %s data: '%s'\n", + prefix, + total_tests, + wrong ? "passed on wrong" : "failed on", + be ? "BE" : "LE", + data); + if (actual && *actual) + pr_err("%s test #%u actual data: '%s'\n", + prefix, + total_tests, + actual); + failed_tests++; +} + +static void __init test_uuid_test(const struct test_uuid_data *data) +{ + uuid_le le; + uuid_be be; + char buf[48]; + + /* LE */ + total_tests++; + if (uuid_le_to_bin(data->uuid, &le)) + test_uuid_failed("conversion", false, false, data->uuid, NULL); + + total_tests++; + if (uuid_le_cmp(data->le, le)) { + sprintf(buf, "%pUl", &le); + test_uuid_failed("cmp", false, false, data->uuid, buf); + } + + /* BE */ + total_tests++; + if (uuid_be_to_bin(data->uuid, &be)) + test_uuid_failed("conversion", false, true, data->uuid, NULL); + + total_tests++; + if (uuid_be_cmp(data->be, be)) { + sprintf(buf, "%pUb", &be); + test_uuid_failed("cmp", false, true, data->uuid, buf); + } +} + +static void __init test_uuid_wrong(const char *data) +{ + uuid_le le; + uuid_be be; + + /* LE */ + total_tests++; + if (!uuid_le_to_bin(data, &le)) + test_uuid_failed("negative", true, false, data, NULL); + + /* BE */ + total_tests++; + if (!uuid_be_to_bin(data, &be)) + test_uuid_failed("negative", true, true, data, NULL); +} + +static int __init test_uuid_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(test_uuid_test_data); i++) + test_uuid_test(&test_uuid_test_data[i]); + + for (i = 0; i < ARRAY_SIZE(test_uuid_wrong_data); i++) + test_uuid_wrong(test_uuid_wrong_data[i]); + + if (failed_tests == 0) + pr_info("all %u tests passed\n", total_tests); + else + pr_err("failed %u out of %u tests\n", failed_tests, total_tests); + + return failed_tests ? -EINVAL : 0; +} +module_init(test_uuid_init); + +static void __exit test_uuid_exit(void) +{ + /* do nothing */ +} +module_exit(test_uuid_exit); + +MODULE_AUTHOR("Andy Shevchenko <andriy.shevchenko@linux.intel.com>"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/uuid.c b/lib/uuid.c index 398821e4d..37687af77 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -1,7 +1,7 @@ /* * Unified UUID/GUID definition * - * Copyright (C) 2009, Intel Corp. + * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying <ying.huang@intel.com> * * This program is free software; you can redistribute it and/or @@ -12,17 +12,40 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/errno.h> #include <linux/export.h> #include <linux/uuid.h> #include <linux/random.h> +const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_le_index); +const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_be_index); + +/*************************************************************** + * Random UUID interface + * + * Used here for a Boot ID, but can be useful for other kernel + * drivers. + ***************************************************************/ + +/* + * Generate random UUID + */ +void generate_random_uuid(unsigned char uuid[16]) +{ + get_random_bytes(uuid, 16); + /* Set UUID version to 4 --- truly random generation */ + uuid[6] = (uuid[6] & 0x0F) | 0x40; + /* Set the UUID variant to DCE */ + uuid[8] = (uuid[8] & 0x3F) | 0x80; +} +EXPORT_SYMBOL(generate_random_uuid); + static void __uuid_gen_common(__u8 b[16]) { prandom_bytes(b, 16); @@ -45,3 +68,61 @@ void uuid_be_gen(uuid_be *bu) bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_be_gen); + +/** + * uuid_is_valid - checks if UUID string valid + * @uuid: UUID string to check + * + * Description: + * It checks if the UUID string is following the format: + * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + * where x is a hex digit. + * + * Return: true if input is valid UUID string. + */ +bool uuid_is_valid(const char *uuid) +{ + unsigned int i; + + for (i = 0; i < UUID_STRING_LEN; i++) { + if (i == 8 || i == 13 || i == 18 || i == 23) { + if (uuid[i] != '-') + return false; + } else if (!isxdigit(uuid[i])) { + return false; + } + } + + return true; +} +EXPORT_SYMBOL(uuid_is_valid); + +static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16]) +{ + static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; + unsigned int i; + + if (!uuid_is_valid(uuid)) + return -EINVAL; + + for (i = 0; i < 16; i++) { + int hi = hex_to_bin(uuid[si[i] + 0]); + int lo = hex_to_bin(uuid[si[i] + 1]); + + b[ei[i]] = (hi << 4) | lo; + } + + return 0; +} + +int uuid_le_to_bin(const char *uuid, uuid_le *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_le_index); +} +EXPORT_SYMBOL(uuid_le_to_bin); + +int uuid_be_to_bin(const char *uuid, uuid_be *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_be_index); +} +EXPORT_SYMBOL(uuid_be_to_bin); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index ccb664b54..0967771d8 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -30,6 +30,7 @@ #include <linux/ioport.h> #include <linux/dcache.h> #include <linux/cred.h> +#include <linux/uuid.h> #include <net/addrconf.h> #ifdef CONFIG_BLOCK #include <linux/blkdev.h> @@ -1304,19 +1305,17 @@ static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { - char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; + char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; - static const u8 be[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - static const u8 le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; - const u8 *index = be; + const u8 *index = uuid_be_index; bool uc = false; switch (*(++fmt)) { case 'L': uc = true; /* fall-through */ case 'l': - index = le; + index = uuid_le_index; break; case 'B': uc = true; @@ -1324,7 +1323,10 @@ char *uuid_string(char *buf, char *end, const u8 *addr, } for (i = 0; i < 16; i++) { - p = hex_byte_pack(p, addr[index[i]]); + if (uc) + p = hex_byte_pack_upper(p, addr[index[i]]); + else + p = hex_byte_pack(p, addr[index[i]]); switch (i) { case 3: case 5: @@ -1337,13 +1339,6 @@ char *uuid_string(char *buf, char *end, const u8 *addr, *p = 0; - if (uc) { - p = uuid; - do { - *p = toupper(*p); - } while (*(++p)); - } - return string(buf, end, uuid, spec); } diff --git a/lib/wbt.c b/lib/wbt.c new file mode 100644 index 000000000..cc5a24270 --- /dev/null +++ b/lib/wbt.c @@ -0,0 +1,569 @@ +/* + * buffered writeback throttling. losely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. + * - If the minimum latency in the above window exceeds some target, increment + * scaling step and scale down queue depth by a factor of 2x. The monitoring + * window is then shrunk to 100 / sqrt(scaling step + 1). + * - For any window where we don't have solid data on what the latencies + * look like, retain status quo. + * - If latencies look good, decrement scaling step. + * + * Copyright (C) 2016 Jens Axboe + * + * Things that (may) need changing: + * + * - Different scaling of background/normal/high priority writeback. + * We may have to violate guarantees for max. + * - We can have mismatches between the stat window and our window. + * + */ +#include <linux/kernel.h> +#include <linux/blk_types.h> +#include <linux/slab.h> +#include <linux/backing-dev.h> +#include <linux/wbt.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/wbt.h> + +enum { + /* + * Might need to be higher + */ + RWB_MAX_DEPTH = 64, + + /* + * 100msec window + */ + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, + + /* + * Disregard stats, if we don't meet these minimums + */ + RWB_MIN_WRITE_SAMPLES = 3, + RWB_MIN_READ_SAMPLES = 1, + + /* + * If we have this number of consecutive windows with not enough + * information to scale up or down, scale up. + */ + RWB_UNKNOWN_BUMP = 5, +}; + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb && rwb->wb_normal != 0; +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. + */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) +{ + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != *var) + *var = cur; + } +} + +void __wbt_done(struct rq_wb *rwb) +{ + int inflight, limit; + + inflight = atomic_dec_return(&rwb->inflight); + + /* + * wbt got disabled with IO in flight. Wake up any potential + * waiters, we don't have to do more than that. + */ + if (unlikely(!rwb_enabled(rwb))) { + wake_up_all(&rwb->wait); + return; + } + + /* + * If the device does write back caching, drop further down + * before we wake people up. + */ + if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping)) + limit = 0; + else + limit = rwb->wb_normal; + + /* + * Don't wake anyone up if we are above the normal limit. + */ + if (inflight && inflight >= limit) + return; + + if (waitqueue_active(&rwb->wait)) { + int diff = limit - inflight; + + if (!inflight || diff >= rwb->wb_background / 2) + wake_up_nr(&rwb->wait, 1); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat) +{ + if (!rwb) + return; + + if (!wbt_tracked(stat)) { + if (rwb->sync_cookie == stat) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } + + wb_timestamp(rwb, &rwb->last_comp); + } else { + WARN_ON_ONCE(stat == rwb->sync_cookie); + __wbt_done(rwb); + wbt_clear_tracked(stat); + } +} + +static void calc_wb_limits(struct rq_wb *rwb) +{ + unsigned int depth; + + if (!rwb->min_lat_nsec) { + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; + return; + } + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rwb->queue_depth == 1) { + if (rwb->scale_step) + rwb->wb_max = rwb->wb_normal = 1; + else + rwb->wb_max = rwb->wb_normal = 2; + rwb->wb_background = 1; + } else { + depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth); + + /* + * Set our max/normal/bg queue depths based on how far + * we have scaled down (->scale_step). + */ + rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step)); + rwb->wb_normal = (rwb->wb_max + 1) / 2; + rwb->wb_background = (rwb->wb_max + 3) / 4; + } +} + +static bool inline stat_sample_valid(struct blk_rq_stat *stat) +{ + /* + * We need at least one read sample, and a minimum of + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know + * that it's writes impacting us, and not just some sole read on + * a device that is in a lower power state. + */ + return stat[0].nr_samples >= 1 && + stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES; +} + +static u64 rwb_sync_issue_lat(struct rq_wb *rwb) +{ + u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + + if (!issue || !rwb->sync_cookie) + return 0; + + now = ktime_to_ns(ktime_get()); + return now - issue; +} + +enum { + LAT_OK, + LAT_UNKNOWN, + LAT_EXCEEDED, +}; + +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +{ + u64 thislat; + + /* + * If our stored sync issue exceeds the window size, or it + * exceeds our min target AND we haven't logged any entries, + * flag the latency as exceeded. wbt works off completion latencies, + * but for a flooded device, a single sync IO can take a long time + * to complete after being issued. If this time exceeds our + * monitoring window AND we didn't see any other completions in that + * window, then count that sync IO as a violation of the latency. + */ + thislat = rwb_sync_issue_lat(rwb); + if (thislat > rwb->cur_win_nsec || + (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) { + trace_wbt_lat(rwb->bdi, thislat); + return LAT_EXCEEDED; + } + + if (!stat_sample_valid(stat)) + return LAT_UNKNOWN; + + /* + * If the 'min' latency exceeds our target, step down. + */ + if (stat[0].min > rwb->min_lat_nsec) { + trace_wbt_lat(rwb->bdi, stat[0].min); + trace_wbt_stat(rwb->bdi, stat); + return LAT_EXCEEDED; + } + + if (rwb->scale_step) + trace_wbt_stat(rwb->bdi, stat); + + return LAT_OK; +} + +static int latency_exceeded(struct rq_wb *rwb) +{ + struct blk_rq_stat stat[2]; + + rwb->stat_ops->get(rwb->ops_data, stat); + return __latency_exceeded(rwb, stat); +} + +static void rwb_trace_step(struct rq_wb *rwb, const char *msg) +{ + trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rwb->wb_max); +} + +static void scale_up(struct rq_wb *rwb) +{ + /* + * If we're at 0, we can't go lower. + */ + if (!rwb->scale_step) + return; + + rwb->scale_step--; + rwb->unknown_cnt = 0; + rwb->stat_ops->clear(rwb->ops_data); + calc_wb_limits(rwb); + + if (waitqueue_active(&rwb->wait)) + wake_up_all(&rwb->wait); + + rwb_trace_step(rwb, "step up"); +} + +static void scale_down(struct rq_wb *rwb) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. + */ + if (rwb->wb_max == 1) + return; + + rwb->scale_step++; + rwb->unknown_cnt = 0; + rwb->stat_ops->clear(rwb->ops_data); + calc_wb_limits(rwb); + rwb_trace_step(rwb, "step down"); +} + +static void rwb_arm_timer(struct rq_wb *rwb) +{ + unsigned long expires; + + /* + * We should speed this up, using some variant of a fast integer + * inverse square root calculation. Since we only do this for + * every window expiration, it's not a huge deal, though. + */ + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, + int_sqrt((rwb->scale_step + 1) << 8)); + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); + mod_timer(&rwb->window_timer, expires); +} + +static void wb_timer_fn(unsigned long data) +{ + struct rq_wb *rwb = (struct rq_wb *) data; + int status; + + /* + * If we exceeded the latency target, step down. If we did not, + * step one level up. If we don't know enough to say either exceeded + * or ok, then don't do anything. + */ + status = latency_exceeded(rwb); + switch (status) { + case LAT_EXCEEDED: + scale_down(rwb); + break; + case LAT_OK: + scale_up(rwb); + break; + case LAT_UNKNOWN: + /* + * We had no read samples, start bumping up the write + * depth slowly + */ + if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP) + scale_up(rwb); + break; + default: + break; + } + + /* + * Re-arm timer, if we have IO in flight + */ + if (rwb->scale_step || atomic_read(&rwb->inflight)) + rwb_arm_timer(rwb); +} + +void wbt_update_limits(struct rq_wb *rwb) +{ + rwb->scale_step = 0; + calc_wb_limits(rwb); + + if (waitqueue_active(&rwb->wait)) + wake_up_all(&rwb->wait); +} + +static bool close_io(struct rq_wb *rwb) +{ + const unsigned long now = jiffies; + + return time_before(now, rwb->last_issue + HZ / 10) || + time_before(now, rwb->last_comp + HZ / 10); +} + +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +{ + unsigned int limit; + + /* + * At this point we know it's a buffered write. If REQ_SYNC is + * set, then it's WB_SYNC_ALL writeback, and we'll use the max + * limit for that. If the write is marked as a background write, + * then use the idle limit, or go to normal if we haven't had + * competing IO for a bit. + */ + if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping)) + limit = rwb->wb_max; + else if ((rw & REQ_BG) || close_io(rwb)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_background; + } else + limit = rwb->wb_normal; + + return limit; +} + +static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) +{ + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __wbt_wait(), + * and someone turned it off at the same time. + */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rwb->inflight); + return true; + } + + return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)); +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. + */ +static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) +{ + DEFINE_WAIT(wait); + + if (may_queue(rwb, rw)) + return; + + do { + prepare_to_wait_exclusive(&rwb->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (may_queue(rwb, rw)) + break; + + if (lock) + spin_unlock_irq(lock); + + io_schedule(); + + if (lock) + spin_lock_irq(lock); + } while (1); + + finish_wait(&rwb->wait, &wait); +} + +static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw) +{ + /* + * If not a WRITE (or a discard), do nothing + */ + if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD)) + return false; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC) + return false; + + return true; +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. + */ +bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock) +{ + if (!rwb_enabled(rwb)) + return false; + + if (!wbt_should_throttle(rwb, rw)) { + wb_timestamp(rwb, &rwb->last_issue); + return false; + } + + __wbt_wait(rwb, rw, lock); + + if (!timer_pending(&rwb->window_timer)) + rwb_arm_timer(rwb); + + return true; +} + +void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + + wbt_issue_stat_set_time(stat); + + /* + * Track sync issue, in case it takes a long time to complete. Allows + * us to react quicker, if a sync IO takes a long time to complete. + * Note that this is just a hint. 'stat' can go away when the + * request completes, so it's important we never dereference it. We + * only use the address to compare with, which is why we store the + * sync_issue time locally. + */ + if (!wbt_tracked(stat) && !rwb->sync_issue) { + rwb->sync_cookie = stat; + rwb->sync_issue = wbt_issue_stat_get_time(stat); + } +} + +void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + if (stat == rwb->sync_cookie) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } +} + +void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +{ + if (rwb) { + rwb->queue_depth = depth; + wbt_update_limits(rwb); + } +} + +void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) +{ + if (rwb) + rwb->wc = write_cache_on; +} + +void wbt_disable(struct rq_wb *rwb) +{ + del_timer_sync(&rwb->window_timer); + rwb->win_nsec = rwb->min_lat_nsec = 0; + wbt_update_limits(rwb); +} +EXPORT_SYMBOL_GPL(wbt_disable); + +struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops, + void *ops_data) +{ + struct rq_wb *rwb; + + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return ERR_PTR(-ENOMEM); + + atomic_set(&rwb->inflight, 0); + init_waitqueue_head(&rwb->wait); + setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); + rwb->wc = 1; + rwb->queue_depth = RWB_MAX_DEPTH; + rwb->last_comp = rwb->last_issue = jiffies; + rwb->bdi = bdi; + rwb->win_nsec = RWB_WINDOW_NSEC; + rwb->stat_ops = ops, + rwb->ops_data = ops_data; + wbt_update_limits(rwb); + return rwb; +} + +void wbt_exit(struct rq_wb *rwb) +{ + if (rwb) { + del_timer_sync(&rwb->window_timer); + kfree(rwb); + } +} |