From d0b2f91bede3bd5e3d24dd6803e56eee959c1797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Fabian=20Silva=20Delgado?= Date: Thu, 20 Oct 2016 00:10:27 -0300 Subject: Linux-libre 4.8.2-gnu --- lib/Kconfig.debug | 67 ++----- lib/Kconfig.kasan | 4 +- lib/Makefile | 10 +- lib/atomic64.c | 32 +++- lib/atomic64_test.c | 34 ++++ lib/bitmap.c | 2 +- lib/chacha20.c | 79 ++++++++ lib/crc32.c | 16 +- lib/digsig.c | 16 +- lib/dma-debug.c | 2 + lib/dma-noop.c | 9 +- lib/dynamic_debug.c | 7 + lib/earlycpio.c | 5 +- lib/hweight.c | 4 + lib/iommu-helper.c | 3 +- lib/iov_iter.c | 53 +++--- lib/mpi/mpicoder.c | 247 ++++++++----------------- lib/radix-tree.c | 92 +++++++++- lib/random32.c | 1 - lib/ratelimit.c | 10 +- lib/rbtree.c | 26 ++- lib/rhashtable.c | 20 +- lib/sradix-tree.c | 476 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/stackdepot.c | 1 + lib/strncpy_from_user.c | 8 +- lib/strnlen_user.c | 7 +- lib/swiotlb.c | 13 +- lib/test_hash.c | 30 +-- lib/test_rhashtable.c | 2 +- lib/ubsan.c | 2 +- lib/wbt.c | 288 +++++++++++++++++++++-------- 31 files changed, 1148 insertions(+), 418 deletions(-) create mode 100644 lib/chacha20.c create mode 100644 lib/sradix-tree.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 42d3d798c..896a3d0b1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -244,6 +244,7 @@ config PAGE_OWNER depends on DEBUG_KERNEL && STACKTRACE_SUPPORT select DEBUG_FS select STACKTRACE + select STACKDEPOT select PAGE_EXTENSION help This keeps track of what call chain is the owner of a page, may @@ -708,6 +709,8 @@ config KCOV bool "Code coverage for fuzzing" depends on ARCH_HAS_KCOV select DEBUG_FS + select GCC_PLUGINS if !COMPILE_TEST + select GCC_PLUGIN_SANCOV if !COMPILE_TEST help KCOV exposes kernel code coverage information in a form suitable for coverage-guided fuzzing (randomized testing). @@ -718,6 +721,17 @@ config KCOV For more details, see Documentation/kcov.txt. +config KCOV_INSTRUMENT_ALL + bool "Instrument all code by default" + depends on KCOV + default y if KCOV + help + If you are doing generic system call fuzzing (like e.g. syzkaller), + then you will want to instrument the whole kernel and you should + say y here. If you are doing more targeted fuzzing (like e.g. + filesystem fuzzing with AFL) then you will want to enable coverage + for more specific subsets of files, and should say n here. + config DEBUG_SHIRQ bool "Debug shared IRQ handlers" depends on DEBUG_KERNEL @@ -807,7 +821,7 @@ config DETECT_HUNG_TASK help Say Y here to enable the kernel to detect "hung tasks", which are bugs that cause the task to be stuck in - uninterruptible "D" state indefinitiley. + uninterruptible "D" state indefinitely. When a hung task is detected, the kernel will print the current stack trace (which you should report), but the @@ -1307,22 +1321,6 @@ config RCU_PERF_TEST Say M if you want the RCU performance tests to build as a module. Say N if you are unsure. -config RCU_PERF_TEST_RUNNABLE - bool "performance tests for RCU runnable by default" - depends on RCU_PERF_TEST = y - default n - help - This option provides a way to build the RCU performance tests - directly into the kernel without them starting up at boot time. - You can use /sys/module to manually override this setting. - This /proc file is available only when the RCU performance - tests have been built into the kernel. - - Say Y here if you want the RCU performance tests to start during - boot (you probably don't). - Say N here if you want the RCU performance tests to start only - after being manually enabled via /sys/module. - config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL && !SCHED_BFS @@ -1340,23 +1338,6 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. -config RCU_TORTURE_TEST_RUNNABLE - bool "torture tests for RCU runnable by default" - depends on RCU_TORTURE_TEST = y - default n - help - This option provides a way to build the RCU torture tests - directly into the kernel without them starting up at boot - time. You can use /proc/sys/kernel/rcutorture_runnable - to manually override this setting. This /proc file is - available only when the RCU torture tests have been built - into the kernel. - - Say Y here if you want the RCU torture tests to start during - boot (you probably don't). - Say N here if you want the RCU torture tests to start only - after being manually enabled via /proc. - config RCU_TORTURE_TEST_SLOW_PREINIT bool "Slow down RCU grace-period pre-initialization to expose races" depends on RCU_TORTURE_TEST @@ -1706,24 +1687,6 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. -config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - bool - -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict user copy size checks" - depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - help - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, say N. - source kernel/trace/Kconfig menu "Runtime Testing" diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 67d8c6838..bd38aab05 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -5,9 +5,9 @@ if HAVE_ARCH_KASAN config KASAN bool "KASan: runtime memory debugger" - depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) + depends on SLUB || (SLAB && !DEBUG_SLAB) select CONSTRUCTORS - select STACKDEPOT if SLAB + select STACKDEPOT help Enables kernel address sanitizer - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/lib/Makefile b/lib/Makefile index 5b5506e3b..5bd016bd3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -15,19 +15,15 @@ KCOV_INSTRUMENT_rbtree.o := n KCOV_INSTRUMENT_list_debug.o := n KCOV_INSTRUMENT_debugobjects.o := n KCOV_INSTRUMENT_dynamic_debug.o := n -# Kernel does not boot if we instrument this file as it uses custom calling -# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS). -KCOV_INSTRUMENT_hweight.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o timerqueue.o\ + rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ - sha1.o md5.o irq_regs.o argv_split.o \ + sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o -obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o lib-$(CONFIG_HAS_DMA) += dma-noop.o @@ -74,8 +70,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -GCOV_PROFILE_hweight.o := n -CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o diff --git a/lib/atomic64.c b/lib/atomic64.c index 2886ebac6..53c2d5edc 100644 --- a/lib/atomic64.c +++ b/lib/atomic64.c @@ -96,17 +96,41 @@ long long atomic64_##op##_return(long long a, atomic64_t *v) \ } \ EXPORT_SYMBOL(atomic64_##op##_return); +#define ATOMIC64_FETCH_OP(op, c_op) \ +long long atomic64_fetch_##op(long long a, atomic64_t *v) \ +{ \ + unsigned long flags; \ + raw_spinlock_t *lock = lock_addr(v); \ + long long val; \ + \ + raw_spin_lock_irqsave(lock, flags); \ + val = v->counter; \ + v->counter c_op a; \ + raw_spin_unlock_irqrestore(lock, flags); \ + return val; \ +} \ +EXPORT_SYMBOL(atomic64_fetch_##op); + #define ATOMIC64_OPS(op, c_op) \ ATOMIC64_OP(op, c_op) \ - ATOMIC64_OP_RETURN(op, c_op) + ATOMIC64_OP_RETURN(op, c_op) \ + ATOMIC64_FETCH_OP(op, c_op) ATOMIC64_OPS(add, +=) ATOMIC64_OPS(sub, -=) -ATOMIC64_OP(and, &=) -ATOMIC64_OP(or, |=) -ATOMIC64_OP(xor, ^=) #undef ATOMIC64_OPS +#define ATOMIC64_OPS(op, c_op) \ + ATOMIC64_OP(op, c_op) \ + ATOMIC64_OP_RETURN(op, c_op) \ + ATOMIC64_FETCH_OP(op, c_op) + +ATOMIC64_OPS(and, &=) +ATOMIC64_OPS(or, |=) +ATOMIC64_OPS(xor, ^=) + +#undef ATOMIC64_OPS +#undef ATOMIC64_FETCH_OP #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 123481814..dbb369145 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -53,11 +53,25 @@ do { \ BUG_ON(atomic##bit##_read(&v) != r); \ } while (0) +#define TEST_FETCH(bit, op, c_op, val) \ +do { \ + atomic##bit##_set(&v, v0); \ + r = v0; \ + r c_op val; \ + BUG_ON(atomic##bit##_##op(val, &v) != v0); \ + BUG_ON(atomic##bit##_read(&v) != r); \ +} while (0) + #define RETURN_FAMILY_TEST(bit, op, c_op, val) \ do { \ FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \ } while (0) +#define FETCH_FAMILY_TEST(bit, op, c_op, val) \ +do { \ + FAMILY_TEST(TEST_FETCH, bit, op, c_op, val); \ +} while (0) + #define TEST_ARGS(bit, op, init, ret, expect, args...) \ do { \ atomic##bit##_set(&v, init); \ @@ -114,6 +128,16 @@ static __init void test_atomic(void) RETURN_FAMILY_TEST(, sub_return, -=, onestwos); RETURN_FAMILY_TEST(, sub_return, -=, -one); + FETCH_FAMILY_TEST(, fetch_add, +=, onestwos); + FETCH_FAMILY_TEST(, fetch_add, +=, -one); + FETCH_FAMILY_TEST(, fetch_sub, -=, onestwos); + FETCH_FAMILY_TEST(, fetch_sub, -=, -one); + + FETCH_FAMILY_TEST(, fetch_or, |=, v1); + FETCH_FAMILY_TEST(, fetch_and, &=, v1); + FETCH_FAMILY_TEST(, fetch_andnot, &= ~, v1); + FETCH_FAMILY_TEST(, fetch_xor, ^=, v1); + INC_RETURN_FAMILY_TEST(, v0); DEC_RETURN_FAMILY_TEST(, v0); @@ -154,6 +178,16 @@ static __init void test_atomic64(void) RETURN_FAMILY_TEST(64, sub_return, -=, onestwos); RETURN_FAMILY_TEST(64, sub_return, -=, -one); + FETCH_FAMILY_TEST(64, fetch_add, +=, onestwos); + FETCH_FAMILY_TEST(64, fetch_add, +=, -one); + FETCH_FAMILY_TEST(64, fetch_sub, -=, onestwos); + FETCH_FAMILY_TEST(64, fetch_sub, -=, -one); + + FETCH_FAMILY_TEST(64, fetch_or, |=, v1); + FETCH_FAMILY_TEST(64, fetch_and, &=, v1); + FETCH_FAMILY_TEST(64, fetch_andnot, &= ~, v1); + FETCH_FAMILY_TEST(64, fetch_xor, ^=, v1); + INIT(v0); atomic64_inc(&v); r += one; diff --git a/lib/bitmap.c b/lib/bitmap.c index c66da508c..eca88087f 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -14,9 +14,9 @@ #include #include #include +#include #include -#include /* * bitmaps provide an array of bits, implemented using an an diff --git a/lib/chacha20.c b/lib/chacha20.c new file mode 100644 index 000000000..250ceed9e --- /dev/null +++ b/lib/chacha20.c @@ -0,0 +1,79 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539 + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +static inline u32 rotl32(u32 v, u8 n) +{ + return (v << n) | (v >> (sizeof(v) * 8 - n)); +} + +extern void chacha20_block(u32 *state, void *stream) +{ + u32 x[16], *out = stream; + int i; + + for (i = 0; i < ARRAY_SIZE(x); i++) + x[i] = state[i]; + + for (i = 0; i < 20; i += 2) { + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16); + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16); + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16); + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16); + + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12); + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12); + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12); + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12); + + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8); + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8); + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8); + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8); + + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7); + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7); + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7); + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7); + + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16); + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16); + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16); + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16); + + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12); + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12); + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12); + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12); + + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8); + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8); + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8); + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8); + + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7); + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7); + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7); + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7); + } + + for (i = 0; i < ARRAY_SIZE(x); i++) + out[i] = cpu_to_le32(x[i] + state[i]); + + state[12]++; +} +EXPORT_SYMBOL(chacha20_block); diff --git a/lib/crc32.c b/lib/crc32.c index 9a907d489..7fbd1a112 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -979,7 +979,6 @@ static int __init crc32c_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -999,20 +998,17 @@ static int __init crc32c_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc32c_le != __crc32c_le(test[i].crc, test_buf + test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32c: CRC_LE_BITS = %d\n", CRC_LE_BITS); if (errors) @@ -1065,7 +1061,6 @@ static int __init crc32_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -1088,7 +1083,7 @@ static int __init crc32_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc_le != crc32_le(test[i].crc, test_buf + test[i].start, test[i].length)) @@ -1098,14 +1093,11 @@ static int __init crc32_test(void) test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32: CRC_LE_BITS = %d, CRC_BE BITS = %d\n", CRC_LE_BITS, CRC_BE_BITS); diff --git a/lib/digsig.c b/lib/digsig.c index 07be6c1ef..55b8b2f41 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -104,21 +104,25 @@ static int digsig_verify_rsa(struct key *key, datap = pkh->mpi; endp = ukp->data + ukp->datalen; - err = -ENOMEM; - for (i = 0; i < pkh->nmpi; i++) { unsigned int remaining = endp - datap; pkey[i] = mpi_read_from_buffer(datap, &remaining); - if (!pkey[i]) + if (IS_ERR(pkey[i])) { + err = PTR_ERR(pkey[i]); goto err; + } datap += remaining; } mblen = mpi_get_nbits(pkey[0]); mlen = DIV_ROUND_UP(mblen, 8); - if (mlen == 0) + if (mlen == 0) { + err = -EINVAL; goto err; + } + + err = -ENOMEM; out1 = kzalloc(mlen, GFP_KERNEL); if (!out1) @@ -126,8 +130,10 @@ static int digsig_verify_rsa(struct key *key, nret = siglen; in = mpi_read_from_buffer(sig, &nret); - if (!in) + if (IS_ERR(in)) { + err = PTR_ERR(in); goto err; + } res = mpi_alloc(mpi_get_nlimbs(in) * 2); if (!res) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 51a76af25..fcfa1939a 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -253,6 +253,7 @@ static int hash_fn(struct dma_debug_entry *entry) */ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, unsigned long *flags) + __acquires(&dma_entry_hash[idx].lock) { int idx = hash_fn(entry); unsigned long __flags; @@ -267,6 +268,7 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, */ static void put_hash_bucket(struct hash_bucket *bucket, unsigned long *flags) + __releases(&bucket->lock) { unsigned long __flags = *flags; diff --git a/lib/dma-noop.c b/lib/dma-noop.c index 721456468..3d766e78f 100644 --- a/lib/dma-noop.c +++ b/lib/dma-noop.c @@ -10,7 +10,7 @@ static void *dma_noop_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, - struct dma_attrs *attrs) + unsigned long attrs) { void *ret; @@ -22,7 +22,7 @@ static void *dma_noop_alloc(struct device *dev, size_t size, static void dma_noop_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, - struct dma_attrs *attrs) + unsigned long attrs) { free_pages((unsigned long)cpu_addr, get_order(size)); } @@ -30,13 +30,14 @@ static void dma_noop_free(struct device *dev, size_t size, static dma_addr_t dma_noop_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { return page_to_phys(page) + offset; } static int dma_noop_map_sg(struct device *dev, struct scatterlist *sgl, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, + unsigned long attrs) { int i; struct scatterlist *sg; diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index fe42b6ec3..da796e2dc 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -188,6 +188,13 @@ static int ddebug_change(const struct ddebug_query *query, newflags = (dp->flags & mask) | flags; if (newflags == dp->flags) continue; +#ifdef HAVE_JUMP_LABEL + if (dp->flags & _DPRINTK_FLAGS_PRINT) { + if (!(flags & _DPRINTK_FLAGS_PRINT)) + static_branch_disable(&dp->key.dd_key_true); + } else if (flags & _DPRINTK_FLAGS_PRINT) + static_branch_enable(&dp->key.dd_key_true); +#endif dp->flags = newflags; vpr_info("changed %s:%d [%s]%s =%s\n", trim_prefix(dp->filename), dp->lineno, diff --git a/lib/earlycpio.c b/lib/earlycpio.c index 3eb3e4722..db283ba4d 100644 --- a/lib/earlycpio.c +++ b/lib/earlycpio.c @@ -125,7 +125,10 @@ struct cpio_data find_cpio_data(const char *path, void *data, if ((ch[C_MODE] & 0170000) == 0100000 && ch[C_NAMESIZE] >= mypathsize && !memcmp(p, path, mypathsize)) { - *nextoff = (long)nptr - (long)data; + + if (nextoff) + *nextoff = (long)nptr - (long)data; + if (ch[C_NAMESIZE] - mypathsize >= MAX_CPIO_FILE_NAME) { pr_warn( "File %s exceeding MAX_CPIO_FILE_NAME [%d]\n", diff --git a/lib/hweight.c b/lib/hweight.c index 9a5c1f221..43273a7d8 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,6 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. */ +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER @@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w) #endif } EXPORT_SYMBOL(__sw_hweight32); +#endif unsigned int __sw_hweight16(unsigned int w) { @@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w) } EXPORT_SYMBOL(__sw_hweight8); +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 @@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w) #endif } EXPORT_SYMBOL(__sw_hweight64); +#endif diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index c27e26921..a816f3a80 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -29,8 +29,7 @@ again: index = bitmap_find_next_zero_area(map, size, start, nr, align_mask); if (index < size) { if (iommu_is_span_boundary(index, nr, shift, boundary_size)) { - /* we could do more effectively */ - start = index + 1; + start = ALIGN(shift + index, boundary_size) - shift; goto again; } bitmap_set(map, index, nr); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index eaaf73032..7e3138cfc 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -56,37 +56,24 @@ n = wanted; \ } -#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ - size_t wanted = n; \ - __p = i->bvec; \ - __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ - if (likely(__v.bv_len)) { \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset + skip; \ - (void)(STEP); \ - skip += __v.bv_len; \ - n -= __v.bv_len; \ - } \ - while (unlikely(n)) { \ - __p++; \ - __v.bv_len = min_t(size_t, n, __p->bv_len); \ - if (unlikely(!__v.bv_len)) \ +#define iterate_bvec(i, n, __v, __bi, skip, STEP) { \ + struct bvec_iter __start; \ + __start.bi_size = n; \ + __start.bi_bvec_done = skip; \ + __start.bi_idx = 0; \ + for_each_bvec(__v, i->bvec, __bi, __start) { \ + if (!__v.bv_len) \ continue; \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset; \ (void)(STEP); \ - skip = __v.bv_len; \ - n -= __v.bv_len; \ } \ - n = wanted; \ } #define iterate_all_kinds(i, n, v, I, B, K) { \ size_t skip = i->iov_offset; \ if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ + struct bvec_iter __bi; \ + iterate_bvec(i, n, v, __bi, skip, (B)) \ } else if (unlikely(i->type & ITER_KVEC)) { \ const struct kvec *kvec; \ struct kvec v; \ @@ -104,15 +91,13 @@ if (i->count) { \ size_t skip = i->iov_offset; \ if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ + const struct bio_vec *bvec = i->bvec; \ struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - if (skip == bvec->bv_len) { \ - bvec++; \ - skip = 0; \ - } \ - i->nr_segs -= bvec - i->bvec; \ - i->bvec = bvec; \ + struct bvec_iter __bi; \ + iterate_bvec(i, n, v, __bi, skip, (B)) \ + i->bvec = __bvec_iter_bvec(i->bvec, __bi); \ + i->nr_segs -= i->bvec - bvec; \ + skip = __bi.bi_bvec_done; \ } else if (unlikely(i->type & ITER_KVEC)) { \ const struct kvec *kvec; \ struct kvec v; \ @@ -159,7 +144,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b buf = iov->iov_base + skip; copy = min(bytes, iov->iov_len - skip); - if (!fault_in_pages_writeable(buf, copy)) { + if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) { kaddr = kmap_atomic(page); from = kaddr + offset; @@ -190,6 +175,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b copy = min(bytes, iov->iov_len - skip); } /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); from = kaddr + offset; left = __copy_to_user(buf, from, copy); @@ -208,6 +194,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b bytes -= copy; } kunmap(page); + done: if (skip == iov->iov_len) { iov++; @@ -240,7 +227,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t buf = iov->iov_base + skip; copy = min(bytes, iov->iov_len - skip); - if (!fault_in_pages_readable(buf, copy)) { + if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) { kaddr = kmap_atomic(page); to = kaddr + offset; @@ -271,6 +258,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t copy = min(bytes, iov->iov_len - skip); } /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); to = kaddr + offset; left = __copy_from_user(to, buf, copy); @@ -289,6 +277,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes -= copy; } kunmap(page); + done: if (skip == iov->iov_len) { iov++; diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index 747606f9e..5a0f75a3b 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "mpi-internal.h" @@ -50,9 +51,7 @@ MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes) return NULL; } if (nbytes > 0) - nbits -= count_leading_zeros(buffer[0]); - else - nbits = 0; + nbits -= count_leading_zeros(buffer[0]) - (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); @@ -82,50 +81,30 @@ EXPORT_SYMBOL_GPL(mpi_read_raw_data); MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) { const uint8_t *buffer = xbuffer; - int i, j; - unsigned nbits, nbytes, nlimbs, nread = 0; - mpi_limb_t a; - MPI val = NULL; + unsigned int nbits, nbytes; + MPI val; if (*ret_nread < 2) - goto leave; + return ERR_PTR(-EINVAL); nbits = buffer[0] << 8 | buffer[1]; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); - goto leave; + return ERR_PTR(-EINVAL); } - buffer += 2; - nread = 2; nbytes = DIV_ROUND_UP(nbits, 8); - nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); - val = mpi_alloc(nlimbs); - if (!val) - return NULL; - i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; - i %= BYTES_PER_MPI_LIMB; - val->nbits = nbits; - j = val->nlimbs = nlimbs; - val->sign = 0; - for (; j > 0; j--) { - a = 0; - for (; i < BYTES_PER_MPI_LIMB; i++) { - if (++nread > *ret_nread) { - printk - ("MPI: mpi larger than buffer nread=%d ret_nread=%d\n", - nread, *ret_nread); - goto leave; - } - a <<= 8; - a |= *buffer++; - } - i = 0; - val->d[j - 1] = a; + if (nbytes + 2 > *ret_nread) { + pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n", + nbytes, *ret_nread); + return ERR_PTR(-EINVAL); } -leave: - *ret_nread = nread; + val = mpi_read_raw_data(buffer + 2, nbytes); + if (!val) + return ERR_PTR(-ENOMEM); + + *ret_nread = nbytes + 2; return val; } EXPORT_SYMBOL_GPL(mpi_read_from_buffer); @@ -250,82 +229,6 @@ void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign) } EXPORT_SYMBOL_GPL(mpi_get_buffer); -/**************** - * Use BUFFER to update MPI. - */ -int mpi_set_buffer(MPI a, const void *xbuffer, unsigned nbytes, int sign) -{ - const uint8_t *buffer = xbuffer, *p; - mpi_limb_t alimb; - int nlimbs; - int i; - - nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); - if (RESIZE_IF_NEEDED(a, nlimbs) < 0) - return -ENOMEM; - a->sign = sign; - - for (i = 0, p = buffer + nbytes - 1; p >= buffer + BYTES_PER_MPI_LIMB;) { -#if BYTES_PER_MPI_LIMB == 4 - alimb = (mpi_limb_t) *p--; - alimb |= (mpi_limb_t) *p-- << 8; - alimb |= (mpi_limb_t) *p-- << 16; - alimb |= (mpi_limb_t) *p-- << 24; -#elif BYTES_PER_MPI_LIMB == 8 - alimb = (mpi_limb_t) *p--; - alimb |= (mpi_limb_t) *p-- << 8; - alimb |= (mpi_limb_t) *p-- << 16; - alimb |= (mpi_limb_t) *p-- << 24; - alimb |= (mpi_limb_t) *p-- << 32; - alimb |= (mpi_limb_t) *p-- << 40; - alimb |= (mpi_limb_t) *p-- << 48; - alimb |= (mpi_limb_t) *p-- << 56; -#else -#error please implement for this limb size. -#endif - a->d[i++] = alimb; - } - if (p >= buffer) { -#if BYTES_PER_MPI_LIMB == 4 - alimb = *p--; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 8; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 16; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 24; -#elif BYTES_PER_MPI_LIMB == 8 - alimb = (mpi_limb_t) *p--; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 8; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 16; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 24; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 32; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 40; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 48; - if (p >= buffer) - alimb |= (mpi_limb_t) *p-- << 56; -#else -#error please implement for this limb size. -#endif - a->d[i++] = alimb; - } - a->nlimbs = i; - - if (i != nlimbs) { - pr_emerg("MPI: mpi_set_buffer: Assertion failed (%d != %d)", i, - nlimbs); - BUG(); - } - return 0; -} -EXPORT_SYMBOL_GPL(mpi_set_buffer); - /** * mpi_write_to_sgl() - Funnction exports MPI to an sgl (msb first) * @@ -335,16 +238,13 @@ EXPORT_SYMBOL_GPL(mpi_set_buffer); * @a: a multi precision integer * @sgl: scatterlist to write to. Needs to be at least * mpi_get_size(a) long. - * @nbytes: in/out param - it has the be set to the maximum number of - * bytes that can be written to sgl. This has to be at least - * the size of the integer a. On return it receives the actual - * length of the data written on success or the data that would - * be written if buffer was too small. + * @nbytes: the number of bytes to write. Leading bytes will be + * filled with zero. * @sign: if not NULL, it will be set to the sign of a. * * Return: 0 on success or error code in case of error */ -int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, +int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned nbytes, int *sign) { u8 *p, *p2; @@ -356,55 +256,60 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned *nbytes, #error please implement for this limb size. #endif unsigned int n = mpi_get_size(a); - int i, x, y = 0, lzeros, buf_len; - - if (!nbytes) - return -EINVAL; + struct sg_mapping_iter miter; + int i, x, buf_len; + int nents; if (sign) *sign = a->sign; - lzeros = count_lzeros(a); - - if (*nbytes < n - lzeros) { - *nbytes = n - lzeros; + if (nbytes < n) return -EOVERFLOW; - } - *nbytes = n - lzeros; - buf_len = sgl->length; - p2 = sg_virt(sgl); + nents = sg_nents_for_len(sgl, nbytes); + if (nents < 0) + return -EINVAL; - for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB, - lzeros %= BYTES_PER_MPI_LIMB; - i >= 0; i--) { + sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC | SG_MITER_TO_SG); + sg_miter_next(&miter); + buf_len = miter.length; + p2 = miter.addr; + + while (nbytes > n) { + i = min_t(unsigned, nbytes - n, buf_len); + memset(p2, 0, i); + p2 += i; + nbytes -= i; + + buf_len -= i; + if (!buf_len) { + sg_miter_next(&miter); + buf_len = miter.length; + p2 = miter.addr; + } + } + + for (i = a->nlimbs - 1; i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 - alimb = cpu_to_be32(a->d[i]); + alimb = a->d[i] ? cpu_to_be32(a->d[i]) : 0; #elif BYTES_PER_MPI_LIMB == 8 - alimb = cpu_to_be64(a->d[i]); + alimb = a->d[i] ? cpu_to_be64(a->d[i]) : 0; #else #error please implement for this limb size. #endif - if (lzeros) { - y = lzeros; - lzeros = 0; - } - - p = (u8 *)&alimb + y; + p = (u8 *)&alimb; - for (x = 0; x < sizeof(alimb) - y; x++) { - if (!buf_len) { - sgl = sg_next(sgl); - if (!sgl) - return -EINVAL; - buf_len = sgl->length; - p2 = sg_virt(sgl); - } + for (x = 0; x < sizeof(alimb); x++) { *p2++ = *p++; - buf_len--; + if (!--buf_len) { + sg_miter_next(&miter); + buf_len = miter.length; + p2 = miter.addr; + } } - y = 0; } + + sg_miter_stop(&miter); return 0; } EXPORT_SYMBOL_GPL(mpi_write_to_sgl); @@ -424,19 +329,23 @@ EXPORT_SYMBOL_GPL(mpi_write_to_sgl); */ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) { - struct scatterlist *sg; - int x, i, j, z, lzeros, ents; + struct sg_mapping_iter miter; unsigned int nbits, nlimbs; + int x, j, z, lzeros, ents; + unsigned int len; + const u8 *buff; mpi_limb_t a; MPI val = NULL; - lzeros = 0; - ents = sg_nents(sgl); + ents = sg_nents_for_len(sgl, nbytes); + if (ents < 0) + return NULL; - for_each_sg(sgl, sg, ents, i) { - const u8 *buff = sg_virt(sg); - int len = sg->length; + sg_miter_start(&miter, sgl, ents, SG_MITER_ATOMIC | SG_MITER_FROM_SG); + lzeros = 0; + len = 0; + while (nbytes > 0) { while (len && !*buff) { lzeros++; len--; @@ -446,12 +355,17 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) if (len && *buff) break; - ents--; + sg_miter_next(&miter); + buff = miter.addr; + len = miter.length; + nbytes -= lzeros; lzeros = 0; } - sgl = sg; + miter.consumed = lzeros; + sg_miter_stop(&miter); + nbytes -= lzeros; nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { @@ -460,8 +374,7 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) } if (nbytes > 0) - nbits -= count_leading_zeros(*(u8 *)(sg_virt(sgl) + lzeros)) - - (BITS_PER_LONG - 8); + nbits -= count_leading_zeros(*buff) - (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); @@ -480,21 +393,21 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; z %= BYTES_PER_MPI_LIMB; - for_each_sg(sgl, sg, ents, i) { - const u8 *buffer = sg_virt(sg) + lzeros; - int len = sg->length - lzeros; + while (sg_miter_next(&miter)) { + buff = miter.addr; + len = miter.length; for (x = 0; x < len; x++) { a <<= 8; - a |= *buffer++; + a |= *buff++; if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) { val->d[j--] = a; a = 0; } } z += x; - lzeros = 0; } + return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_from_sgl); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index bc7852f95..91f0727e3 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -38,6 +38,9 @@ #include /* in_interrupt() */ +/* Number of nodes in fully populated tree of given height */ +static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; + /* * Radix tree node cache. */ @@ -102,10 +105,10 @@ static unsigned int radix_tree_descend(struct radix_tree_node *parent, #ifdef CONFIG_RADIX_TREE_MULTIORDER if (radix_tree_is_internal_node(entry)) { - unsigned long siboff = get_slot_offset(parent, entry); - if (siboff < RADIX_TREE_MAP_SIZE) { - offset = siboff; - entry = rcu_dereference_raw(parent->slots[offset]); + if (is_sibling_entry(parent, entry)) { + void **sibentry = (void **) entry_to_node(entry); + offset = get_slot_offset(parent, sibentry); + entry = rcu_dereference_raw(*sibentry); } } #endif @@ -342,7 +345,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ -static int __radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask, int nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -356,14 +359,14 @@ static int __radix_tree_preload(gfp_t gfp_mask) preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { + while (rtp->nr < nr) { preempt_enable(); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); - if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { + if (rtp->nr < nr) { node->private_data = rtp->nodes; rtp->nodes = node; rtp->nr++; @@ -389,7 +392,7 @@ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... */ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); - return __radix_tree_preload(gfp_mask); + return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); } EXPORT_SYMBOL(radix_tree_preload); @@ -401,13 +404,58 @@ EXPORT_SYMBOL(radix_tree_preload); int radix_tree_maybe_preload(gfp_t gfp_mask) { if (gfpflags_allow_blocking(gfp_mask)) - return __radix_tree_preload(gfp_mask); + return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ preempt_disable(); return 0; } EXPORT_SYMBOL(radix_tree_maybe_preload); +/* + * The same as function above, but preload number of nodes required to insert + * (1 << order) continuous naturally-aligned elements. + */ +int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) +{ + unsigned long nr_subtrees; + int nr_nodes, subtree_height; + + /* Preloading doesn't help anything with this gfp mask, skip it */ + if (!gfpflags_allow_blocking(gfp_mask)) { + preempt_disable(); + return 0; + } + + /* + * Calculate number and height of fully populated subtrees it takes to + * store (1 << order) elements. + */ + nr_subtrees = 1 << order; + for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE; + subtree_height++) + nr_subtrees >>= RADIX_TREE_MAP_SHIFT; + + /* + * The worst case is zero height tree with a single item at index 0 and + * then inserting items starting at ULONG_MAX - (1 << order). + * + * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to + * 0-index item. + */ + nr_nodes = RADIX_TREE_MAX_PATH; + + /* Plus branch to fully populated subtrees. */ + nr_nodes += RADIX_TREE_MAX_PATH - subtree_height; + + /* Root node is shared. */ + nr_nodes--; + + /* Plus nodes required to build subtrees. */ + nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height]; + + return __radix_tree_preload(gfp_mask, nr_nodes); +} + /* * The maximum index which can be stored in a radix tree */ @@ -1577,6 +1625,31 @@ radix_tree_node_ctor(void *arg) INIT_LIST_HEAD(&node->private_list); } +static __init unsigned long __maxindex(unsigned int height) +{ + unsigned int width = height * RADIX_TREE_MAP_SHIFT; + int shift = RADIX_TREE_INDEX_BITS - width; + + if (shift < 0) + return ~0UL; + if (shift >= BITS_PER_LONG) + return 0UL; + return ~0UL >> shift; +} + +static __init void radix_tree_init_maxnodes(void) +{ + unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1]; + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) + height_to_maxindex[i] = __maxindex(i); + for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) { + for (j = i; j > 0; j--) + height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1; + } +} + static int radix_tree_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1603,5 +1676,6 @@ void __init radix_tree_init(void) sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); + radix_tree_init_maxnodes(); hotcpu_notifier(radix_tree_callback, 0); } diff --git a/lib/random32.c b/lib/random32.c index 510d1ce7d..69ed593aa 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -233,7 +233,6 @@ static void __prandom_timer(unsigned long dontcare) static void __init __prandom_start_seed_timer(void) { - set_timer_slack(&seed_timer, HZ); seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC); add_timer(&seed_timer); } diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 2c5de8646..08f8043ca 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -46,12 +46,14 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) rs->begin = jiffies; if (time_is_before_jiffies(rs->begin + rs->interval)) { - if (rs->missed) - printk(KERN_WARNING "%s: %d callbacks suppressed\n", - func, rs->missed); + if (rs->missed) { + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { + pr_warn("%s: %d callbacks suppressed\n", func, rs->missed); + rs->missed = 0; + } + } rs->begin = jiffies; rs->printed = 0; - rs->missed = 0; } if (rs->burst && rs->burst > rs->printed) { rs->printed++; diff --git a/lib/rbtree.c b/lib/rbtree.c index 1356454e3..eb8a19fee 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -539,17 +539,39 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, { struct rb_node *parent = rb_parent(victim); + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; + /* Set the surrounding nodes to point to the replacement */ - __rb_change_child(victim, new, parent, root); if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); + __rb_change_child(victim, new, parent, root); +} +EXPORT_SYMBOL(rb_replace_node); + +void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *parent = rb_parent(victim); /* Copy the pointers/colour from the victim to the replacement */ *new = *victim; + + /* Set the surrounding nodes to point to the replacement */ + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); + if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + + /* Set the parent's pointer to the new node last after an RCU barrier + * so that the pointers onwards are seen to be set correctly when doing + * an RCU walk over the tree. + */ + __rb_change_child_rcu(victim, new, parent, root); } -EXPORT_SYMBOL(rb_replace_node); +EXPORT_SYMBOL(rb_replace_node_rcu); static struct rb_node *rb_left_deepest_node(const struct rb_node *node) { diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 5d845ffd7..56054e541 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -30,7 +30,7 @@ #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U -#define BUCKET_LOCKS_PER_CPU 128UL +#define BUCKET_LOCKS_PER_CPU 32UL static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, @@ -70,21 +70,25 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, unsigned int nr_pcpus = num_possible_cpus(); #endif - nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); + nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL); size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); /* Never allocate more than 0.5 locks per bucket */ size = min_t(unsigned int, size, tbl->size >> 1); if (sizeof(spinlock_t) != 0) { + tbl->locks = NULL; #ifdef CONFIG_NUMA if (size * sizeof(spinlock_t) > PAGE_SIZE && gfp == GFP_KERNEL) tbl->locks = vmalloc(size * sizeof(spinlock_t)); - else #endif - tbl->locks = kmalloc_array(size, sizeof(spinlock_t), - gfp); + if (gfp != GFP_KERNEL) + gfp |= __GFP_NOWARN | __GFP_NORETRY; + + if (!tbl->locks) + tbl->locks = kmalloc_array(size, sizeof(spinlock_t), + gfp); if (!tbl->locks) return -ENOMEM; for (i = 0; i < size; i++) @@ -321,12 +325,14 @@ static int rhashtable_expand(struct rhashtable *ht) static int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); - unsigned int size; + unsigned int nelems = atomic_read(&ht->nelems); + unsigned int size = 0; int err; ASSERT_RHT_MUTEX(ht); - size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); + if (nelems) + size = roundup_pow_of_two(nelems * 3 / 2); if (size < ht->p.min_size) size = ht->p.min_size; diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c new file mode 100644 index 000000000..8d0632917 --- /dev/null +++ b/lib/sradix-tree.c @@ -0,0 +1,476 @@ +#include +#include +#include +#include +#include +#include +#include + +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) +{ + return node->fulls == root->stores_size || + (node->height == 1 && node->count == root->stores_size); +} + +/* + * Extend a sradix tree so it can store key @index. + */ +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) +{ + struct sradix_tree_node *node; + unsigned int height; + + if (unlikely(root->rnode == NULL)) { + if (!(node = root->alloc())) + return -ENOMEM; + + node->height = 1; + root->rnode = node; + root->height = 1; + } + + /* Figure out what the height should be. */ + height = root->height; + index >>= root->shift * height; + + while (index) { + index >>= root->shift; + height++; + } + + while (height > root->height) { + unsigned int newheight; + if (!(node = root->alloc())) + return -ENOMEM; + + /* Increase the height. */ + node->stores[0] = root->rnode; + root->rnode->parent = node; + if (root->extend) + root->extend(node, root->rnode); + + newheight = root->height + 1; + node->height = newheight; + node->count = 1; + if (sradix_node_full(root, root->rnode)) + node->fulls = 1; + + root->rnode = node; + root->height = newheight; + } + + return 0; +} + +/* + * Search the next item from the current node, that is not NULL + * and can satify root->iter(). + */ +void *sradix_tree_next(struct sradix_tree_root *root, + struct sradix_tree_node *node, unsigned long index, + int (*iter)(void *item, unsigned long height)) +{ + unsigned long offset; + void *item; + + if (unlikely(node == NULL)) { + node = root->rnode; + for (offset = 0; offset < root->stores_size; offset++) { + item = node->stores[offset]; + if (item && (!iter || iter(item, node->height))) + break; + } + + if (unlikely(offset >= root->stores_size)) + return NULL; + + if (node->height == 1) + return item; + else + goto go_down; + } + + while (node) { + offset = (index & root->mask) + 1; + for (;offset < root->stores_size; offset++) { + item = node->stores[offset]; + if (item && (!iter || iter(item, node->height))) + break; + } + + if (offset < root->stores_size) + break; + + node = node->parent; + index >>= root->shift; + } + + if (!node) + return NULL; + + while (node->height > 1) { +go_down: + node = item; + for (offset = 0; offset < root->stores_size; offset++) { + item = node->stores[offset]; + if (item && (!iter || iter(item, node->height))) + break; + } + + if (unlikely(offset >= root->stores_size)) + return NULL; + } + + BUG_ON(offset > root->stores_size); + + return item; +} + +/* + * Blindly insert the item to the tree. Typically, we reuse the + * first empty store item. + */ +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) +{ + unsigned long index; + unsigned int height; + struct sradix_tree_node *node, *tmp = NULL; + int offset, offset_saved; + void **store = NULL; + int error, i, j, shift; + +go_on: + index = root->min; + + if (root->enter_node && !sradix_node_full(root, root->enter_node)) { + node = root->enter_node; + BUG_ON((index >> (root->shift * root->height))); + } else { + node = root->rnode; + if (node == NULL || (index >> (root->shift * root->height)) + || sradix_node_full(root, node)) { + error = sradix_tree_extend(root, index); + if (error) + return error; + + node = root->rnode; + } + } + + + height = node->height; + shift = (height - 1) * root->shift; + offset = (index >> shift) & root->mask; + while (shift > 0) { + offset_saved = offset; + for (; offset < root->stores_size; offset++) { + store = &node->stores[offset]; + tmp = *store; + + if (!tmp || !sradix_node_full(root, tmp)) + break; + } + BUG_ON(offset >= root->stores_size); + + if (offset != offset_saved) { + index += (offset - offset_saved) << shift; + index &= ~((1UL << shift) - 1); + } + + if (!tmp) { + if (!(tmp = root->alloc())) + return -ENOMEM; + + tmp->height = shift / root->shift; + *store = tmp; + tmp->parent = node; + node->count++; +// if (root->extend) +// root->extend(node, tmp); + } + + node = tmp; + shift -= root->shift; + offset = (index >> shift) & root->mask; + } + + BUG_ON(node->height != 1); + + + store = &node->stores[offset]; + for (i = 0, j = 0; + j < root->stores_size - node->count && + i < root->stores_size - offset && j < num; i++) { + if (!store[i]) { + store[i] = item[j]; + if (root->assign) + root->assign(node, index + i, item[j]); + j++; + } + } + + node->count += j; + root->num += j; + num -= j; + + while (sradix_node_full(root, node)) { + node = node->parent; + if (!node) + break; + + node->fulls++; + } + + if (unlikely(!node)) { + /* All nodes are full */ + root->min = 1 << (root->height * root->shift); + root->enter_node = NULL; + } else { + root->min = index + i - 1; + root->min |= (1UL << (node->height - 1)) - 1; + root->min++; + root->enter_node = node; + } + + if (num) { + item += j; + goto go_on; + } + + return 0; +} + + +/** + * sradix_tree_shrink - shrink height of a sradix tree to minimal + * @root sradix tree root + * + */ +static inline void sradix_tree_shrink(struct sradix_tree_root *root) +{ + /* try to shrink tree height */ + while (root->height > 1) { + struct sradix_tree_node *to_free = root->rnode; + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost store, we cannot shrink. + */ + if (to_free->count != 1 || !to_free->stores[0]) + break; + + root->rnode = to_free->stores[0]; + root->rnode->parent = NULL; + root->height--; + if (unlikely(root->enter_node == to_free)) { + root->enter_node = NULL; + } + root->free(to_free); + } +} + +/* + * Del the item on the known leaf node and index + */ +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, + struct sradix_tree_node *node, unsigned long index) +{ + unsigned int offset; + struct sradix_tree_node *start, *end; + + BUG_ON(node->height != 1); + + start = node; + while (node && !(--node->count)) + node = node->parent; + + end = node; + if (!node) { + root->rnode = NULL; + root->height = 0; + root->min = 0; + root->num = 0; + root->enter_node = NULL; + } else { + offset = (index >> (root->shift * (node->height - 1))) & root->mask; + if (root->rm) + root->rm(node, offset); + node->stores[offset] = NULL; + root->num--; + if (root->min > index) { + root->min = index; + root->enter_node = node; + } + } + + if (start != end) { + do { + node = start; + start = start->parent; + if (unlikely(root->enter_node == node)) + root->enter_node = end; + root->free(node); + } while (start != end); + + /* + * Note that shrink may free "end", so enter_node still need to + * be checked inside. + */ + sradix_tree_shrink(root); + } else if (node->count == root->stores_size - 1) { + /* It WAS a full leaf node. Update the ancestors */ + node = node->parent; + while (node) { + node->fulls--; + if (node->fulls != root->stores_size - 1) + break; + + node = node->parent; + } + } +} + +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) +{ + unsigned int height, offset; + struct sradix_tree_node *node; + int shift; + + node = root->rnode; + if (node == NULL || (index >> (root->shift * root->height))) + return NULL; + + height = root->height; + shift = (height - 1) * root->shift; + + do { + offset = (index >> shift) & root->mask; + node = node->stores[offset]; + if (!node) + return NULL; + + shift -= root->shift; + } while (shift >= 0); + + return node; +} + +/* + * Return the item if it exists, otherwise create it in place + * and return the created item. + */ +void *sradix_tree_lookup_create(struct sradix_tree_root *root, + unsigned long index, void *(*item_alloc)(void)) +{ + unsigned int height, offset; + struct sradix_tree_node *node, *tmp; + void *item; + int shift, error; + + if (root->rnode == NULL || (index >> (root->shift * root->height))) { + if (item_alloc) { + error = sradix_tree_extend(root, index); + if (error) + return NULL; + } else { + return NULL; + } + } + + node = root->rnode; + height = root->height; + shift = (height - 1) * root->shift; + + do { + offset = (index >> shift) & root->mask; + if (!node->stores[offset]) { + if (!(tmp = root->alloc())) + return NULL; + + tmp->height = shift / root->shift; + node->stores[offset] = tmp; + tmp->parent = node; + node->count++; + node = tmp; + } else { + node = node->stores[offset]; + } + + shift -= root->shift; + } while (shift > 0); + + BUG_ON(node->height != 1); + offset = index & root->mask; + if (node->stores[offset]) { + return node->stores[offset]; + } else if (item_alloc) { + if (!(item = item_alloc())) + return NULL; + + node->stores[offset] = item; + + /* + * NOTE: we do NOT call root->assign here, since this item is + * newly created by us having no meaning. Caller can call this + * if it's necessary to do so. + */ + + node->count++; + root->num++; + + while (sradix_node_full(root, node)) { + node = node->parent; + if (!node) + break; + + node->fulls++; + } + + if (unlikely(!node)) { + /* All nodes are full */ + root->min = 1 << (root->height * root->shift); + } else { + if (root->min == index) { + root->min |= (1UL << (node->height - 1)) - 1; + root->min++; + root->enter_node = node; + } + } + + return item; + } else { + return NULL; + } + +} + +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) +{ + unsigned int height, offset; + struct sradix_tree_node *node; + int shift; + + node = root->rnode; + if (node == NULL || (index >> (root->shift * root->height))) + return -ENOENT; + + height = root->height; + shift = (height - 1) * root->shift; + + do { + offset = (index >> shift) & root->mask; + node = node->stores[offset]; + if (!node) + return -ENOENT; + + shift -= root->shift; + } while (shift > 0); + + offset = index & root->mask; + if (!node->stores[offset]) + return -ENOENT; + + sradix_tree_delete_from_leaf(root, node, index); + + return 0; +} diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 53ad6c083..60f77f1d4 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -242,6 +242,7 @@ depot_stack_handle_t depot_save_stack(struct stack_trace *trace, */ alloc_flags &= ~GFP_ZONEMASK; alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); + alloc_flags |= __GFP_NOWARN; page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); if (page) prealloc = page_address(page); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33f655ef4..9c5fe8110 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -40,8 +40,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -56,8 +56,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return -EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -76,6 +75,7 @@ byte_at_a_time: * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ +efault: return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 262594362..8e105ed4d 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 76f29ecba..22e13a0e1 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -738,7 +738,7 @@ swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { phys_addr_t map, phys = page_to_phys(page) + offset; dma_addr_t dev_addr = phys_to_dma(dev, phys); @@ -807,7 +807,7 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { unmap_single(hwdev, dev_addr, size, dir); } @@ -877,7 +877,7 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_device); */ int swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; @@ -914,7 +914,7 @@ int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir) { - return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); + return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, 0); } EXPORT_SYMBOL(swiotlb_map_sg); @@ -924,7 +924,8 @@ EXPORT_SYMBOL(swiotlb_map_sg); */ void swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) + int nelems, enum dma_data_direction dir, + unsigned long attrs) { struct scatterlist *sg; int i; @@ -941,7 +942,7 @@ void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir) { - return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); + return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, 0); } EXPORT_SYMBOL(swiotlb_unmap_sg); diff --git a/lib/test_hash.c b/lib/test_hash.c index c9549c8b4..cac20c5fb 100644 --- a/lib/test_hash.c +++ b/lib/test_hash.c @@ -143,7 +143,7 @@ static int __init test_hash_init(void) { char buf[SIZE+1]; - u32 string_or = 0, hash_or[2][33] = { 0 }; + u32 string_or = 0, hash_or[2][33] = { { 0, } }; unsigned tests = 0; unsigned long long h64 = 0; int i, j; @@ -155,8 +155,8 @@ test_hash_init(void) buf[j] = '\0'; for (i = 0; i <= j; i++) { - u64 hashlen = hashlen_string(buf+i); - u32 h0 = full_name_hash(buf+i, j-i); + u64 hashlen = hashlen_string(buf+i, buf+i); + u32 h0 = full_name_hash(buf+i, buf+i, j-i); /* Check that hashlen_string gets the length right */ if (hashlen_len(hashlen) != j-i) { @@ -219,21 +219,27 @@ test_hash_init(void) } /* Issue notices about skipped tests. */ -#ifndef HAVE_ARCH__HASH_32 - pr_info("__hash_32() has no arch implementation to test."); -#elif HAVE_ARCH__HASH_32 != 1 +#ifdef HAVE_ARCH__HASH_32 +#if HAVE_ARCH__HASH_32 != 1 pr_info("__hash_32() is arch-specific; not compared to generic."); #endif -#ifndef HAVE_ARCH_HASH_32 - pr_info("hash_32() has no arch implementation to test."); -#elif HAVE_ARCH_HASH_32 != 1 +#else + pr_info("__hash_32() has no arch implementation to test."); +#endif +#ifdef HAVE_ARCH_HASH_32 +#if HAVE_ARCH_HASH_32 != 1 pr_info("hash_32() is arch-specific; not compared to generic."); #endif -#ifndef HAVE_ARCH_HASH_64 - pr_info("hash_64() has no arch implementation to test."); -#elif HAVE_ARCH_HASH_64 != 1 +#else + pr_info("hash_32() has no arch implementation to test."); +#endif +#ifdef HAVE_ARCH_HASH_64 +#if HAVE_ARCH_HASH_64 != 1 pr_info("hash_64() is arch-specific; not compared to generic."); #endif +#else + pr_info("hash_64() has no arch implementation to test."); +#endif pr_notice("%u tests passed.", tests); diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 297fdb5e7..64e899b63 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -38,7 +38,7 @@ MODULE_PARM_DESC(runs, "Number of test runs per variant (default: 4)"); static int max_size = 0; module_param(max_size, int, 0); -MODULE_PARM_DESC(runs, "Maximum table size (default: calculated)"); +MODULE_PARM_DESC(max_size, "Maximum table size (default: calculated)"); static bool shrinking = false; module_param(shrinking, bool, 0); diff --git a/lib/ubsan.c b/lib/ubsan.c index 8799ae5e2..fb0409df1 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -308,7 +308,7 @@ static void handle_object_size_mismatch(struct type_mismatch_data *data, return; ubsan_prologue(&data->location, &flags); - pr_err("%s address %pk with insufficient space\n", + pr_err("%s address %p with insufficient space\n", type_check_kinds[data->type_check_kind], (void *) ptr); pr_err("for an object of type %s\n", data->type->type_name); diff --git a/lib/wbt.c b/lib/wbt.c index cc5a24270..257c7b099 100644 --- a/lib/wbt.c +++ b/lib/wbt.c @@ -1,5 +1,5 @@ /* - * buffered writeback throttling. losely based on CoDel. We can't drop + * buffered writeback throttling. loosely based on CoDel. We can't drop * packets for IO scheduling, so the logic is something like this: * * - Monitor latencies in a defined window of time. @@ -9,30 +9,31 @@ * - For any window where we don't have solid data on what the latencies * look like, retain status quo. * - If latencies look good, decrement scaling step. + * - If we're only doing writes, allow the scaling step to go negative. This + * will temporarily boost write performance, snapping back to a stable + * scaling step of 0 if reads show up or the heavy writers finish. Unlike + * positive scaling steps where we shrink the monitoring window, a negative + * scaling step retains the default step==0 window size. * * Copyright (C) 2016 Jens Axboe * - * Things that (may) need changing: - * - * - Different scaling of background/normal/high priority writeback. - * We may have to violate guarantees for max. - * - We can have mismatches between the stat window and our window. - * */ #include #include #include #include #include +#include #define CREATE_TRACE_POINTS #include enum { /* - * Might need to be higher + * Default setting, we'll scale up (to 75% of QD max) or down (min 1) + * from here depending on device stats */ - RWB_MAX_DEPTH = 64, + RWB_DEF_DEPTH = 16, /* * 100msec window @@ -40,10 +41,9 @@ enum { RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, /* - * Disregard stats, if we don't meet these minimums + * Disregard stats, if we don't meet this minimum */ RWB_MIN_WRITE_SAMPLES = 3, - RWB_MIN_READ_SAMPLES = 1, /* * If we have this number of consecutive windows with not enough @@ -89,18 +89,51 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) } } -void __wbt_done(struct rq_wb *rwb) +/* + * If a task was rate throttled in balance_dirty_pages() within the last + * second or so, use that to indicate a higher cleaning rate. + */ +static bool wb_recent_wait(struct rq_wb *rwb) +{ + struct bdi_writeback *wb = &rwb->bdi->wb; + + return time_before(jiffies, wb->dirty_sleep + HZ); +} + +static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd) +{ + return &rwb->rq_wait[is_kswapd]; +} + +static void rwb_wake_all(struct rq_wb *rwb) +{ + int i; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + struct rq_wait *rqw = &rwb->rq_wait[i]; + + if (waitqueue_active(&rqw->wait)) + wake_up_all(&rqw->wait); + } +} + +void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) { + struct rq_wait *rqw; int inflight, limit; - inflight = atomic_dec_return(&rwb->inflight); + if (!(wb_acct & WBT_TRACKED)) + return; + + rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD); + inflight = atomic_dec_return(&rqw->inflight); /* * wbt got disabled with IO in flight. Wake up any potential * waiters, we don't have to do more than that. */ if (unlikely(!rwb_enabled(rwb))) { - wake_up_all(&rwb->wait); + rwb_wake_all(rwb); return; } @@ -108,7 +141,7 @@ void __wbt_done(struct rq_wb *rwb) * If the device does write back caching, drop further down * before we wake people up. */ - if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping)) + if (rwb->wc && !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; @@ -119,11 +152,11 @@ void __wbt_done(struct rq_wb *rwb) if (inflight && inflight >= limit) return; - if (waitqueue_active(&rwb->wait)) { + if (waitqueue_active(&rqw->wait)) { int diff = limit - inflight; if (!inflight || diff >= rwb->wb_background / 2) - wake_up_nr(&rwb->wait, 1); + wake_up(&rqw->wait); } } @@ -136,27 +169,33 @@ void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat) if (!rwb) return; - if (!wbt_tracked(stat)) { + if (!wbt_is_tracked(stat)) { if (rwb->sync_cookie == stat) { rwb->sync_issue = 0; rwb->sync_cookie = NULL; } - wb_timestamp(rwb, &rwb->last_comp); + if (wbt_is_read(stat)) + wb_timestamp(rwb, &rwb->last_comp); + wbt_clear_state(stat); } else { WARN_ON_ONCE(stat == rwb->sync_cookie); - __wbt_done(rwb); - wbt_clear_tracked(stat); + __wbt_done(rwb, wbt_stat_to_mask(stat)); + wbt_clear_state(stat); } } -static void calc_wb_limits(struct rq_wb *rwb) +/* + * Return true, if we can't increase the depth further by scaling + */ +static bool calc_wb_limits(struct rq_wb *rwb) { unsigned int depth; + bool ret = false; if (!rwb->min_lat_nsec) { rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; - return; + return false; } /* @@ -167,22 +206,44 @@ static void calc_wb_limits(struct rq_wb *rwb) * scaling down, then keep a setting of 1/1/1. */ if (rwb->queue_depth == 1) { - if (rwb->scale_step) + if (rwb->scale_step > 0) rwb->wb_max = rwb->wb_normal = 1; - else + else { rwb->wb_max = rwb->wb_normal = 2; + ret = true; + } rwb->wb_background = 1; } else { - depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth); + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); + if (rwb->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); + else if (rwb->scale_step < 0) { + unsigned int maxd = 3 * rwb->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rwb->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } /* * Set our max/normal/bg queue depths based on how far * we have scaled down (->scale_step). */ - rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step)); + rwb->wb_max = depth; rwb->wb_normal = (rwb->wb_max + 1) / 2; rwb->wb_background = (rwb->wb_max + 3) / 4; } + + return ret; } static bool inline stat_sample_valid(struct blk_rq_stat *stat) @@ -209,8 +270,9 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb) } enum { - LAT_OK, + LAT_OK = 1, LAT_UNKNOWN, + LAT_UNKNOWN_WRITES, LAT_EXCEEDED, }; @@ -234,8 +296,21 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) return LAT_EXCEEDED; } - if (!stat_sample_valid(stat)) + /* + * No read/write mix, if stat isn't valid + */ + if (!stat_sample_valid(stat)) { + /* + * If we had writes in this stat window and the window is + * current, we're only doing writes. If a task recently + * waited or still has writes in flights, consider us doing + * just writes as well. + */ + if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) || + wb_recent_wait(rwb) || wbt_inflight(rwb)) + return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; + } /* * If the 'min' latency exceeds our target, step down. @@ -269,23 +344,27 @@ static void rwb_trace_step(struct rq_wb *rwb, const char *msg) static void scale_up(struct rq_wb *rwb) { /* - * If we're at 0, we can't go lower. + * Hit max in previous round, stop here */ - if (!rwb->scale_step) + if (rwb->scaled_max) return; rwb->scale_step--; rwb->unknown_cnt = 0; rwb->stat_ops->clear(rwb->ops_data); - calc_wb_limits(rwb); - if (waitqueue_active(&rwb->wait)) - wake_up_all(&rwb->wait); + rwb->scaled_max = calc_wb_limits(rwb); + + rwb_wake_all(rwb); rwb_trace_step(rwb, "step up"); } -static void scale_down(struct rq_wb *rwb) +/* + * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we + * had a latency violation. + */ +static void scale_down(struct rq_wb *rwb, bool hard_throttle) { /* * Stop scaling down when we've hit the limit. This also prevents @@ -295,7 +374,12 @@ static void scale_down(struct rq_wb *rwb) if (rwb->wb_max == 1) return; - rwb->scale_step++; + if (rwb->scale_step < 0 && hard_throttle) + rwb->scale_step = 0; + else + rwb->scale_step++; + + rwb->scaled_max = false; rwb->unknown_cnt = 0; rwb->stat_ops->clear(rwb->ops_data); calc_wb_limits(rwb); @@ -306,13 +390,23 @@ static void rwb_arm_timer(struct rq_wb *rwb) { unsigned long expires; - /* - * We should speed this up, using some variant of a fast integer - * inverse square root calculation. Since we only do this for - * every window expiration, it's not a huge deal, though. - */ - rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, + if (rwb->scale_step > 0) { + /* + * We should speed this up, using some variant of a fast + * integer inverse square root calculation. Since we only do + * this for every window expiration, it's not a huge deal, + * though. + */ + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, int_sqrt((rwb->scale_step + 1) << 8)); + } else { + /* + * For step < 0, we don't want to increase/decrease the + * window size. + */ + rwb->cur_win_nsec = rwb->win_nsec; + } + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); mod_timer(&rwb->window_timer, expires); } @@ -320,28 +414,45 @@ static void rwb_arm_timer(struct rq_wb *rwb) static void wb_timer_fn(unsigned long data) { struct rq_wb *rwb = (struct rq_wb *) data; + unsigned int inflight = wbt_inflight(rwb); int status; + status = latency_exceeded(rwb); + + trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight); + /* * If we exceeded the latency target, step down. If we did not, * step one level up. If we don't know enough to say either exceeded * or ok, then don't do anything. */ - status = latency_exceeded(rwb); switch (status) { case LAT_EXCEEDED: - scale_down(rwb); + scale_down(rwb, true); break; case LAT_OK: scale_up(rwb); break; + case LAT_UNKNOWN_WRITES: + scale_up(rwb); + break; case LAT_UNKNOWN: + if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) + break; /* - * We had no read samples, start bumping up the write - * depth slowly + * We get here for two reasons: + * + * 1) We previously scaled reduced depth, and we currently + * don't have a valid read/write sample. For that case, + * slowly return to center state (step == 0). + * 2) We started a the center step, but don't have a valid + * read/write sample, but we do have writes going on. + * Allow step to go negative, to increase write perf. */ - if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP) + if (rwb->scale_step > 0) scale_up(rwb); + else if (rwb->scale_step < 0) + scale_down(rwb, false); break; default: break; @@ -350,17 +461,17 @@ static void wb_timer_fn(unsigned long data) /* * Re-arm timer, if we have IO in flight */ - if (rwb->scale_step || atomic_read(&rwb->inflight)) + if (rwb->scale_step || inflight) rwb_arm_timer(rwb); } void wbt_update_limits(struct rq_wb *rwb) { rwb->scale_step = 0; + rwb->scaled_max = false; calc_wb_limits(rwb); - if (waitqueue_active(&rwb->wait)) - wake_up_all(&rwb->wait); + rwb_wake_all(rwb); } static bool close_io(struct rq_wb *rwb) @@ -378,13 +489,14 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) unsigned int limit; /* - * At this point we know it's a buffered write. If REQ_SYNC is - * set, then it's WB_SYNC_ALL writeback, and we'll use the max - * limit for that. If the write is marked as a background write, - * then use the idle limit, or go to normal if we haven't had - * competing IO for a bit. + * At this point we know it's a buffered write. If this is + * kswapd trying to free memory, or REQ_SYNC is set, set, then + * it's WB_SYNC_ALL writeback, and we'll use the max limit for + * that. If the write is marked as a background write, then use + * the idle limit, or go to normal if we haven't had competing + * IO for a bit. */ - if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping)) + if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) limit = rwb->wb_max; else if ((rw & REQ_BG) || close_io(rwb)) { /* @@ -398,7 +510,8 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) return limit; } -static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) +static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, + unsigned long rw) { /* * inc it here even if disabled, since we'll dec it at completion. @@ -406,11 +519,11 @@ static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) * and someone turned it off at the same time. */ if (!rwb_enabled(rwb)) { - atomic_inc(&rwb->inflight); + atomic_inc(&rqw->inflight); return true; } - return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)); + return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); } /* @@ -419,16 +532,17 @@ static inline bool may_queue(struct rq_wb *rwb, unsigned long rw) */ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) { + struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd()); DEFINE_WAIT(wait); - if (may_queue(rwb, rw)) + if (may_queue(rwb, rqw, rw)) return; do { - prepare_to_wait_exclusive(&rwb->wait, &wait, + prepare_to_wait_exclusive(&rqw->wait, &wait, TASK_UNINTERRUPTIBLE); - if (may_queue(rwb, rw)) + if (may_queue(rwb, rqw, rw)) break; if (lock) @@ -440,15 +554,17 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) spin_lock_irq(lock); } while (1); - finish_wait(&rwb->wait, &wait); + finish_wait(&rqw->wait, &wait); } static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw) { + const int op = rw >> BIO_OP_SHIFT; + /* * If not a WRITE (or a discard), do nothing */ - if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD)) + if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD)) return false; /* @@ -466,14 +582,20 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw) * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. */ -bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock) +unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock) { + unsigned int ret = 0; + if (!rwb_enabled(rwb)) - return false; + return 0; + + if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ) + ret = WBT_READ; if (!wbt_should_throttle(rwb, rw)) { - wb_timestamp(rwb, &rwb->last_issue); - return false; + if (ret & WBT_READ) + wb_timestamp(rwb, &rwb->last_issue); + return ret; } __wbt_wait(rwb, rw, lock); @@ -481,7 +603,10 @@ bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock) if (!timer_pending(&rwb->window_timer)) rwb_arm_timer(rwb); - return true; + if (current_is_kswapd()) + ret |= WBT_KSWAPD; + + return ret | WBT_TRACKED; } void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat) @@ -499,7 +624,7 @@ void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat) * only use the address to compare with, which is why we store the * sync_issue time locally. */ - if (!wbt_tracked(stat) && !rwb->sync_issue) { + if (wbt_is_read(stat) && !rwb->sync_issue) { rwb->sync_cookie = stat; rwb->sync_issue = wbt_issue_stat_get_time(stat); } @@ -531,9 +656,11 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) void wbt_disable(struct rq_wb *rwb) { - del_timer_sync(&rwb->window_timer); - rwb->win_nsec = rwb->min_lat_nsec = 0; - wbt_update_limits(rwb); + if (rwb) { + del_timer_sync(&rwb->window_timer); + rwb->win_nsec = rwb->min_lat_nsec = 0; + wbt_update_limits(rwb); + } } EXPORT_SYMBOL_GPL(wbt_disable); @@ -541,20 +668,27 @@ struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops, void *ops_data) { struct rq_wb *rwb; + int i; + + if (!ops->get || !ops->is_current || !ops->clear) + return ERR_PTR(-EINVAL); rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) return ERR_PTR(-ENOMEM); - atomic_set(&rwb->inflight, 0); - init_waitqueue_head(&rwb->wait); + for (i = 0; i < WBT_NUM_RWQ; i++) { + atomic_set(&rwb->rq_wait[i].inflight, 0); + init_waitqueue_head(&rwb->rq_wait[i].wait); + } + setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); rwb->wc = 1; - rwb->queue_depth = RWB_MAX_DEPTH; + rwb->queue_depth = RWB_DEF_DEPTH; rwb->last_comp = rwb->last_issue = jiffies; rwb->bdi = bdi; rwb->win_nsec = RWB_WINDOW_NSEC; - rwb->stat_ops = ops, + rwb->stat_ops = ops; rwb->ops_data = ops_data; wbt_update_limits(rwb); return rwb; -- cgit v1.2.3