diff options
Diffstat (limited to 'arch/x86')
264 files changed, 9582 insertions, 3909 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b3a1a5d77..a43ae1fa2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,7 +27,8 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_PMEM_API + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI @@ -41,6 +42,7 @@ config X86 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION if X86_32 @@ -133,7 +135,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS - select HAVE_UID16 if X86_32 + select HAVE_UID16 if X86_32 || IA32_EMULATION select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select IRQ_FORCED_THREADING @@ -875,10 +877,26 @@ config SCHED_SMT depends on SMP ---help--- SMT scheduler support improves the CPU scheduler's decision making - when dealing with Intel Pentium 4 chips with HyperThreading at a + when dealing with Intel P4/Core 2 chips with HyperThreading at a cost of slightly increased overhead in some places. If unsure say N here. +config SMT_NICE + bool "SMT (Hyperthreading) aware nice priority and policy support" + depends on SCHED_BFS && SCHED_SMT + default y + ---help--- + Enabling Hyperthreading on Intel CPUs decreases the effectiveness + of the use of 'nice' levels and different scheduling policies + (e.g. realtime) due to sharing of CPU power between hyperthreads. + SMT nice support makes each logical CPU aware of what is running on + its hyperthread siblings, maintaining appropriate distribution of + CPU according to nice levels and scheduling policies at the expense + of slightly increased overhead. 
+ + If unsure say Y here. + + config SCHED_MC def_bool y prompt "Multi-core scheduler support" @@ -955,6 +973,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS config X86_MCE bool "Machine Check / overheating reporting" + select GENERIC_ALLOCATOR default y ---help--- Machine Check support allows the processor to notify the @@ -1002,19 +1021,42 @@ config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL -config VM86 - bool "Enable VM86 support" if EXPERT - default y +config X86_LEGACY_VM86 + bool "Legacy VM86 support" + default n depends on X86_32 ---help--- - This option is required by programs like DOSEMU to run - 16-bit real mode legacy code on x86 processors. It also may - be needed by software like XFree86 to initialize some video - cards via BIOS. Disabling this option saves about 6K. + This option allows user programs to put the CPU into V8086 + mode, which is an 80286-era approximation of 16-bit real mode. + + Some very old versions of X and/or vbetool require this option + for user mode setting. Similarly, DOSEMU will use it if + available to accelerate real mode DOS programs. However, any + recent version of DOSEMU, X, or vbetool should be fully + functional even without kernel VM86 support, as they will all + fall back to software emulation. Nevertheless, if you are using + a 16-bit DOS program where 16-bit performance matters, vm86 + mode might be faster than emulation and you might want to + enable this option. + + Note that any app that works on a 64-bit kernel is unlikely to + need this option, as 64-bit kernels don't, and can't, support + V8086 mode. This option is also unrelated to 16-bit protected + mode and is not needed to run most 16-bit programs under Wine. + + Enabling this option increases the complexity of the kernel + and slows down exception handling a tiny bit. + + If unsure, say N here. 
+ +config VM86 + bool + default X86_LEGACY_VM86 config X86_16BIT bool "Enable support for 16-bit segments" if EXPERT default y + depends on MODIFY_LDT_SYSCALL ---help--- This option is required by programs like Wine to run 16-bit protected mode legacy code on x86 processors. Disabling @@ -1282,6 +1324,7 @@ config HIGHMEM config X86_PAE bool "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G + select SWIOTLB ---help--- PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It @@ -1426,10 +1469,14 @@ config ILLEGAL_POINTER_VALUE source "mm/Kconfig" +config X86_PMEM_LEGACY_DEVICE + bool + config X86_PMEM_LEGACY - bool "Support non-standard NVDIMMs and ADR protected memory" + tristate "Support non-standard NVDIMMs and ADR protected memory" depends on PHYS_ADDR_T_64BIT depends on BLK_DEV + select X86_PMEM_LEGACY_DEVICE select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used @@ -1509,6 +1556,7 @@ config X86_RESERVE_LOW config MATH_EMULATION bool + depends on MODIFY_LDT_SYSCALL prompt "Math emulation" if X86_32 ---help--- Linux can emulate a math coprocessor (used for floating point @@ -1724,6 +1772,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" + select KEXEC_CORE ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1740,8 +1789,8 @@ config KEXEC config KEXEC_FILE bool "kexec file based system call" + select KEXEC_CORE select BUILD_BIN2C - depends on KEXEC depends on X86_64 depends on CRYPTO=y depends on CRYPTO_SHA256=y @@ -1944,7 +1993,7 @@ config HOTPLUG_CPU config BOOTPARAM_HOTPLUG_CPU0 bool "Set default setting of cpu0_hotpluggable" default n - depends on HOTPLUG_CPU + depends on HOTPLUG_CPU && !SCHED_BFS ---help--- Set whether default state of cpu0_hotpluggable is on or off. 
@@ -1973,7 +2022,7 @@ config BOOTPARAM_HOTPLUG_CPU0 config DEBUG_HOTPLUG_CPU0 def_bool n prompt "Debug CPU0 hotplug" - depends on HOTPLUG_CPU + depends on HOTPLUG_CPU && !SCHED_BFS ---help--- Enabling this option offlines CPU0 (if CPU0 can be offlined) as soon as possible and boots up userspace with CPU0 offlined. User @@ -2053,6 +2102,22 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. +config MODIFY_LDT_SYSCALL + bool "Enable the LDT (local descriptor table)" if EXPERT + default y + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system + call. This is required to run 16-bit or segmented code such as + DOSEMU or some Wine programs. It is also used by some very old + threading libraries. + + Enabling this feature adds a small amount of overhead to + context switches and increases the low-level kernel attack + surface. Disabling it removes the modify_ldt(2) system call. + + Saying 'N' here may make sense for embedded or server kernels. + source "kernel/livepatch/Kconfig" endmenu @@ -2522,7 +2587,7 @@ config IA32_EMULATION depends on X86_64 select BINFMT_ELF select COMPAT_BINFMT_ELF - select HAVE_UID16 + select ARCH_WANT_OLD_COMPAT_IPC ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're @@ -2536,7 +2601,7 @@ config IA32_AOUT config X86_X32 bool "x32 ABI for 64-bit mode" - depends on X86_64 && IA32_EMULATION + depends on X86_64 ---help--- Include code to run binaries for the x32 native 32-bit ABI for 64-bit processors. 
An x32 process gets access to the @@ -2550,7 +2615,6 @@ config X86_X32 config COMPAT def_bool y depends on IA32_EMULATION || X86_X32 - select ARCH_WANT_OLD_COMPAT_IPC if COMPAT config COMPAT_FOR_U64_ALIGNMENT diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 4f6fb1de1..1f9fbf9d6 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -203,6 +203,13 @@ config MPILEDRIVER Enables -march=bdver2 +config MSTEAMROLLER + bool "AMD Steamroller" + ---help--- + Select this for AMD Steamroller processors. + + Enables -march=bdver3 + config MJAGUAR bool "AMD Jaguar" ---help--- @@ -376,6 +383,14 @@ config MBROADWELL Enables -march=broadwell +config MSKYLAKE + bool "Intel Skylake" + ---help--- + + Select this for 6th Gen Core processors in the Skylake family. + + Enables -march=skylake + config GENERIC_CPU bool "Generic-x86-64" depends on X86_64 @@ -430,7 +445,7 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU default "4" if MELAN || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX @@ -461,11 +476,11 @@ config X86_ALIGNMENT_16 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || 
MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MNATIVE + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MATOM || MNATIVE + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MATOM || MNATIVE config X86_USE_3DNOW def_bool y @@ -489,17 +504,17 @@ config X86_P6_NOP config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MNATIVE || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM) || X86_64 config 
X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE + depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE # this should be set for all -march=.. options where the compiler # generates cmov. config X86_CMOV def_bool y - depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) + depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) config X86_MINIMUM_CPU_FAMILY int diff --git a/arch/x86/Makefile b/arch/x86/Makefile index d8102ac3a..d5de9d500 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs endif +# +# Prevent GCC from generating any FP code by mistake. 
+# +# This must happen before we try the -mpreferred-stack-boundary, see: +# +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 +# +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow +KBUILD_CFLAGS += $(call cc-option,-mno-avx,) + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -102,6 +112,7 @@ else cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) + cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3) cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) @@ -121,6 +132,8 @@ else $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) cflags-$(CONFIG_MBROADWELL) += \ $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell)) + cflags-$(CONFIG_MSKYLAKE) += \ + $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake)) cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) @@ -189,9 +202,6 @@ KBUILD_CFLAGS += -pipe KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -# prevent gcc from generating any FP code by mistake -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) @@ -234,6 +244,8 @@ drivers-$(CONFIG_PM) += arch/x86/power/ drivers-$(CONFIG_FB) += arch/x86/video/ +drivers-$(CONFIG_RAS) += arch/x86/ras/ + #### # boot loader support. 
Several targets are kept for legacy purposes diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 558bd3b65..6b71290d8 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -34,6 +34,7 @@ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) +cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon) cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 @@ -50,6 +51,7 @@ cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell) cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) +cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake) cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 57bbf2fb2..0d553e541 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -23,7 +23,7 @@ targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o -setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o +setup-y += early_serial_console.o edd.o header.o main.o memory.o setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o setup-y += video-mode.o version.o 
setup-$(CONFIG_X86_APM_BOOT) += apm.o diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index bd49ec612..0033e96c3 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -307,9 +307,6 @@ void query_edd(void); /* header.S */ void __attribute__((noreturn)) die(void); -/* mca.c */ -int query_mca(void); - /* memory.c */ int detect_memory(void); diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index d7b1f655b..6a9b96b46 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -82,7 +82,7 @@ static unsigned long get_random_long(void) if (has_cpuflag(X86_FEATURE_TSC)) { debug_putstr(" RDTSC"); - rdtscll(raw); + raw = rdtsc(); random ^= raw; use_i8254 = false; diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 7d69afd8b..db51c1f27 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -667,6 +667,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, bool conout_found = false; void *dummy = NULL; u32 h = handles[i]; + u32 current_fb_base; status = efi_call_early(handle_protocol, h, proto, (void **)&gop32); @@ -678,7 +679,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, if (status == EFI_SUCCESS) conout_found = true; - status = __gop_query32(gop32, &info, &size, &fb_base); + status = __gop_query32(gop32, &info, &size, &current_fb_base); if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* * Systems that use the UEFI Console Splitter may @@ -692,6 +693,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, pixel_format = info->pixel_format; pixel_info = info->pixel_information; pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; /* * Once we've found a GOP supporting ConOut, @@ -770,6 +772,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, bool conout_found = false; void *dummy = NULL; u64 h = handles[i]; + u32 current_fb_base; status = efi_call_early(handle_protocol, h, 
proto, (void **)&gop64); @@ -781,7 +784,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, if (status == EFI_SUCCESS) conout_found = true; - status = __gop_query64(gop64, &info, &size, &fb_base); + status = __gop_query64(gop64, &info, &size, &current_fb_base); if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* * Systems that use the UEFI Console Splitter may @@ -795,6 +798,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, pixel_format = info->pixel_format; pixel_info = info->pixel_information; pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; /* * Once we've found a GOP supporting ConOut, @@ -1041,7 +1045,6 @@ void setup_graphics(struct boot_params *boot_params) struct boot_params *make_boot_params(struct efi_config *c) { struct boot_params *boot_params; - struct sys_desc_table *sdt; struct apm_bios_info *bi; struct setup_header *hdr; struct efi_info *efi; @@ -1089,7 +1092,6 @@ struct boot_params *make_boot_params(struct efi_config *c) hdr = &boot_params->hdr; efi = &boot_params->efi_info; bi = &boot_params->apm_bios_info; - sdt = &boot_params->sys_desc_table; /* Copy the second sector to boot_params */ memcpy(&hdr->jump, image->image_base + 512, 512); @@ -1118,8 +1120,6 @@ struct boot_params *make_boot_params(struct efi_config *c) /* Clear APM BIOS info */ memset(bi, 0, sizeof(*bi)); - memset(sdt, 0, sizeof(*sdt)); - status = efi_parse_options(cmdline_ptr); if (status != EFI_SUCCESS) goto fail2; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e28437e0f..79dac1758 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -220,6 +220,23 @@ void __putstr(const char *s) outb(0xff & (pos >> 1), vidport+1); } +void __puthex(unsigned long value) +{ + char alpha[2] = "0"; + int bits; + + for (bits = sizeof(value) * 8 - 4; bits >= 0; bits -= 4) { + unsigned long digit = (value >> bits) & 0xf; + + if (digit < 0xA) + alpha[0] = '0' + digit; + else + 
alpha[0] = 'a' + (digit - 0xA); + + __putstr(alpha); + } +} + static void error(char *x) { error_putstr("\n\n"); @@ -399,6 +416,13 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + /* Report initial kernel position details. */ + debug_putaddr(input_data); + debug_putaddr(input_len); + debug_putaddr(output); + debug_putaddr(output_len); + debug_putaddr(run_size); + /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 805d25ca5..3783dc3e1 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -34,16 +34,27 @@ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; extern struct boot_params *real_mode; /* Pointer to real-mode data */ void __putstr(const char *s); +void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) +#define error_puthex(__x) __puthex(__x) #ifdef CONFIG_X86_VERBOSE_BOOTUP #define debug_putstr(__x) __putstr(__x) +#define debug_puthex(__x) __puthex(__x) +#define debug_putaddr(__x) { \ + debug_putstr(#__x ": 0x"); \ + debug_puthex((unsigned long)(__x)); \ + debug_putstr("\n"); \ + } #else static inline void debug_putstr(const char *s) { } +static inline void debug_puthex(const char *s) +{ } +#define debug_putaddr(x) /* */ #endif diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 16ef02596..2d6b309c8 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -414,7 +414,7 @@ xloadflags: # define XLF23 0 #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC) +#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE) # define XLF4 XLF_EFI_KEXEC #else # define XLF4 0 diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index fd6c9f236..9bcea386d 100644 --- 
a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -161,9 +161,6 @@ void main(void) /* Set keyboard repeat rate (why?) and query the lock flags */ keyboard_init(); - /* Query MCA information */ - query_mca(); - /* Query Intel SpeedStep (IST) information */ query_ist(); diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c deleted file mode 100644 index a95a53114..000000000 --- a/arch/x86/boot/mca.c +++ /dev/null @@ -1,38 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright 2007 rPath, Inc. - All Rights Reserved - * Copyright 2009 Intel Corporation; author H. Peter Anvin - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * - * ----------------------------------------------------------------------- */ - -/* - * Get the MCA system description table - */ - -#include "boot.h" - -int query_mca(void) -{ - struct biosregs ireg, oreg; - u16 len; - - initregs(&ireg); - ireg.ah = 0xc0; - intcall(0x15, &ireg, &oreg); - - if (oreg.eflags & X86_EFLAGS_CF) - return -1; /* No MCA present */ - - set_fs(oreg.es); - len = rdfs16(oreg.bx); - - if (len > sizeof(boot_params.sys_desc_table)) - len = sizeof(boot_params.sys_desc_table); - - copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len); - return 0; -} diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index aaa1118bf..028be48c8 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -23,6 +23,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 315b86106..cb5b3ab5b 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -22,6 +22,7 @@ 
CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -207,7 +208,6 @@ CONFIG_AGP_AMD64=y CONFIG_AGP_INTEL=y CONFIG_DRM=y CONFIG_DRM_I915=y -CONFIG_DRM_I915_KMS=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5a4a089e8..9a2838cf0 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -30,6 +31,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o +obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o # These modules require assembler to support AVX. 
ifeq ($(avx_supported),yes) @@ -60,6 +62,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o +chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o ifeq ($(avx_supported),yes) @@ -75,6 +78,7 @@ endif ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o + chacha20-x86_64-y += chacha20-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o endif @@ -82,8 +86,10 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o +poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o ifeq ($(avx2_supported),yes) sha1-ssse3-y += sha1_avx2_x86_64_asm.o +poly1305-x86_64-y += poly1305-avx2-x86_64.o endif crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index dccad38b5..3633ad614 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -803,10 +803,7 @@ static int rfc4106_init(struct crypto_aead *aead) return PTR_ERR(cryptd_tfm); *ctx = cryptd_tfm; - crypto_aead_set_reqsize( - aead, - sizeof(struct aead_request) + - crypto_aead_reqsize(&cryptd_tfm->base)); + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); return 0; } @@ -955,8 +952,8 @@ static int helper_rfc4106_encrypt(struct aead_request *req) /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length equal */ - /* to 8 or 
12 bytes */ - if (unlikely(req->assoclen != 8 && req->assoclen != 12)) + /* to 16 or 20 bytes */ + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) return -EINVAL; /* IV below built */ @@ -992,9 +989,9 @@ static int helper_rfc4106_encrypt(struct aead_request *req) } kernel_fpu_begin(); - aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, - ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst - + ((unsigned long)req->cryptlen), auth_tag_len); + aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv, + ctx->hash_subkey, assoc, req->assoclen - 8, + dst + req->cryptlen, auth_tag_len); kernel_fpu_end(); /* The authTag (aka the Integrity Check Value) needs to be written @@ -1033,12 +1030,12 @@ static int helper_rfc4106_decrypt(struct aead_request *req) struct scatter_walk dst_sg_walk; unsigned int i; - if (unlikely(req->assoclen != 8 && req->assoclen != 12)) + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) return -EINVAL; /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length */ - /* equal to 8 or 12 bytes */ + /* equal to 16 or 20 bytes */ tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); /* IV below built */ @@ -1075,8 +1072,8 @@ static int helper_rfc4106_decrypt(struct aead_request *req) kernel_fpu_begin(); aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, - ctx->hash_subkey, assoc, (unsigned long)req->assoclen, - authTag, auth_tag_len); + ctx->hash_subkey, assoc, req->assoclen - 8, + authTag, auth_tag_len); kernel_fpu_end(); /* Compare generated tag with passed in tag. */ @@ -1105,19 +1102,12 @@ static int rfc4106_encrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cryptd_aead **ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm = *ctx; - struct aead_request *subreq = aead_request_ctx(req); - aead_request_set_tfm(subreq, irq_fpu_usable() ? 
- cryptd_aead_child(cryptd_tfm) : - &cryptd_tfm->base); + aead_request_set_tfm(req, irq_fpu_usable() ? + cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - aead_request_set_callback(subreq, req->base.flags, - req->base.complete, req->base.data); - aead_request_set_crypt(subreq, req->src, req->dst, - req->cryptlen, req->iv); - aead_request_set_ad(subreq, req->assoclen); - - return crypto_aead_encrypt(subreq); + return crypto_aead_encrypt(req); } static int rfc4106_decrypt(struct aead_request *req) @@ -1125,19 +1115,12 @@ static int rfc4106_decrypt(struct aead_request *req) struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cryptd_aead **ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm = *ctx; - struct aead_request *subreq = aead_request_ctx(req); - - aead_request_set_tfm(subreq, irq_fpu_usable() ? - cryptd_aead_child(cryptd_tfm) : - &cryptd_tfm->base); - aead_request_set_callback(subreq, req->base.flags, - req->base.complete, req->base.data); - aead_request_set_crypt(subreq, req->src, req->dst, - req->cryptlen, req->iv); - aead_request_set_ad(subreq, req->assoclen); + aead_request_set_tfm(req, irq_fpu_usable() ? + cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - return crypto_aead_decrypt(subreq); + return crypto_aead_decrypt(req); } #endif diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S new file mode 100644 index 000000000..16694e625 --- /dev/null +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -0,0 +1,443 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/linkage.h> + +.data +.align 32 + +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +ENTRY(chacha20_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: 8 data blocks output, o + # %rdx: 8 data blocks input, i + + # This function encrypts eight consecutive ChaCha20 blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. 
+ + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + mov %rsp, %r8 + and $~31, %rsp + sub $0x80, %rsp + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-7 + vpaddd %ymm1,%ymm12,%ymm12 + + mov $10,%ecx + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd 
%ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 
0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + 
vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + dec %ecx + jnz .Ldoubleround8 + + # x0..15[0-7] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-7 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + 
vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + vmovdqa 0x00(%rsp),%ymm0 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vperm2i128 $0x20,%ymm5,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm5,%ymm0,%ymm5 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vperm2i128 $0x20,%ymm6,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm6,%ymm0,%ymm6 + vmovdqa %ymm1,0x40(%rsp) + vmovdqa 0x60(%rsp),%ymm0 + vperm2i128 $0x20,%ymm7,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm7,%ymm0,%ymm7 + vmovdqa %ymm1,0x60(%rsp) + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + vmovdqa %ymm0,%ymm8 + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + 
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + vmovdqa %ymm0,%ymm9 + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + vmovdqa %ymm0,%ymm10 + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + vmovdqa %ymm0,%ymm11 + + # xor with corresponding input, write to output + vmovdqa 0x00(%rsp),%ymm0 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vmovdqa 0x20(%rsp),%ymm0 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vmovdqa 0x40(%rsp),%ymm0 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vmovdqa 0x60(%rsp),%ymm0 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vpxor 0x0100(%rdx),%ymm4,%ymm4 + vmovdqu %ymm4,0x0100(%rsi) + vpxor 0x0180(%rdx),%ymm5,%ymm5 + vmovdqu %ymm5,0x00180(%rsi) + vpxor 0x0140(%rdx),%ymm6,%ymm6 + vmovdqu %ymm6,0x0140(%rsi) + vpxor 0x01c0(%rdx),%ymm7,%ymm7 + vmovdqu %ymm7,0x01c0(%rsi) + vpxor 0x0020(%rdx),%ymm8,%ymm8 + vmovdqu %ymm8,0x0020(%rsi) + vpxor 0x00a0(%rdx),%ymm9,%ymm9 + vmovdqu %ymm9,0x00a0(%rsi) + vpxor 0x0060(%rdx),%ymm10,%ymm10 + vmovdqu %ymm10,0x0060(%rsi) + vpxor 0x00e0(%rdx),%ymm11,%ymm11 + vmovdqu %ymm11,0x00e0(%rsi) + vpxor 0x0120(%rdx),%ymm12,%ymm12 + vmovdqu %ymm12,0x0120(%rsi) + vpxor 0x01a0(%rdx),%ymm13,%ymm13 + vmovdqu %ymm13,0x01a0(%rsi) + vpxor 0x0160(%rdx),%ymm14,%ymm14 + vmovdqu %ymm14,0x0160(%rsi) + vpxor 0x01e0(%rdx),%ymm15,%ymm15 + vmovdqu %ymm15,0x01e0(%rsi) + + vzeroupper + mov %r8,%rsp + ret +ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S new file mode 100644 index 000000000..712b13047 --- /dev/null +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -0,0 +1,625 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the 
Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 16 + +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +ENTRY(chacha20_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: 1 data block output, o + # %rdx: 1 data block input, i + + # This function encrypts one ChaCha20 block by loading the state matrix + # in four SSE registers. It performs matrix operation on four words in + # parallel, but requires shuffling to rearrange the words after each + # round. 8/16-bit word rotation is done with the slightly better + # performing SSSE3 byte shuffling, 7/12-bit word rotation uses + # traditional shift+OR. + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + + mov $10,%ecx + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd 
%xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + dec %ecx + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + movdqu 0x00(%rdx),%xmm4 + paddd %xmm8,%xmm0 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + movdqu 0x10(%rdx),%xmm5 + paddd %xmm9,%xmm1 + pxor %xmm5,%xmm1 + movdqu %xmm1,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + movdqu 0x20(%rdx),%xmm6 + paddd %xmm10,%xmm2 + pxor %xmm6,%xmm2 + movdqu %xmm2,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + movdqu 0x30(%rdx),%xmm7 + paddd %xmm11,%xmm3 + pxor %xmm7,%xmm3 + movdqu %xmm3,0x30(%rsi) + + ret +ENDPROC(chacha20_block_xor_ssse3) + +ENTRY(chacha20_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: 4 data blocks output, o + # %rdx: 4 data blocks input, i + + # This function encrypts four consecutive ChaCha20 blocks by loading the + # state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. 
+ + sub $0x40,%rsp + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + mov $10,%ecx + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor 
%xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + 
movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + dec %ecx + 
jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 + paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 
+ punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq %xmm6,%xmm0 + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq %xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + movdqa 0x10(%rsp),%xmm0 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + movdqa 0x20(%rsp),%xmm0 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + movdqa 0x30(%rsp),%xmm0 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm4 + movdqu %xmm4,0x10(%rsi) + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm5 + movdqu %xmm5,0x90(%rsi) + movdqu 
0x50(%rdx),%xmm1 + pxor %xmm1,%xmm6 + movdqu %xmm6,0x50(%rsi) + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm7 + movdqu %xmm7,0xd0(%rsi) + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm8 + movdqu %xmm8,0x20(%rsi) + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm9 + movdqu %xmm9,0xa0(%rsi) + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm10 + movdqu %xmm10,0x60(%rsi) + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm11 + movdqu %xmm11,0xe0(%rsi) + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm12 + movdqu %xmm12,0x30(%rsi) + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm13 + movdqu %xmm13,0xb0(%rsi) + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm14 + movdqu %xmm14,0x70(%rsi) + movdqu 0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm15 + movdqu %xmm15,0xf0(%rsi) + + add $0x40,%rsp + ret +ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c new file mode 100644 index 000000000..effe2160b --- /dev/null +++ b/arch/x86/crypto/chacha20_glue.c @@ -0,0 +1,150 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <crypto/algapi.h> +#include <crypto/chacha20.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/simd.h> + +#define CHACHA20_STATE_ALIGN 16 + +asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +#ifdef CONFIG_AS_AVX2 +asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); +static bool chacha20_use_avx2; +#endif + +static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes) +{ + u8 buf[CHACHA20_BLOCK_SIZE]; + +#ifdef CONFIG_AS_AVX2 + if (chacha20_use_avx2) { + while (bytes >= CHACHA20_BLOCK_SIZE * 8) { + chacha20_8block_xor_avx2(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 8; + src += CHACHA20_BLOCK_SIZE * 8; + dst += CHACHA20_BLOCK_SIZE * 8; + state[12] += 8; + } + } +#endif + while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + chacha20_4block_xor_ssse3(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 4; + src += CHACHA20_BLOCK_SIZE * 4; + dst += CHACHA20_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA20_BLOCK_SIZE) { + chacha20_block_xor_ssse3(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE; + src += CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha20_block_xor_ssse3(state, buf, buf); + memcpy(dst, buf, bytes); + } +} + +static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1]; + struct blkcipher_walk walk; + int err; + + if (!may_use_simd()) + return crypto_chacha20_crypt(desc, dst, src, nbytes); + + state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); + + 
crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); + + kernel_fpu_begin(); + + while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % CHACHA20_BLOCK_SIZE); + } + + if (walk.nbytes) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + + kernel_fpu_end(); + + return err; +} + +static struct crypto_alg alg = { + .cra_name = "chacha20", + .cra_driver_name = "chacha20-simd", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_type = &crypto_blkcipher_type, + .cra_ctxsize = sizeof(struct chacha20_ctx), + .cra_alignmask = sizeof(u32) - 1, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .geniv = "seqiv", + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, + }, + }, +}; + +static int __init chacha20_simd_mod_init(void) +{ + if (!cpu_has_ssse3) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && + cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL); +#endif + return crypto_register_alg(&alg); +} + +static void __exit chacha20_simd_mod_fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(chacha20_simd_mod_init); +module_exit(chacha20_simd_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-simd"); diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S new file mode 100644 index 000000000..eff2f414e --- /dev/null +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -0,0 
+1,386 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 32 + +ANMASK: .octa 0x0000000003ffffff0000000003ffffff + .octa 0x0000000003ffffff0000000003ffffff +ORMASK: .octa 0x00000000010000000000000001000000 + .octa 0x00000000010000000000000001000000 + +.text + +#define h0 0x00(%rdi) +#define h1 0x04(%rdi) +#define h2 0x08(%rdi) +#define h3 0x0c(%rdi) +#define h4 0x10(%rdi) +#define r0 0x00(%rdx) +#define r1 0x04(%rdx) +#define r2 0x08(%rdx) +#define r3 0x0c(%rdx) +#define r4 0x10(%rdx) +#define u0 0x00(%r8) +#define u1 0x04(%r8) +#define u2 0x08(%r8) +#define u3 0x0c(%r8) +#define u4 0x10(%r8) +#define w0 0x14(%r8) +#define w1 0x18(%r8) +#define w2 0x1c(%r8) +#define w3 0x20(%r8) +#define w4 0x24(%r8) +#define y0 0x28(%r8) +#define y1 0x2c(%r8) +#define y2 0x30(%r8) +#define y3 0x34(%r8) +#define y4 0x38(%r8) +#define m %rsi +#define hc0 %ymm0 +#define hc1 %ymm1 +#define hc2 %ymm2 +#define hc3 %ymm3 +#define hc4 %ymm4 +#define hc0x %xmm0 +#define hc1x %xmm1 +#define hc2x %xmm2 +#define hc3x %xmm3 +#define hc4x %xmm4 +#define t1 %ymm5 +#define t2 %ymm6 +#define t1x %xmm5 +#define t2x %xmm6 +#define ruwy0 %ymm7 +#define ruwy1 %ymm8 +#define ruwy2 %ymm9 +#define ruwy3 %ymm10 +#define ruwy4 %ymm11 +#define ruwy0x %xmm7 +#define ruwy1x %xmm8 +#define ruwy2x %xmm9 +#define ruwy3x %xmm10 +#define ruwy4x %xmm11 +#define svxz1 %ymm12 +#define svxz2 %ymm13 +#define svxz3 %ymm14 +#define svxz4 %ymm15 +#define d0 %r9 +#define d1 %r10 +#define d2 %r11 +#define d3 %r12 +#define d4 %r13 + +ENTRY(poly1305_4block_avx2) + # %rdi: Accumulator h[5] + # %rsi: 64 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: 
Quadblock count + # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], + + # This four-block variant uses loop unrolled block processing. It + # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: + # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r + + vzeroupper + push %rbx + push %r12 + push %r13 + + # combine r0,u0,w0,y0 + vmovd y0,ruwy0x + vmovd w0,t1x + vpunpcklqdq t1,ruwy0,ruwy0 + vmovd u0,t1x + vmovd r0,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy0,ruwy0 + + # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 + vmovd y1,ruwy1x + vmovd w1,t1x + vpunpcklqdq t1,ruwy1,ruwy1 + vmovd u1,t1x + vmovd r1,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy1,ruwy1 + vpslld $2,ruwy1,svxz1 + vpaddd ruwy1,svxz1,svxz1 + + # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 + vmovd y2,ruwy2x + vmovd w2,t1x + vpunpcklqdq t1,ruwy2,ruwy2 + vmovd u2,t1x + vmovd r2,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy2,ruwy2 + vpslld $2,ruwy2,svxz2 + vpaddd ruwy2,svxz2,svxz2 + + # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 + vmovd y3,ruwy3x + vmovd w3,t1x + vpunpcklqdq t1,ruwy3,ruwy3 + vmovd u3,t1x + vmovd r3,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy3,ruwy3 + vpslld $2,ruwy3,svxz3 + vpaddd ruwy3,svxz3,svxz3 + + # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 + vmovd y4,ruwy4x + vmovd w4,t1x + vpunpcklqdq t1,ruwy4,ruwy4 + vmovd u4,t1x + vmovd r4,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy4,ruwy4 + vpslld $2,ruwy4,svxz4 + vpaddd ruwy4,svxz4,svxz4 + +.Ldoblock4: + # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, + # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] + vmovd 0x00(m),hc0x + vmovd 0x10(m),t1x + vpunpcklqdq t1,hc0,hc0 + vmovd 0x20(m),t1x + vmovd 0x30(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc0,hc0 + vpand ANMASK(%rip),hc0,hc0 + vmovd h0,t1x + vpaddd t1,hc0,hc0 + # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, + # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 
6] >> 2) & 0x3ffffff + h1] + vmovd 0x03(m),hc1x + vmovd 0x13(m),t1x + vpunpcklqdq t1,hc1,hc1 + vmovd 0x23(m),t1x + vmovd 0x33(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc1,hc1 + vpsrld $2,hc1,hc1 + vpand ANMASK(%rip),hc1,hc1 + vmovd h1,t1x + vpaddd t1,hc1,hc1 + # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, + # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] + vmovd 0x06(m),hc2x + vmovd 0x16(m),t1x + vpunpcklqdq t1,hc2,hc2 + vmovd 0x26(m),t1x + vmovd 0x36(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc2,hc2 + vpsrld $4,hc2,hc2 + vpand ANMASK(%rip),hc2,hc2 + vmovd h2,t1x + vpaddd t1,hc2,hc2 + # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, + # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] + vmovd 0x09(m),hc3x + vmovd 0x19(m),t1x + vpunpcklqdq t1,hc3,hc3 + vmovd 0x29(m),t1x + vmovd 0x39(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc3,hc3 + vpsrld $6,hc3,hc3 + vpand ANMASK(%rip),hc3,hc3 + vmovd h3,t1x + vpaddd t1,hc3,hc3 + # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), + # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] + vmovd 0x0c(m),hc4x + vmovd 0x1c(m),t1x + vpunpcklqdq t1,hc4,hc4 + vmovd 0x2c(m),t1x + vmovd 0x3c(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc4,hc4 + vpsrld $8,hc4,hc4 + vpor ORMASK(%rip),hc4,hc4 + vmovd h4,t1x + vpaddd t1,hc4,hc4 + + # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] + vpmuludq hc0,ruwy0,t1 + # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] + vpmuludq hc1,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] + vpmuludq hc2,svxz3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] + vpmuludq hc3,svxz2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] + vpmuludq hc4,svxz1,t2 + vpaddq t2,t1,t1 + # d0 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + 
vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d0 + + # t1 = [ hc0[3] * r1, hc0[2] * u1, hc0[1] * w1, hc0[0] * y1 ] + vpmuludq hc0,ruwy1,t1 + # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] + vpmuludq hc1,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] + vpmuludq hc2,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] + vpmuludq hc3,svxz3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] + vpmuludq hc4,svxz2,t2 + vpaddq t2,t1,t1 + # d1 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d1 + + # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] + vpmuludq hc0,ruwy2,t1 + # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] + vpmuludq hc1,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] + vpmuludq hc2,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] + vpmuludq hc3,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] + vpmuludq hc4,svxz3,t2 + vpaddq t2,t1,t1 + # d2 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d2 + + # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] + vpmuludq hc0,ruwy3,t1 + # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] + vpmuludq hc1,ruwy2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] + vpmuludq hc2,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] + vpmuludq hc3,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] + vpmuludq hc4,svxz4,t2 + vpaddq t2,t1,t1 + # d3 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d3 + + # 
t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] + vpmuludq hc0,ruwy4,t1 + # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] + vpmuludq hc1,ruwy3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] + vpmuludq hc2,ruwy2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] + vpmuludq hc3,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] + vpmuludq hc4,ruwy0,t2 + vpaddq t2,t1,t1 + # d4 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x40,m + dec %rcx + jnz .Ldoblock4 + + vzeroupper + pop %r13 + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_4block_avx2) diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S new file mode 100644 index 000000000..338c74805 --- /dev/null +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -0,0 +1,582 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it 
and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 16 + +ANMASK: .octa 0x0000000003ffffff0000000003ffffff +ORMASK: .octa 0x00000000010000000000000001000000 + +.text + +#define h0 0x00(%rdi) +#define h1 0x04(%rdi) +#define h2 0x08(%rdi) +#define h3 0x0c(%rdi) +#define h4 0x10(%rdi) +#define r0 0x00(%rdx) +#define r1 0x04(%rdx) +#define r2 0x08(%rdx) +#define r3 0x0c(%rdx) +#define r4 0x10(%rdx) +#define s1 0x00(%rsp) +#define s2 0x04(%rsp) +#define s3 0x08(%rsp) +#define s4 0x0c(%rsp) +#define m %rsi +#define h01 %xmm0 +#define h23 %xmm1 +#define h44 %xmm2 +#define t1 %xmm3 +#define t2 %xmm4 +#define t3 %xmm5 +#define t4 %xmm6 +#define mask %xmm7 +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define d4 %r12 + +ENTRY(poly1305_block_sse2) + # %rdi: Accumulator h[5] + # %rsi: 16 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Block count + + # This single block variant tries to improve performance by doing two + # multiplications in parallel using SSE instructions. There is quite + # some quadword packing involved, hence the speedup is marginal. 
+ + push %rbx + push %r12 + sub $0x10,%rsp + + # s1..s4 = r1..r4 * 5 + mov r1,%eax + lea (%eax,%eax,4),%eax + mov %eax,s1 + mov r2,%eax + lea (%eax,%eax,4),%eax + mov %eax,s2 + mov r3,%eax + lea (%eax,%eax,4),%eax + mov %eax,s3 + mov r4,%eax + lea (%eax,%eax,4),%eax + mov %eax,s4 + + movdqa ANMASK(%rip),mask + +.Ldoblock: + # h01 = [0, h1, 0, h0] + # h23 = [0, h3, 0, h2] + # h44 = [0, h4, 0, h4] + movd h0,h01 + movd h1,t1 + movd h2,h23 + movd h3,t2 + movd h4,h44 + punpcklqdq t1,h01 + punpcklqdq t2,h23 + punpcklqdq h44,h44 + + # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] + movd 0x00(m),t1 + movd 0x03(m),t2 + psrld $2,t2 + punpcklqdq t2,t1 + pand mask,t1 + paddd t1,h01 + # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] + movd 0x06(m),t1 + movd 0x09(m),t2 + psrld $4,t1 + psrld $6,t2 + punpcklqdq t2,t1 + pand mask,t1 + paddd t1,h23 + # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] + mov 0x0c(m),%eax + shr $8,%eax + or $0x01000000,%eax + movd %eax,t1 + pshufd $0xc4,t1,t1 + paddd t1,h44 + + # t1[0] = h0 * r0 + h2 * s3 + # t1[1] = h1 * s4 + h3 * s2 + movd r0,t1 + movd s4,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd s3,t2 + movd s2,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t2[0] = h0 * r1 + h2 * s4 + # t2[1] = h1 * r0 + h3 * s3 + movd r1,t2 + movd r0,t3 + punpcklqdq t3,t2 + pmuludq h01,t2 + movd s4,t3 + movd s3,t4 + punpcklqdq t4,t3 + pmuludq h23,t3 + paddq t3,t2 + # t3[0] = h4 * s1 + # t3[1] = h4 * s2 + movd s1,t3 + movd s2,t4 + punpcklqdq t4,t3 + pmuludq h44,t3 + # d0 = t1[0] + t1[1] + t3[0] + # d1 = t2[0] + t2[1] + t3[1] + movdqa t1,t4 + punpcklqdq t2,t4 + punpckhqdq t2,t1 + paddq t4,t1 + paddq t3,t1 + movq t1,d0 + psrldq $8,t1 + movq t1,d1 + + # t1[0] = h0 * r2 + h2 * r0 + # t1[1] = h1 * r1 + h3 * s4 + movd r2,t1 + movd r1,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd r0,t2 + movd s4,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t2[0] = h0 * r3 + h2 * r1 + # t2[1] = h1 * r2 + h3 * r0 + 
movd r3,t2 + movd r2,t3 + punpcklqdq t3,t2 + pmuludq h01,t2 + movd r1,t3 + movd r0,t4 + punpcklqdq t4,t3 + pmuludq h23,t3 + paddq t3,t2 + # t3[0] = h4 * s3 + # t3[1] = h4 * s4 + movd s3,t3 + movd s4,t4 + punpcklqdq t4,t3 + pmuludq h44,t3 + # d2 = t1[0] + t1[1] + t3[0] + # d3 = t2[0] + t2[1] + t3[1] + movdqa t1,t4 + punpcklqdq t2,t4 + punpckhqdq t2,t1 + paddq t4,t1 + paddq t3,t1 + movq t1,d2 + psrldq $8,t1 + movq t1,d3 + + # t1[0] = h0 * r4 + h2 * r2 + # t1[1] = h1 * r3 + h3 * r1 + movd r4,t1 + movd r3,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd r2,t2 + movd r1,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t3[0] = h4 * r0 + movd r0,t3 + pmuludq h44,t3 + # d4 = t1[0] + t1[1] + t3[0] + movdqa t1,t4 + psrldq $8,t4 + paddq t4,t1 + paddq t3,t1 + movq t1,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x10,m + dec %rcx + jnz .Ldoblock + + add $0x10,%rsp + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_block_sse2) + + +#define u0 0x00(%r8) +#define u1 0x04(%r8) +#define u2 0x08(%r8) +#define u3 0x0c(%r8) +#define u4 0x10(%r8) +#define hc0 %xmm0 +#define hc1 %xmm1 +#define hc2 %xmm2 +#define hc3 %xmm5 +#define hc4 %xmm6 +#define ru0 %xmm7 +#define ru1 %xmm8 +#define ru2 %xmm9 
+#define ru3 %xmm10 +#define ru4 %xmm11 +#define sv1 %xmm12 +#define sv2 %xmm13 +#define sv3 %xmm14 +#define sv4 %xmm15 +#undef d0 +#define d0 %r13 + +ENTRY(poly1305_2block_sse2) + # %rdi: Accumulator h[5] + # %rsi: 16 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Doubleblock count + # %r8: Poly1305 derived key r^2 u[5] + + # This two-block variant further improves performance by using loop + # unrolled block processing. This is more straight forward and does + # less byte shuffling, but requires a second Poly1305 key r^2: + # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r + + push %rbx + push %r12 + push %r13 + + # combine r0,u0 + movd u0,ru0 + movd r0,t1 + punpcklqdq t1,ru0 + + # combine r1,u1 and s1=r1*5,v1=u1*5 + movd u1,ru1 + movd r1,t1 + punpcklqdq t1,ru1 + movdqa ru1,sv1 + pslld $2,sv1 + paddd ru1,sv1 + + # combine r2,u2 and s2=r2*5,v2=u2*5 + movd u2,ru2 + movd r2,t1 + punpcklqdq t1,ru2 + movdqa ru2,sv2 + pslld $2,sv2 + paddd ru2,sv2 + + # combine r3,u3 and s3=r3*5,v3=u3*5 + movd u3,ru3 + movd r3,t1 + punpcklqdq t1,ru3 + movdqa ru3,sv3 + pslld $2,sv3 + paddd ru3,sv3 + + # combine r4,u4 and s4=r4*5,v4=u4*5 + movd u4,ru4 + movd r4,t1 + punpcklqdq t1,ru4 + movdqa ru4,sv4 + pslld $2,sv4 + paddd ru4,sv4 + +.Ldoblock2: + # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] + movd 0x00(m),hc0 + movd 0x10(m),t1 + punpcklqdq t1,hc0 + pand ANMASK(%rip),hc0 + movd h0,t1 + paddd t1,hc0 + # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] + movd 0x03(m),hc1 + movd 0x13(m),t1 + punpcklqdq t1,hc1 + psrld $2,hc1 + pand ANMASK(%rip),hc1 + movd h1,t1 + paddd t1,hc1 + # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] + movd 0x06(m),hc2 + movd 0x16(m),t1 + punpcklqdq t1,hc2 + psrld $4,hc2 + pand ANMASK(%rip),hc2 + movd h2,t1 + paddd t1,hc2 + # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] + movd 0x09(m),hc3 + movd 0x19(m),t1 + punpcklqdq t1,hc3 + psrld $6,hc3 + pand ANMASK(%rip),hc3 + movd h3,t1 
+ paddd t1,hc3 + # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] + movd 0x0c(m),hc4 + movd 0x1c(m),t1 + punpcklqdq t1,hc4 + psrld $8,hc4 + por ORMASK(%rip),hc4 + movd h4,t1 + paddd t1,hc4 + + # t1 = [ hc0[1] * r0, hc0[0] * u0 ] + movdqa ru0,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * s4, hc1[0] * v4 ] + movdqa sv4,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * s3, hc2[0] * v3 ] + movdqa sv3,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s2, hc3[0] * v2 ] + movdqa sv2,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s1, hc4[0] * v1 ] + movdqa sv1,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d0 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d0 + + # t1 = [ hc0[1] * r1, hc0[0] * u1 ] + movdqa ru1,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r0, hc1[0] * u0 ] + movdqa ru0,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * s4, hc2[0] * v4 ] + movdqa sv4,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s3, hc3[0] * v3 ] + movdqa sv3,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s2, hc4[0] * v2 ] + movdqa sv2,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d1 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d1 + + # t1 = [ hc0[1] * r2, hc0[0] * u2 ] + movdqa ru2,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r1, hc1[0] * u1 ] + movdqa ru1,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r0, hc2[0] * u0 ] + movdqa ru0,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s4, hc3[0] * v4 ] + movdqa sv4,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s3, hc4[0] * v3 ] + movdqa sv3,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d2 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d2 + + # t1 = [ hc0[1] * r3, hc0[0] * u3 ] + movdqa ru3,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r2, hc1[0] * u2 ] + movdqa ru2,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r1, hc2[0] * u1 ] + movdqa ru1,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * r0, hc3[0] * u0 ] + movdqa 
ru0,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s4, hc4[0] * v4 ] + movdqa sv4,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d3 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d3 + + # t1 = [ hc0[1] * r4, hc0[0] * u4 ] + movdqa ru4,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r3, hc1[0] * u3 ] + movdqa ru3,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r2, hc2[0] * u2 ] + movdqa ru2,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * r1, hc3[0] * u1 ] + movdqa ru1,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * r0, hc4[0] * u0 ] + movdqa ru0,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d4 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x20,m + dec %rcx + jnz .Ldoblock2 + + pop %r13 + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_2block_sse2) diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c new file mode 100644 index 000000000..f7170d764 --- /dev/null +++ b/arch/x86/crypto/poly1305_glue.c @@ -0,0 +1,207 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you 
can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/poly1305.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/simd.h> + +struct poly1305_simd_desc_ctx { + struct poly1305_desc_ctx base; + /* derived key u set? */ + bool uset; +#ifdef CONFIG_AS_AVX2 + /* derived keys r^3, r^4 set? */ + bool wset; +#endif + /* derived Poly1305 key r^2 */ + u32 u[5]; + /* ... silently appended r^3 and r^4 when using AVX2 */ +}; + +asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, + const u32 *r, unsigned int blocks); +asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, + unsigned int blocks, const u32 *u); +#ifdef CONFIG_AS_AVX2 +asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, + unsigned int blocks, const u32 *u); +static bool poly1305_use_avx2; +#endif + +static int poly1305_simd_init(struct shash_desc *desc) +{ + struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc); + + sctx->uset = false; +#ifdef CONFIG_AS_AVX2 + sctx->wset = false; +#endif + + return crypto_poly1305_init(desc); +} + +static void poly1305_simd_mult(u32 *a, const u32 *b) +{ + u8 m[POLY1305_BLOCK_SIZE]; + + memset(m, 0, sizeof(m)); + /* The poly1305 block function adds a hi-bit to the accumulator which + * we don't need for key multiplication; compensate for it. 
*/ + a[4] -= 1 << 24; + poly1305_block_sse2(a, m, b, 1); +} + +static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + struct poly1305_simd_desc_ctx *sctx; + unsigned int blocks, datalen; + + BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base)); + sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base); + + if (unlikely(!dctx->sset)) { + datalen = crypto_poly1305_setdesckey(dctx, src, srclen); + src += srclen - datalen; + srclen = datalen; + } + +#ifdef CONFIG_AS_AVX2 + if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { + if (unlikely(!sctx->wset)) { + if (!sctx->uset) { + memcpy(sctx->u, dctx->r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r); + sctx->uset = true; + } + memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u + 5, dctx->r); + memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u + 10, dctx->r); + sctx->wset = true; + } + blocks = srclen / (POLY1305_BLOCK_SIZE * 4); + poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u); + src += POLY1305_BLOCK_SIZE * 4 * blocks; + srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; + } +#endif + if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { + if (unlikely(!sctx->uset)) { + memcpy(sctx->u, dctx->r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r); + sctx->uset = true; + } + blocks = srclen / (POLY1305_BLOCK_SIZE * 2); + poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u); + src += POLY1305_BLOCK_SIZE * 2 * blocks; + srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + poly1305_block_sse2(dctx->h, src, dctx->r, 1); + srclen -= POLY1305_BLOCK_SIZE; + } + return srclen; +} + +static int poly1305_simd_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + unsigned int bytes; + + /* kernel_fpu_begin/end is costly, use fallback for small updates */ + 
if (srclen <= 288 || !may_use_simd()) + return crypto_poly1305_update(desc, src, srclen); + + kernel_fpu_begin(); + + if (unlikely(dctx->buflen)) { + bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + srclen -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + poly1305_simd_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + dctx->buflen = 0; + } + } + + if (likely(srclen >= POLY1305_BLOCK_SIZE)) { + bytes = poly1305_simd_blocks(dctx, src, srclen); + src += srclen - bytes; + srclen = bytes; + } + + kernel_fpu_end(); + + if (unlikely(srclen)) { + dctx->buflen = srclen; + memcpy(dctx->buf, src, srclen); + } + + return 0; +} + +static struct shash_alg alg = { + .digestsize = POLY1305_DIGEST_SIZE, + .init = poly1305_simd_init, + .update = poly1305_simd_update, + .final = crypto_poly1305_final, + .setkey = crypto_poly1305_setkey, + .descsize = sizeof(struct poly1305_simd_desc_ctx), + .base = { + .cra_name = "poly1305", + .cra_driver_name = "poly1305-simd", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_alignmask = sizeof(u32) - 1, + .cra_blocksize = POLY1305_BLOCK_SIZE, + .cra_module = THIS_MODULE, + }, +}; + +static int __init poly1305_simd_mod_init(void) +{ + if (!cpu_has_xmm2) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && + cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL); + alg.descsize = sizeof(struct poly1305_simd_desc_ctx); + if (poly1305_use_avx2) + alg.descsize += 10 * sizeof(u32); +#endif + return crypto_register_shash(&alg); +} + +static void __exit poly1305_simd_mod_exit(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(poly1305_simd_mod_init); +module_exit(poly1305_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("Poly1305 authenticator"); +MODULE_ALIAS_CRYPTO("poly1305"); +MODULE_ALIAS_CRYPTO("poly1305-simd"); 
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 7a144971d..bd55dedd7 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -2,6 +2,7 @@ # Makefile for the x86 low level entry code # obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o +obj-y += common.o obj-y += vdso/ obj-y += vsyscall/ diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index f4e6308c4..3c71dd947 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -135,9 +135,6 @@ For 32-bit we have the following conventions - kernel is built with movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) .endm - .macro SAVE_EXTRA_REGS_RBP offset=0 - movq %rbp, 4*8+\offset(%rsp) - .endm .macro RESTORE_EXTRA_REGS offset=0 movq 0*8+\offset(%rsp), %r15 @@ -193,12 +190,6 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_EXCEPT_RCX_R11 RESTORE_C_REGS_HELPER 1,0,0,1,1 .endm - .macro RESTORE_RSI_RDI - RESTORE_C_REGS_HELPER 0,0,0,0,0 - .endm - .macro RESTORE_RSI_RDI_RDX - RESTORE_C_REGS_HELPER 0,0,0,0,1 - .endm .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 subq $-(15*8+\addskip), %rsp diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c new file mode 100644 index 000000000..80dcc9261 --- /dev/null +++ b/arch/x86/entry/common.c @@ -0,0 +1,318 @@ +/* + * common.c - C code for kernel entry and exit + * Copyright (c) 2015 Andrew Lutomirski + * GPL v2 + * + * Based on asm and ptrace code by many authors. The code here originated + * in ptrace.c and signal.c. 
+ */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/tracehook.h> +#include <linux/audit.h> +#include <linux/seccomp.h> +#include <linux/signal.h> +#include <linux/export.h> +#include <linux/context_tracking.h> +#include <linux/user-return-notifier.h> +#include <linux/uprobes.h> + +#include <asm/desc.h> +#include <asm/traps.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +#ifdef CONFIG_CONTEXT_TRACKING +/* Called on entry from user mode with IRQs off. */ +__visible void enter_from_user_mode(void) +{ + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit(); +} +#endif + +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else +#endif + { + audit_syscall_entry(regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} + +/* + * We can return 0 to resume the syscall or anything else to go to phase + * 2. If we resume the syscall, we need to put something appropriate in + * regs->orig_ax. + * + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax + * are fully functional. + * + * For phase 2's benefit, our return value is: + * 0: resume the syscall + * 1: go to phase 2; no seccomp phase 2 needed + * anything else: go to phase 2; pass return value to seccomp + */ +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) +{ + unsigned long ret = 0; + u32 work; + + BUG_ON(regs != task_pt_regs(current)); + + work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + +#ifdef CONFIG_CONTEXT_TRACKING + /* + * If TIF_NOHZ is set, we are required to call user_exit() before + * doing anything that could touch RCU. 
+ */ + if (work & _TIF_NOHZ) { + enter_from_user_mode(); + work &= ~_TIF_NOHZ; + } +#endif + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp first -- it should minimize exposure of other + * code, and keeping seccomp fast is probably more valuable + * than the rest of this. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else +#endif + { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); + + ret = seccomp_phase1(&sd); + if (ret == SECCOMP_PHASE1_SKIP) { + regs->orig_ax = -1; + ret = 0; + } else if (ret != SECCOMP_PHASE1_OK) { + return ret; /* Go directly to phase 2 */ + } + + work &= ~_TIF_SECCOMP; + } +#endif + + /* Do our best to finish without phase 2. */ + if (work == 0) + return ret; /* seccomp and/or nohz only (ret == 0 here) */ + +#ifdef CONFIG_AUDITSYSCALL + if (work == _TIF_SYSCALL_AUDIT) { + /* + * If there is no more work to be done except auditing, + * then audit in phase 1. Phase 2 always audits, so, if + * we audit here, then we can't go on to phase 2. + */ + do_audit_syscall_entry(regs, arch); + return 0; + } +#endif + + return 1; /* Something is enabled that we can't handle in phase 1 */ +} + +/* Returns the syscall nr to run (which should match regs->orig_ax). 
*/ +long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, + unsigned long phase1_result) +{ + long ret = 0; + u32 work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + + BUG_ON(regs != task_pt_regs(current)); + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (work & _TIF_SINGLESTEP) + regs->flags |= X86_EFLAGS_TF; + +#ifdef CONFIG_SECCOMP + /* + * Call seccomp_phase2 before running the other hooks so that + * they can see any changes made by a seccomp tracer. + */ + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { + /* seccomp failures shouldn't expose any additional code. */ + return -1; + } +#endif + + if (unlikely(work & _TIF_SYSCALL_EMU)) + ret = -1L; + + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + ret = -1L; + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); + + do_audit_syscall_entry(regs, arch); + + return ret ?: regs->orig_ax; +} + +long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); + + if (phase1_result == 0) + return regs->orig_ax; + else + return syscall_trace_enter_phase2(regs, arch, phase1_result); +} + +static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) +{ + unsigned long top_of_stack = + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; + return (struct thread_info *)(top_of_stack - THREAD_SIZE); +} + +/* Called with IRQs disabled. 
*/ +__visible void prepare_exit_to_usermode(struct pt_regs *regs) +{ + if (WARN_ON(!irqs_disabled())) + local_irq_disable(); + + /* + * In order to return to user mode, we need to have IRQs off with + * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, + * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags + * can be set at any time on preemptable kernels if we have IRQs on, + * so we need to loop. Disabling preemption wouldn't help: doing the + * work to clear some of the flags can sleep. + */ + while (true) { + u32 cached_flags = + READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | + _TIF_UPROBE | _TIF_NEED_RESCHED | + _TIF_USER_RETURN_NOTIFY))) + break; + + /* We have work to do. */ + local_irq_enable(); + + if (cached_flags & _TIF_NEED_RESCHED) + schedule(); + + if (cached_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + /* deal with pending signal delivery */ + if (cached_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (cached_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + + if (cached_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + /* Disable IRQs and retry */ + local_irq_disable(); + } + + user_enter(); +} + +/* + * Called with IRQs on and fully valid regs. Returns with IRQs off in a + * state such that we can immediately switch to user mode. + */ +__visible void syscall_return_slowpath(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + u32 cached_flags = READ_ONCE(ti->flags); + bool step; + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled", + regs->orig_ax)) + local_irq_enable(); + + /* + * First do one-time work. If these work items are enabled, we + * want to run them exactly once per syscall exit with IRQs on. 
+ */ + if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) { + audit_syscall_exit(regs); + + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely( + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) + == _TIF_SINGLESTEP); + if (step || cached_flags & _TIF_SYSCALL_TRACE) + tracehook_report_syscall_exit(regs, step); + } + +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. + */ + ti->status &= ~TS_COMPAT; +#endif + + local_irq_disable(); + prepare_exit_to_usermode(regs); +} diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 21dc60a60..b2909bf8c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -45,16 +45,6 @@ #include <asm/asm.h> #include <asm/smap.h> -/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ -#include <linux/elf-em.h> -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -# define sysenter_audit syscall_trace_entry -# define sysexit_audit syscall_exit_work -#endif - .section .entry.text, "ax" /* @@ -266,14 +256,10 @@ ret_from_intr: ENTRY(resume_userspace) LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? 
- jne work_pending + movl %esp, %eax + call prepare_exit_to_usermode jmp restore_all END(ret_from_exception) @@ -339,7 +325,7 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) - jnz sysenter_audit + jnz syscall_trace_entry sysenter_do_call: cmpl $(NR_syscalls), %eax jae sysenter_badsys @@ -351,7 +337,7 @@ sysenter_after_call: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit + jnz syscall_exit_work_irqs_off sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx @@ -362,40 +348,6 @@ sysenter_exit: PTGS_TO_GS ENABLE_INTERRUPTS_SYSEXIT -#ifdef CONFIG_AUDITSYSCALL -sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl PT_ESI(%esp) /* a3: 5th arg */ - pushl PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl %ecx /* get that remapped edx off the stack */ - popl %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp), %eax /* reload syscall number */ - jmp sysenter_do_call - -sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax, %edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO, %eax /* is it an error ? 
*/ - setbe %al /* 1 if so, 0 if not */ - movzbl %al, %eax /* zero-extend that */ - call __audit_syscall_exit - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp), %eax /* reload syscall return value */ - jmp sysenter_exit -#endif - .pushsection .fixup, "ax" 2: movl $0, PT_FS(%esp) jmp 1b @@ -421,13 +373,7 @@ syscall_after_call: movl %eax, PT_EAX(%esp) # store the return value syscall_exit: LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work + jmp syscall_exit_work restore_all: TRACE_IRQS_IRET @@ -504,57 +450,6 @@ ldt_ss: #endif ENDPROC(entry_INT80_32) - # perform work that needs to be done immediately before resumption - ALIGN -work_pending: - testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig -work_resched: - call schedule - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? 
- jz restore_all - testb $_TIF_NEED_RESCHED, %cl - jnz work_resched - -work_notifysig: # deal with pending signals and - # notify-resume requests -#ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space -1: -#else - movl %esp, %eax -#endif - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl - jb resume_kernel - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace - -#ifdef CONFIG_VM86 - ALIGN -work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl %ecx - movl %eax, %esp - jmp 1b -#endif -END(work_pending) - # perform syscall exit tracing ALIGN syscall_trace_entry: @@ -569,15 +464,14 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN -syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending +syscall_exit_work_irqs_off: TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead + ENABLE_INTERRUPTS(CLBR_ANY) + +syscall_exit_work: movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace + call syscall_return_slowpath + jmp restore_all END(syscall_exit_work) syscall_fault: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d330840a2..055a01de7 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -33,7 +33,6 @@ #include <asm/paravirt.h> #include <asm/percpu.h> #include <asm/asm.h> -#include <asm/context_tracking.h> #include <asm/smap.h> #include <asm/pgtable_types.h> #include <linux/err.h> @@ -229,6 +228,11 @@ entry_SYSCALL_64_fastpath: */ USERGS_SYSRET64 +GLOBAL(int_ret_from_sys_call_irqs_off) + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + jmp int_ret_from_sys_call + /* Do syscall entry tracing */ tracesys: movq %rsp, %rdi @@ -272,69 +276,11 @@ tracesys_phase2: * Has correct iret frame. 
*/ GLOBAL(int_ret_from_sys_call) - DISABLE_INTERRUPTS(CLBR_NONE) -int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ - TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK, %edi - /* edi: mask to check */ -GLOBAL(int_with_check) - LOCKDEP_SYS_EXIT_IRQ - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx), %edx - andl %edi, %edx - jnz int_careful - andl $~TS_COMPAT, TI_status(%rcx) - jmp syscall_return - - /* - * Either reschedule or signal or syscall exit tracking needed. - * First do a reschedule test. - * edx: work, edi: workmask - */ -int_careful: - bt $TIF_NEED_RESCHED, %edx - jnc int_very_careful - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - - /* handle signals and tracing -- both require a full pt_regs */ -int_very_careful: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS - /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT, %edx - jz int_signal - pushq %rdi - leaq 8(%rsp), %rdi /* &ptregs -> arg1 */ - call syscall_trace_leave - popq %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi - jmp int_restore_rest - -int_signal: - testl $_TIF_DO_NOTIFY_MASK, %edx - jz 1f - movq %rsp, %rdi /* &ptregs -> arg1 */ - xorl %esi, %esi /* oldset -> arg2 */ - call do_notify_resume -1: movl $_TIF_WORK_MASK, %edi -int_restore_rest: + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - -syscall_return: - /* The IRETQ could re-enable interrupts: */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ + TRACE_IRQS_IRETQ /* we're about to change IF */ /* * Try to use SYSRET instead of IRET if we're returning to @@ -555,23 +501,22 @@ END(irq_entries_start) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func cld - /* - * Since nothing in interrupt handling code touches r12...r15 members - * of 
"struct pt_regs", and since interrupts can nest, we can save - * four stack slots and simultaneously provide - * an unwind-friendly stack layout by saving "truncated" pt_regs - * exactly up to rbp slot, without these members. - */ - ALLOC_PT_GPREGS_ON_STACK -RBP - SAVE_C_REGS -RBP - /* this goes to 0(%rsp) for unwinder, not for saving the value: */ - SAVE_EXTRA_REGS_RBP -RBP - - leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */ + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS - testb $3, CS-RBP(%rsp) + testb $3, CS(%rsp) jz 1f + + /* + * IRQ from user mode. Switch to kernel gsbase and inform context + * tracking that we're in kernel mode. + */ SWAPGS +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif + 1: /* * Save previous stack pointer, optionally switch to interrupt stack. @@ -580,14 +525,14 @@ END(irq_entries_start) * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ - movq %rsp, %rsi + movq %rsp, %rdi incl PER_CPU_VAR(irq_count) cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rsi + pushq %rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF - call \func + call \func /* rdi points to pt_regs */ .endm /* @@ -606,34 +551,19 @@ ret_from_intr: decl PER_CPU_VAR(irq_count) /* Restore saved previous stack */ - popq %rsi - /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi), %rsp + popq %rsp testb $3, CS(%rsp) jz retint_kernel - /* Interrupt came from user space */ -retint_user: - GET_THREAD_INFO(%rcx) - /* %rcx: thread info. Interrupts are off. 
*/ -retint_with_reschedule: - movl $_TIF_WORK_MASK, %edi -retint_check: + /* Interrupt came from user space */ LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx), %edx - andl %edi, %edx - jnz retint_careful - -retint_swapgs: /* return to user-space */ - /* - * The iretq could re-enable interrupts: - */ - DISABLE_INTERRUPTS(CLBR_ANY) +GLOBAL(retint_user) + mov %rsp,%rdi + call prepare_exit_to_usermode TRACE_IRQS_IRETQ - SWAPGS - jmp restore_c_regs_and_iret + jmp restore_regs_and_iret /* Returning to kernel space */ retint_kernel: @@ -657,6 +587,8 @@ retint_kernel: * At this label, code paths which return to kernel and to user, * which come from interrupts/exception and from syscalls, merge. */ +restore_regs_and_iret: + RESTORE_EXTRA_REGS restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 @@ -707,37 +639,6 @@ native_irq_return_ldt: popq %rax jmp native_irq_return_iret #endif - - /* edi: workmask, edx: work */ -retint_careful: - bt $TIF_NEED_RESCHED, %edx - jnc retint_signal - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - GET_THREAD_INFO(%rcx) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp retint_check - -retint_signal: - testl $_TIF_DO_NOTIFY_MASK, %edx - jz retint_swapgs - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS - movq $-1, ORIG_RAX(%rsp) - xorl %esi, %esi /* oldset */ - movq %rsp, %rdi /* &pt_regs */ - call do_notify_resume - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule - END(common_interrupt) /* @@ -1143,12 +1044,22 @@ ENTRY(error_entry) SAVE_EXTRA_REGS 8 xorl %ebx, %ebx testb $3, CS+8(%rsp) - jz error_kernelspace + jz .Lerror_kernelspace - /* We entered from user mode */ +.Lerror_entry_from_usermode_swapgs: + /* + * We entered from user mode or we're pretending to have entered + * from user mode due to an IRET fault. 
+ */ SWAPGS -error_entry_done: +.Lerror_entry_from_usermode_after_swapgs: +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif + +.Lerror_entry_done: + TRACE_IRQS_OFF ret @@ -1158,31 +1069,30 @@ error_entry_done: * truncated RIP for IRET exceptions returning to compat mode. Check * for these here too. */ -error_kernelspace: +.Lerror_kernelspace: incl %ebx leaq native_irq_return_iret(%rip), %rcx cmpq %rcx, RIP+8(%rsp) - je error_bad_iret + je .Lerror_bad_iret movl %ecx, %eax /* zero extend */ cmpq %rax, RIP+8(%rsp) - je bstep_iret + je .Lbstep_iret cmpq $gs_change, RIP+8(%rsp) - jne error_entry_done + jne .Lerror_entry_done /* * hack: gs_change can fail with user gsbase. If this happens, fix up * gsbase and proceed. We'll fix up the exception and land in * gs_change's error handler with kernel gsbase. */ - SWAPGS - jmp error_entry_done + jmp .Lerror_entry_from_usermode_swapgs -bstep_iret: +.Lbstep_iret: /* Fix truncated RIP */ movq %rcx, RIP+8(%rsp) /* fall through */ -error_bad_iret: +.Lerror_bad_iret: /* * We came from an IRET to user mode, so we have user gsbase. 
* Switch to kernel gsbase: @@ -1198,7 +1108,7 @@ error_bad_iret: call fixup_bad_iret mov %rax, %rsp decl %ebx - jmp error_entry_done + jmp .Lerror_entry_from_usermode_after_swapgs END(error_entry) @@ -1209,7 +1119,6 @@ END(error_entry) */ ENTRY(error_exit) movl %ebx, %eax - RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl %eax, %eax diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index a7e257d9c..a9360d40f 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -22,8 +22,8 @@ #define __AUDIT_ARCH_LE 0x40000000 #ifndef CONFIG_AUDITSYSCALL -# define sysexit_audit ia32_ret_from_sys_call -# define sysretl_audit ia32_ret_from_sys_call +# define sysexit_audit ia32_ret_from_sys_call_irqs_off +# define sysretl_audit ia32_ret_from_sys_call_irqs_off #endif .section .entry.text, "ax" @@ -141,7 +141,8 @@ sysexit_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RIP(%rsp), %ecx /* User %eip */ movq RAX(%rsp), %rax - RESTORE_RSI_RDI + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi xorl %edx, %edx /* Do not leak kernel information */ xorq %r8, %r8 xorq %r9, %r9 @@ -209,10 +210,10 @@ sysexit_from_sys_call: .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call movl %eax, %esi /* second arg, syscall return value */ cmpl $-MAX_ERRNO, %eax /* is it an error ? 
*/ jbe 1f @@ -230,7 +231,7 @@ sysexit_from_sys_call: movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - jmp int_with_check + jmp int_ret_from_sys_call_irqs_off .endm sysenter_auditsys: @@ -365,7 +366,9 @@ cstar_dispatch: sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - RESTORE_RSI_RDI_RDX + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi movl RIP(%rsp), %ecx movl EFLAGS(%rsp), %r11d movq RAX(%rsp), %rax @@ -430,8 +433,48 @@ cstar_tracesys: END(entry_SYSCALL_compat) ia32_badarg: - ASM_CLAC - movq $-EFAULT, RAX(%rsp) + /* + * So far, we've entered kernel mode, set AC, turned on IRQs, and + * saved C regs except r8-r11. We haven't done any of the other + * standard entry work, though. We want to bail, but we shouldn't + * treat this as a syscall entry since we don't even know what the + * args are. Instead, treat this as a non-syscall entry, finish + * the entry work, and immediately exit after setting AX = -EFAULT. + * + * We're really just being polite here. Killing the task outright + * would be a reasonable action, too. Given that the only valid + * way to have gotten here is through the vDSO, and we already know + * that the stack pointer is bad, the task isn't going to survive + * for long no matter what we do. + */ + + ASM_CLAC /* undo STAC */ + movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */ + + /* Fill in the rest of pt_regs */ + xorl %eax, %eax + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + SAVE_EXTRA_REGS + + /* Turn IRQs back off. */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + + /* Now finish entering normal kernel mode. */ +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif + + /* And exit again. 
*/ + jmp retint_user + +ia32_ret_from_sys_call_irqs_off: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + ia32_ret_from_sys_call: xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ef8187f9d..7663c455b 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -365,3 +365,20 @@ 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf 358 i386 execveat sys_execveat stub32_execveat +359 i386 socket sys_socket +360 i386 socketpair sys_socketpair +361 i386 bind sys_bind +362 i386 connect sys_connect +363 i386 listen sys_listen +364 i386 accept4 sys_accept4 +365 i386 getsockopt sys_getsockopt compat_sys_getsockopt +366 i386 setsockopt sys_setsockopt compat_sys_setsockopt +367 i386 getsockname sys_getsockname +368 i386 getpeername sys_getpeername +369 i386 sendto sys_sendto +370 i386 sendmsg sys_sendmsg compat_sys_sendmsg +371 i386 recvfrom sys_recvfrom compat_sys_recvfrom +372 i386 recvmsg sys_recvmsg compat_sys_recvmsg +373 i386 shutdown sys_shutdown +374 i386 userfaultfd sys_userfaultfd +375 i386 membarrier sys_membarrier diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9ef32d5f1..278842fdf 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -329,6 +329,8 @@ 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf 322 64 execveat stub_execveat +323 common userfaultfd sys_userfaultfd +324 common membarrier sys_membarrier # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index e97032069..a3d0767a6 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -8,7 +8,7 @@ KASAN_SANITIZE := n VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y VDSO32-$(CONFIG_X86_32) := y 
-VDSO32-$(CONFIG_COMPAT) := y +VDSO32-$(CONFIG_IA32_EMULATION) := y # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o @@ -20,7 +20,7 @@ obj-y += vma.o vdso_img-$(VDSO64-y) += 64 vdso_img-$(VDSOX32-y) += x32 vdso_img-$(VDSO32-y) += 32-int80 -vdso_img-$(CONFIG_COMPAT) += 32-syscall +vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall vdso_img-$(VDSO32-y) += 32-sysenter obj-$(VDSO32-y) += vdso32-setup.o @@ -126,7 +126,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE # Build multiple 32-bit vDSO images to choose from at boot time. # vdso32.so-$(VDSO32-y) += int80 -vdso32.so-$(CONFIG_COMPAT) += syscall +vdso32.so-$(CONFIG_IA32_EMULATION) += syscall vdso32.so-$(VDSO32-y) += sysenter vdso32-images = $(vdso32.so-y:%=vdso32-%.so) @@ -175,7 +175,7 @@ quiet_cmd_vdso = VDSO $@ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \ $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) GCOV_PROFILE := n diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 979332275..ca94fa649 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode) notrace static cycle_t vread_tsc(void) { - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. 
- */ - rdtsc_barrier(); - ret = (cycle_t)__native_read_tsc(); - - last = gtod->cycle_last; + cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 last = gtod->cycle_last; if (likely(ret >= last)) return ret; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1c9f750c3..434543145 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -177,7 +177,7 @@ up_fail: return ret; } -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { int ret; @@ -219,8 +219,11 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm, return map_vdso(&vdso_image_x32, true); } #endif - +#ifdef CONFIG_IA32_EMULATION return load_vdso32(); +#else + return 0; +#endif } #endif #else diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 2dcc6ff6f..b160c0c6b 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma) { return "[vsyscall]"; } -static struct vm_operations_struct gate_vma_ops = { +static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; static struct vm_area_struct gate_vma = { @@ -290,7 +290,7 @@ static struct vm_area_struct gate_vma = { struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { -#ifdef CONFIG_IA32_EMULATION +#ifdef CONFIG_COMPAT if (!mm || mm->context.ia32_compat) return NULL; #endif diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index ae3a29ae8..a0a19b7ba 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -34,99 +34,6 @@ #include <asm/sys_ia32.h> #include <asm/smap.h> -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - int err = 0; - bool ia32 = test_thread_flag(TIF_IA32); - - if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) - return -EFAULT; - - 
put_user_try { - /* If you change siginfo_t structure, please make sure that - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. */ - put_user_ex(from->si_signo, &to->si_signo); - put_user_ex(from->si_errno, &to->si_errno); - put_user_ex((short)from->si_code, &to->si_code); - - if (from->si_code < 0) { - put_user_ex(from->si_pid, &to->si_pid); - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); - } else { - /* - * First 32bits of unions are always present: - * si_pid === si_band === si_tid === si_addr(LS half) - */ - put_user_ex(from->_sifields._pad[0], - &to->_sifields._pad[0]); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: - break; - case __SI_SYS >> 16: - put_user_ex(from->si_syscall, &to->si_syscall); - put_user_ex(from->si_arch, &to->si_arch); - break; - case __SI_CHLD >> 16: - if (ia32) { - put_user_ex(from->si_utime, &to->si_utime); - put_user_ex(from->si_stime, &to->si_stime); - } else { - put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); - put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); - } - put_user_ex(from->si_status, &to->si_status); - /* FALL THROUGH */ - default: - case __SI_KILL >> 16: - put_user_ex(from->si_uid, &to->si_uid); - break; - case __SI_POLL >> 16: - put_user_ex(from->si_fd, &to->si_fd); - break; - case __SI_TIMER >> 16: - put_user_ex(from->si_overrun, &to->si_overrun); - put_user_ex(ptr_to_compat(from->si_ptr), - &to->si_ptr); - break; - /* This is not generated by the kernel as of now. 
*/ - case __SI_RT >> 16: - case __SI_MESGQ >> 16: - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(from->si_int, &to->si_int); - break; - } - } - } put_user_catch(err); - - return err; -} - -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - int err = 0; - u32 ptr32; - - if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) - return -EFAULT; - - get_user_try { - get_user_ex(to->si_signo, &from->si_signo); - get_user_ex(to->si_errno, &from->si_errno); - get_user_ex(to->si_code, &from->si_code); - - get_user_ex(to->si_pid, &from->si_pid); - get_user_ex(to->si_uid, &from->si_uid); - get_user_ex(ptr32, &from->si_ptr); - to->si_ptr = compat_ptr(ptr32); - } get_user_catch(err); - - return err; -} - /* * Do a signal return; undo the signal stack. */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c8393634c..ebf6d5e56 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -313,7 +313,6 @@ struct apic { /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); - bool wait_for_init_deassert; void (*inquire_remote_apic)(int apicid); /* apic ops */ @@ -378,7 +377,6 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[]; * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP -extern atomic_t init_deasserted; extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); #endif diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 9686c3d9f..259a7c1ef 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -21,7 +21,7 @@ * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective * compiler switches. 
*/ -static inline unsigned int __arch_hweight32(unsigned int w) +static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res = 0; @@ -42,20 +42,23 @@ static inline unsigned int __arch_hweight8(unsigned int w) return __arch_hweight32(w & 0xff); } +#ifdef CONFIG_X86_32 static inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; - -#ifdef CONFIG_X86_32 return __arch_hweight32((u32)w) + __arch_hweight32((u32)(w >> 32)); +} #else +static __always_inline unsigned long __arch_hweight64(__u64 w) +{ + unsigned long res = 0; + asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) : "="REG_OUT (res) : REG_IN (w)); -#endif /* CONFIG_X86_32 */ return res; } +#endif /* CONFIG_X86_32 */ #endif diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index e9168955c..fb52aa644 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -182,6 +182,21 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } +#define ATOMIC_OP(op) \ +static inline void atomic_##op(int i, atomic_t *v) \ +{ \ + asm volatile(LOCK_PREFIX #op"l %1,%0" \ + : "+m" (v->counter) \ + : "ir" (i) \ + : "memory"); \ +} + +ATOMIC_OP(and) +ATOMIC_OP(or) +ATOMIC_OP(xor) + +#undef ATOMIC_OP + /** * __atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t @@ -219,16 +234,6 @@ static __always_inline short int atomic_inc_short(short int *v) return *v; } -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "andl %0,%1" \ - : : "r" (~(mask)), "m" (*(addr)) : "memory") - -#define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" ((unsigned)(mask)), "m" (*(addr)) \ - : "memory") - #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index b154de75c..a11c30b77 
100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -313,4 +313,18 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 +#define ATOMIC64_OP(op, c_op) \ +static inline void atomic64_##op(long long i, atomic64_t *v) \ +{ \ + long long old, c = 0; \ + while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ + c = old; \ +} + +ATOMIC64_OP(and, &) +ATOMIC64_OP(or, |) +ATOMIC64_OP(xor, ^) + +#undef ATOMIC64_OP + #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index b965f9e03..50e33eff5 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -220,4 +220,19 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } +#define ATOMIC64_OP(op) \ +static inline void atomic64_##op(long i, atomic64_t *v) \ +{ \ + asm volatile(LOCK_PREFIX #op"q %1,%0" \ + : "+m" (v->counter) \ + : "er" (i) \ + : "memory"); \ +} + +ATOMIC64_OP(and) +ATOMIC64_OP(or) +ATOMIC64_OP(xor) + +#undef ATOMIC64_OP + #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index e51a8f803..0681d2532 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -57,12 +57,12 @@ do { \ compiletime_assert_atomic_type(*p); \ smp_mb(); \ - ACCESS_ONCE(*p) = (v); \ + WRITE_ONCE(*p, v); \ } while (0) #define smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ smp_mb(); \ ___p1; \ @@ -74,12 +74,12 @@ do { \ do { \ compiletime_assert_atomic_type(*p); \ barrier(); \ - ACCESS_ONCE(*p) = (v); \ + WRITE_ONCE(*p, v); \ } while (0) #define smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ barrier(); \ ___p1; \ @@ -91,15 +91,4 @@ do { 
\ #define smp_mb__before_atomic() barrier() #define smp_mb__after_atomic() barrier() -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - */ -static __always_inline void rdtsc_barrier(void) -{ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); -} - #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 9bf3ea14b..e63aa38e8 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -89,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages); void clflush_cache_range(void *addr, unsigned int size); +#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) + #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; @@ -109,75 +111,4 @@ static inline int rodata_test(void) } #endif -#ifdef ARCH_HAS_NOCACHE_UACCESS - -/** - * arch_memcpy_to_pmem - copy data to persistent memory - * @dst: destination buffer for the copy - * @src: source buffer for the copy - * @n: length of the copy in bytes - * - * Copy data to persistent memory media via non-temporal stores so that - * a subsequent arch_wmb_pmem() can flush cpu and memory controller - * write buffers to guarantee durability. - */ -static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, - size_t n) -{ - int unwritten; - - /* - * We are copying between two kernel buffers, if - * __copy_from_user_inatomic_nocache() returns an error (page - * fault) we would have already reported a general protection fault - * before the WARN+BUG. 
- */ - unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, - (void __user *) src, n); - if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", - __func__, dst, src, unwritten)) - BUG(); -} - -/** - * arch_wmb_pmem - synchronize writes to persistent memory - * - * After a series of arch_memcpy_to_pmem() operations this drains data - * from cpu write buffers and any platform (memory controller) buffers - * to ensure that written data is durable on persistent memory media. - */ -static inline void arch_wmb_pmem(void) -{ - /* - * wmb() to 'sfence' all previous writes such that they are - * architecturally visible to 'pcommit'. Note, that we've - * already arranged for pmem writes to avoid the cache via - * arch_memcpy_to_pmem(). - */ - wmb(); - pcommit_sfence(); -} - -static inline bool __arch_has_wmb_pmem(void) -{ -#ifdef CONFIG_X86_64 - /* - * We require that wmb() be an 'sfence', that is only guaranteed on - * 64-bit builds - */ - return static_cpu_has(X86_FEATURE_PCOMMIT); -#else - return false; -#endif -} -#else /* ARCH_HAS_NOCACHE_UACCESS i.e. 
ARCH=um */ -extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); -extern void arch_wmb_pmem(void); - -static inline bool __arch_has_wmb_pmem(void) -{ - return false; -} -#endif - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h deleted file mode 100644 index 1fe49704b..000000000 --- a/arch/x86/include/asm/context_tracking.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _ASM_X86_CONTEXT_TRACKING_H -#define _ASM_X86_CONTEXT_TRACKING_H - -#ifdef CONFIG_CONTEXT_TRACKING -# define SCHEDULE_USER call schedule_user -#else -# define SCHEDULE_USER call schedule -#endif - -#endif diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3d6606fb9..9727b3b48 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -119,6 +119,7 @@ #define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ #define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ #define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ #define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ @@ -176,6 +177,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -191,7 +193,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */ -#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ 
+#define X86_FEATURE_HWP_NOTIFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ @@ -239,6 +241,7 @@ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ #define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index 9b3b4f275..36a760bda 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -4,5 +4,6 @@ #include <asm-generic/delay.h> void use_tsc_delay(void); +void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 1f5b7287d..953b7263f 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -12,7 +12,6 @@ #include <linux/dma-attrs.h> #include <asm/io.h> #include <asm/swiotlb.h> -#include <asm-generic/dma-coherent.h> #include <linux/dma-contiguous.h> #ifdef CONFIG_ISA @@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #endif } -#include <asm-generic/dma-mapping-common.h> - -/* Make sure we keep the same behaviour */ -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - debug_dma_mapping_error(dev, dma_addr); - if (ops->mapping_error) - return ops->mapping_error(dev, dma_addr); - - return (dma_addr == DMA_ERROR_CODE); -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) 
dma_free_coherent(d, s, v, h) +bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); +#define arch_dma_alloc_attrs arch_dma_alloc_attrs +#define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *hwdev, u64 mask); -extern int dma_set_mask(struct device *dev, u64 mask); + +#include <asm-generic/dma-mapping-common.h> extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, @@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) return gfp; } -#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) - -void * -dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, struct dma_attrs *attrs); - -#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) - -void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus, - struct dma_attrs *attrs); - #endif diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 155162ea0..ae68be92f 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -86,6 +86,18 @@ extern u64 asmlinkage efi_call(void *fp, ...); extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); +#ifdef CONFIG_KASAN +/* + * CONFIG_KASAN may redefine memset to __memset. __memset function is present + * only in kernel binary. Since the EFI stub linked into a separate binary it + * doesn't have __memset(). So we should use standard memset from + * arch/x86/boot/compressed/string.c. The same applies to memcpy and memmove. 
+ */ +#undef memcpy +#undef memset +#undef memmove +#endif + #endif /* CONFIG_X86_32 */ extern struct efi_scratch efi_scratch; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index f161c189c..141c561f4 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #ifdef CONFIG_X86_64 extern unsigned int vdso64_enabled; #endif -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) extern unsigned int vdso32_enabled; #endif @@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t, #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(&current->thread, regs, __USER_DS) -void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); -#define compat_start_thread start_thread_ia32 +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp); +#define compat_start_thread compat_start_thread void set_personality_ia32(bool); #define COMPAT_SET_PERSONALITY(ex) \ @@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, */ static inline int mmap_is_ia32(void) { -#ifdef CONFIG_X86_32 - return 1; -#endif -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_ADDR32)) - return 1; -#endif - return 0; + return config_enabled(CONFIG_X86_32) || + (config_enabled(CONFIG_COMPAT) && + test_thread_flag(TIF_ADDR32)); } /* Do not change the values. 
See get_align_mask() */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f45acad3c..24938852d 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -3,9 +3,9 @@ #ifdef CONFIG_FUNCTION_TRACER #ifdef CC_USING_FENTRY -# define MCOUNT_ADDR ((long)(__fentry__)) +# define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else -# define MCOUNT_ADDR ((long)(mcount)) +# define MCOUNT_ADDR ((unsigned long)(mcount)) #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 6615032e1..1e3408e88 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -182,10 +182,10 @@ extern char irq_entries_start[]; #define trace_irq_entries_start irq_entries_start #endif -#define VECTOR_UNDEFINED (-1) -#define VECTOR_RETRIGGERED (-2) +#define VECTOR_UNUSED NULL +#define VECTOR_RETRIGGERED ((void *)~0UL) -typedef int vector_irq_t[NR_VECTORS]; +typedef struct irq_desc* vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); #endif /* !ASSEMBLY_ */ diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index ccffa5375..39bcefc20 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -60,6 +60,7 @@ struct legacy_pic { void (*mask_all)(void); void (*restore_mask)(void); void (*init)(int auto_eoi); + int (*probe)(void); int (*irq_pending)(unsigned int irq); void (*make_irq)(unsigned int irq); }; diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index d0e8e0141..280197654 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -22,15 +22,6 @@ struct ucontext_ia32 { compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; -struct ucontext_x32 { - unsigned int uc_flags; - unsigned int uc_link; - compat_stack_t uc_stack; - unsigned int uc__pad0; /* needed for alignment */ - struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ - 
compat_sigset_t uc_sigmask; /* mask last for extensibility */ -}; - /* This matches struct stat64 in glibc2.2, hence the absolutely * insane amounts of padding around dev_t's. */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index cc9c61bc1..de25aad07 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -180,6 +180,8 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); +#define ioremap_uc ioremap_uc + extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); @@ -248,12 +250,6 @@ static inline void flush_write_buffers(void) #endif } -static inline void __pmem *arch_memremap_pmem(resource_size_t offset, - unsigned long size) -{ - return (void __force __pmem *) ioremap_cache(offset, size); -} - #endif /* __KERNEL__ */ extern void native_io_delay(void); diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index 57995f059..b72ad0faa 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -52,20 +52,20 @@ /* Quark available units */ #define QRK_MBI_UNIT_HBA 0x00 -#define QRK_MBI_UNIT_HB 0x03 +#define QRK_MBI_UNIT_HB 0x03 #define QRK_MBI_UNIT_RMU 0x04 -#define QRK_MBI_UNIT_MM 0x05 +#define QRK_MBI_UNIT_MM 0x05 #define QRK_MBI_UNIT_MMESRAM 0x05 #define QRK_MBI_UNIT_SOC 0x31 /* Quark read/write opcodes */ #define QRK_MBI_HBA_READ 0x10 #define QRK_MBI_HBA_WRITE 0x11 -#define QRK_MBI_HB_READ 0x10 +#define QRK_MBI_HB_READ 0x10 #define QRK_MBI_HB_WRITE 0x11 #define QRK_MBI_RMU_READ 0x10 #define QRK_MBI_RMU_WRITE 0x11 -#define QRK_MBI_MM_READ 0x10 +#define QRK_MBI_MM_READ 0x10 #define QRK_MBI_MM_WRITE 0x11 #define QRK_MBI_MMESRAM_READ 0x12 #define QRK_MBI_MMESRAM_WRITE 0x13 diff --git 
a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 8008d0658..881b47686 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -36,7 +36,9 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); -extern bool handle_irq(unsigned irq, struct pt_regs *regs); + +struct irq_desc; +extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 4c2d2eb20..6ca9fd623 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -117,16 +117,6 @@ #define FPU_IRQ 13 -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 - -#ifndef __ASSEMBLY__ -static inline int invalid_vm86_irq(int irq) -{ - return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; -} -#endif - /* * Size the maximum number of interrupts. 
* diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index a4c1cf7e9..5daeca3d0 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -16,15 +16,32 @@ # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC #endif -static __always_inline bool arch_static_branch(struct static_key *key) +static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 \n\t" + _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" ".popsection \n\t" - : : "i" (key) : : l_yes); + : : "i" (key), "i" (branch) : : l_yes); + + return false; +l_yes: + return true; +} + +static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) +{ + asm_volatile_goto("1:" + ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" + "2:\n\t" + ".pushsection __jump_table, \"aw\" \n\t" + _ASM_ALIGN "\n\t" + _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" + ".popsection \n\t" + : : "i" (key), "i" (branch) : : l_yes); + return false; l_yes: return true; diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 74a2a8dc9..1410b567e 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -1,6 +1,9 @@ #ifndef _ASM_X86_KASAN_H #define _ASM_X86_KASAN_H +#include <linux/const.h> +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) + /* * Compiler uses shadow offset assuming that addresses start * from 0. 
Kernel addresses don't start from 0, so shadow diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 32ce71375..b130d5940 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs, extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE extern int in_crash_kexec; #else /* no crash dump is ever in progress if no crash kernel can be kexec'd */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e16466ec4..e9cd7befc 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -112,6 +112,16 @@ struct x86_emulate_ops { struct x86_exception *fault); /* + * read_phys: Read bytes of standard (non-emulated/special) memory. + * Used for descriptor reading. + * @addr: [IN ] Physical address from which to read. + * @val: [OUT] Value read from memory. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes); + + /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. * @addr: [IN ] Linear address to which to write. 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49ec9038e..3a36ee704 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -40,6 +40,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define KVM_HALT_POLL_NS_DEFAULT 500000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS @@ -252,6 +253,11 @@ struct kvm_pio_request { int size; }; +struct rsvd_bits_validate { + u64 rsvd_bits_mask[2][4]; + u64 bad_mt_xwr; +}; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). The kvm_mmu structure abstracts the details of the current mmu @@ -289,8 +295,15 @@ struct kvm_mmu { u64 *pae_root; u64 *lm_root; - u64 rsvd_bits_mask[2][4]; - u64 bad_mt_xwr; + + /* + * check zero bits on shadow page table entries, these + * bits include not only hardware reserved bits but also + * the bits spte never used. + */ + struct rsvd_bits_validate shadow_zero_check; + + struct rsvd_bits_validate guest_rsvd_check; /* * Bitmap: bit set = last pte in walk @@ -358,6 +371,11 @@ struct kvm_mtrr { struct list_head head; }; +/* Hyper-V per vcpu emulation context */ +struct kvm_vcpu_hv { + u64 hv_vapic; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -514,8 +532,7 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ - u64 hv_vapic; + struct kvm_vcpu_hv hyperv; cpumask_var_t wbinvd_dirty_mask; @@ -586,6 +603,17 @@ struct kvm_apic_map { struct kvm_lapic *logical_map[16][16]; }; +/* Hyper-V emulation context */ +struct kvm_hv { + u64 hv_guest_os_id; + u64 hv_hypercall; + u64 hv_tsc_page; + + /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ + u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; + u64 hv_crash_ctl; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -645,16 +673,14 @@ struct kvm_arch 
{ /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; - /* fields used by HYPER-V emulation */ - u64 hv_guest_os_id; - u64 hv_hypercall; - u64 hv_tsc_page; + struct kvm_hv hyperv; #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif bool boot_vcpu_runs_old_kvmclock; + u32 bsp_vcpu_id; u64 disabled_quirks; }; @@ -686,6 +712,7 @@ struct kvm_vcpu_stat { u32 nmi_window_exits; u32 halt_exits; u32 halt_successful_poll; + u32 halt_attempted_poll; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; @@ -1199,9 +1226,9 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); -int __x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); -int x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h index 031f6266f..0d9b14f60 100644 --- a/arch/x86/include/asm/math_emu.h +++ b/arch/x86/include/asm/math_emu.h @@ -2,7 +2,6 @@ #define _ASM_X86_MATH_EMU_H #include <asm/ptrace.h> -#include <asm/vm86.h> /* This structure matches the layout of the data saved to the stack following a device-not-present interrupt, part of it saved @@ -10,9 +9,6 @@ */ struct math_emu_info { long ___orig_eip; - union { - struct pt_regs *regs; - struct kernel_vm86_regs *vm86; - }; + struct pt_regs *regs; }; #endif /* _ASM_X86_MATH_EMU_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 982dfc367..2dbc0bf2b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -151,10 +151,12 @@ extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE int mcheck_init(void); void 
mcheck_cpu_init(struct cpuinfo_x86 *c); +void mcheck_cpu_clear(struct cpuinfo_x86 *c); void mcheck_vendor_init_severity(void); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline void mcheck_vendor_init_severity(void) {} #endif @@ -181,20 +183,18 @@ DECLARE_PER_CPU(struct device *, mce_device); #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); +void mce_intel_feature_clear(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); -void lmce_clear(void); -void lmce_enable(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} -static inline void lmce_clear(void) {} -static inline void lmce_enable(void) {} #endif #ifdef CONFIG_X86_MCE_AMD diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 364d27481..55234d5e7 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -9,7 +9,9 @@ * we put the segment information here. */ typedef struct { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; +#endif #ifdef CONFIG_X86_64 /* True if mm supports a task running in 32 bit compatibility mode. 
*/ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 984abfe47..379cd3658 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm) static inline void load_mm_cr4(struct mm_struct *mm) {} #endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * ldt_structs can be allocated, used, and freed, but they are never * modified while live. @@ -48,8 +49,23 @@ struct ldt_struct { int size; }; +/* + * Used for LDT copy/destruction. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); +#else /* CONFIG_MODIFY_LDT_SYSCALL */ +static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +{ + return 0; +} +static inline void destroy_context(struct mm_struct *mm) {} +#endif + static inline void load_mm_ldt(struct mm_struct *mm) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* lockless_dereference synchronizes with smp_store_release */ @@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm) set_ldt(ldt->entries, ldt->size); else clear_LDT(); +#else + clear_LDT(); +#endif DEBUG_LOCKS_WARN_ON(preemptible()); } -/* - * Used for LDT copy/destruction. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); - - static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP @@ -114,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Load per-mm CR4 state */ load_mm_cr4(next); +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * Load the LDT, if the LDT is different. 
* @@ -128,6 +141,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ if (unlikely(prev->context.ldt != next->context.ldt)) load_mm_ldt(next); +#endif } #ifdef CONFIG_SMP else { diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index de518691f..90f3c767e 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -31,6 +31,8 @@ #define MODULE_PROC_FAMILY "HASWELL " #elif defined CONFIG_MBROADWELL #define MODULE_PROC_FAMILY "BROADWELL " +#elif defined CONFIG_MSKYLAKE +#define MODULE_PROC_FAMILY "SKYLAKE " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -60,6 +62,8 @@ #elif defined CONFIG_MBULLDOZER #define MODULE_PROC_FAMILY "BULLDOZER " #elif defined CONFIG_MPILEDRIVER #define MODULE_PROC_FAMILY "PILEDRIVER " +#elif defined CONFIG_MSTEAMROLLER +#define MODULE_PROC_FAMILY "STEAMROLLER " #elif defined CONFIG_MJAGUAR #define MODULE_PROC_FAMILY "JAGUAR " diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index c163215ab..aaf59b7da 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@ struct ms_hyperv_info { u32 features; + u32 misc_features; u32 hints; }; @@ -20,4 +21,8 @@ void hyperv_vector_handler(struct pt_regs *regs); void hv_setup_vmbus_irq(void (*handler)(void)); void hv_remove_vmbus_irq(void); +void hv_setup_kexec_handler(void (*handler)(void)); +void hv_remove_kexec_handler(void); +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); +void hv_remove_crash_handler(void); #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2350ab781..b8c14bb7f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -73,6 +73,12 @@ #define MSR_LBR_CORE_FROM 0x00000040 #define MSR_LBR_CORE_TO 0x00000060 +#define MSR_LBR_INFO_0 0x00000dc0 /* ... 
0xddf for _31 */ +#define LBR_INFO_MISPRED BIT_ULL(63) +#define LBR_INFO_IN_TX BIT_ULL(62) +#define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYCLES 0xffff + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 @@ -80,13 +86,21 @@ #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) #define RTIT_CTL_OS BIT(2) #define RTIT_CTL_USR BIT(3) #define RTIT_CTL_CR3EN BIT(7) #define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_EN BIT(9) #define RTIT_CTL_TSC_EN BIT(10) #define RTIT_CTL_DISRETC BIT(11) #define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) #define MSR_IA32_RTIT_STATUS 0x00000571 #define RTIT_STATUS_CONTEXTEN BIT(1) #define RTIT_STATUS_TRIGGEREN BIT(2) @@ -127,6 +141,8 @@ #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define MSR_PEBS_FRONTEND 0x000003f7 + #define MSR_IA32_POWER_CTL 0x000001fc #define MSR_IA32_MC0_CTL 0x00000400 @@ -170,6 +186,12 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL_1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + #define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 #define MSR_PKG_ANY_CORE_C0_RES 0x00000659 #define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index e6a707eb5..77d8b284e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -47,14 +47,13 @@ static inline unsigned long long 
native_read_tscp(unsigned int *aux) * it means rax *or* rdx. */ #ifdef CONFIG_X86_64 -#define DECLARE_ARGS(val, low, high) unsigned low, high -#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32)) -#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) +/* Using 64-bit values saves one instruction clearing the high half of low */ +#define DECLARE_ARGS(val, low, high) unsigned long low, high +#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) #define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) #else #define DECLARE_ARGS(val, low, high) unsigned long long val #define EAX_EDX_VAL(val, low, high) (val) -#define EAX_EDX_ARGS(val, low, high) "A" (val) #define EAX_EDX_RET(val, low, high) "=A" (val) #endif @@ -106,12 +105,19 @@ notrace static inline int native_write_msr_safe(unsigned int msr, return err; } -extern unsigned long long native_read_tsc(void); - extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); -static __always_inline unsigned long long __native_read_tsc(void) +/** + * rdtsc() - returns the current TSC without ordering constraints + * + * rdtsc() returns the result of RDTSC as a 64-bit integer. The + * only ordering constraint it supplies is the ordering implied by + * "asm volatile": it will put the RDTSC in the place you expect. The + * CPU can and will speculatively execute that RDTSC, though, so the + * results can be non-monotonic if compared on different CPUs. + */ +static __always_inline unsigned long long rdtsc(void) { DECLARE_ARGS(val, low, high); @@ -120,6 +126,35 @@ static __always_inline unsigned long long __native_read_tsc(void) return EAX_EDX_VAL(val, low, high); } +/** + * rdtsc_ordered() - read the current TSC in program order + * + * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. + * It is ordered like a load to a global in-memory counter. 
It should + * be impossible to observe non-monotonic rdtsc_ordered() behavior + * across multiple CPUs as long as the TSC is synced. + */ +static __always_inline unsigned long long rdtsc_ordered(void) +{ + /* + * The RDTSC instruction is not ordered relative to memory + * access. The Intel SDM and the AMD APM are both vague on this + * point, but empirically an RDTSC instruction can be + * speculatively executed before prior loads. An RDTSC + * immediately after an appropriate barrier appears to be + * ordered as a normal load, that is, it provides the same + * ordering guarantees as reading from a global memory location + * that some other imaginary CPU is updating continuously with a + * time stamp. + */ + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); + return rdtsc(); +} + +/* Deprecated, keep it for a cycle for easier merging: */ +#define rdtscll(now) do { (now) = rdtsc_ordered(); } while (0) + static inline unsigned long long native_read_pmc(int counter) { DECLARE_ARGS(val, low, high); @@ -153,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high) #define rdmsrl(msr, val) \ ((val) = native_read_msr((msr))) -#define wrmsrl(msr, val) \ - native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32)) +static inline void wrmsrl(unsigned msr, u64 val) +{ + native_write_msr(msr, (u32)val, (u32)(val >> 32)); +} /* wrmsr with exception handling */ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) @@ -180,12 +217,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -#define rdtscl(low) \ - ((low) = (u32)__native_read_tsc()) - -#define rdtscll(val) \ - ((val) = __native_read_tsc()) - #define rdpmc(counter, low, high) \ do { \ u64 _l = native_read_pmc((counter)); \ @@ -195,15 +226,6 @@ do { \ #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) -#define rdtscp(low, high, aux) \ -do { \ - unsigned long long _val = 
native_read_tscp(&(aux)); \ - (low) = (u32)_val; \ - (high) = (u32)(_val >> 32); \ -} while (0) - -#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux)) - #endif /* !CONFIG_PARAVIRT */ /* diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 653dfa766..c70689b5e 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -14,6 +14,9 @@ #define CPUID5_ECX_INTERRUPT_BREAK 0x2 #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +#define MWAITX_ECX_TIMER_ENABLE BIT(1) +#define MWAITX_MAX_LOOPS ((u32)-1) +#define MWAITX_DISABLE_CSTATES 0xf static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) @@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long ecx, :: "a" (eax), "c" (ecx), "d"(edx)); } +static inline void __monitorx(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitorx %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xfa;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + static inline void __mwait(unsigned long eax, unsigned long ecx) { /* "mwait %eax, %ecx;" */ @@ -30,6 +41,40 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } +/* + * MWAITX allows for a timer expiration to get the core out a wait state in + * addition to the default MWAIT exit condition of a store appearing at a + * monitored virtual address. + * + * Registers: + * + * MWAITX ECX[1]: enable timer if set + * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0 + * frequency is the same as the TSC frequency. 
+ * + * Below is a comparison between MWAIT and MWAITX on AMD processors: + * + * MWAIT MWAITX + * opcode 0f 01 c9 | 0f 01 fb + * ECX[0] value of RFLAGS.IF seen by instruction + * ECX[1] unused/#GP if set | enable timer if set + * ECX[31:2] unused/#GP if set + * EAX unused (reserve for hint) + * EBX[31:0] unused | max wait time (P0 clocks) + * + * MONITOR MONITORX + * opcode 0f 01 c8 | 0f 01 fa + * EAX (logical) address to monitor + * ECX #GP if not zero + */ +static inline void __mwaitx(unsigned long eax, unsigned long ebx, + unsigned long ecx) +{ + /* "mwaitx %eax, %ebx, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xfb;" + :: "a" (eax), "b" (ebx), "c" (ecx)); +} + static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { trace_hardirqs_on(); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d143bfad4..10d059643 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -153,7 +153,11 @@ do { \ val = paravirt_read_msr(msr, &_err); \ } while (0) -#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32) +static inline void wrmsrl(unsigned msr, u64 val) +{ + wrmsr(msr, (u32)val, (u32)(val>>32)); +} + #define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) /* rdmsr with exception handling */ @@ -174,19 +178,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -static inline u64 paravirt_read_tsc(void) -{ - return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); -} - -#define rdtscl(low) \ -do { \ - u64 _l = paravirt_read_tsc(); \ - low = (int)_l; \ -} while (0) - -#define rdtscll(val) (val = paravirt_read_tsc()) - static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); @@ -215,27 +206,6 @@ do { \ #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) -static inline unsigned long long paravirt_rdtscp(unsigned int *aux) -{ - return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); 
-} - -#define rdtscp(low, high, aux) \ -do { \ - int __aux; \ - unsigned long __val = paravirt_rdtscp(&__aux); \ - (low) = (u32)__val; \ - (high) = (u32)(__val >> 32); \ - (aux) = __aux; \ -} while (0) - -#define rdtscpll(val, aux) \ -do { \ - unsigned long __aux; \ - val = paravirt_rdtscp(&__aux); \ - (aux) = __aux; \ -} while (0) - static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index a6b8f9fad..31247b5bf 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -97,7 +97,6 @@ struct pv_lazy_ops { struct pv_time_ops { unsigned long long (*sched_clock)(void); unsigned long long (*steal_clock)(int cpu); - unsigned long (*get_tsc_khz)(void); }; struct pv_cpu_ops { @@ -156,9 +155,7 @@ struct pv_cpu_ops { u64 (*read_msr)(unsigned int msr, int *err); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - u64 (*read_tsc)(void); u64 (*read_pmc)(int counter); - unsigned long long (*read_tscp)(unsigned int *aux); #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 164e3f8d3..fa1195dae 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); -extern bool mp_should_keep_irq(struct device *dev); - struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index dc0f6ed35..7bcb861a0 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -159,6 +159,13 @@ struct x86_pmu_capability { */ #define INTEL_PMC_IDX_FIXED_BTS 
(INTEL_PMC_IDX_FIXED + 16) +#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) +#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62) +#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) +#define GLOBAL_STATUS_ASIF BIT_ULL(60) +#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) +#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) + /* * IBS cpuid feature detection */ diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h index bc0fc0866..aa8744c77 100644 --- a/arch/x86/include/asm/pmc_atom.h +++ b/arch/x86/include/asm/pmc_atom.h @@ -18,6 +18,8 @@ /* ValleyView Power Control Unit PCI Device ID */ #define PCI_DEVICE_ID_VLV_PMC 0x0F1C +/* CherryTrail Power Control Unit PCI Device ID */ +#define PCI_DEVICE_ID_CHT_PMC 0x229C /* PMC Memory mapped IO registers */ #define PMC_BASE_ADDR_OFFSET 0x44 @@ -29,6 +31,10 @@ #define PMC_FUNC_DIS 0x34 #define PMC_FUNC_DIS_2 0x38 +/* CHT specific bits in FUNC_DIS2 register */ +#define BIT_FD_GMM BIT(3) +#define BIT_FD_ISH BIT(4) + /* S0ix wake event control */ #define PMC_S0IX_WAKE_EN 0x3C @@ -75,6 +81,21 @@ #define PMC_PSS_BIT_USB BIT(16) #define PMC_PSS_BIT_USB_SUS BIT(17) +/* CHT specific bits in PSS register */ +#define PMC_PSS_BIT_CHT_UFS BIT(7) +#define PMC_PSS_BIT_CHT_UXD BIT(11) +#define PMC_PSS_BIT_CHT_UXD_FD BIT(12) +#define PMC_PSS_BIT_CHT_UX_ENG BIT(15) +#define PMC_PSS_BIT_CHT_USB_SUS BIT(16) +#define PMC_PSS_BIT_CHT_GMM BIT(17) +#define PMC_PSS_BIT_CHT_ISH BIT(18) +#define PMC_PSS_BIT_CHT_DFX_MASTER BIT(26) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER1 BIT(27) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER2 BIT(28) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER3 BIT(29) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER4 BIT(30) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER5 BIT(31) + /* These registers reflect D3 status of functions */ #define PMC_D3_STS_0 0xA0 @@ -117,6 +138,10 @@ #define BIT_USH_SS_PHY BIT(2) #define BIT_DFX BIT(3) +/* CHT specific bits in PMC_D3_STS_1 register */ +#define BIT_STS_GMM BIT(1) +#define BIT_STS_ISH BIT(2) + /* PMC I/O Registers */ #define 
ACPI_BASE_ADDR_OFFSET 0x40 #define ACPI_BASE_ADDR_MASK 0xFFFFFE00 @@ -126,4 +151,8 @@ #define SLEEP_TYPE_MASK 0xFFFFECFF #define SLEEP_TYPE_S5 0x1C00 #define SLEEP_ENABLE 0x2000 + +extern int pmc_atom_read(int offset, u32 *value); +extern int pmc_atom_write(int offset, u32 value); + #endif /* PMC_ATOM_H */ diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h new file mode 100644 index 000000000..d8ce3ec81 --- /dev/null +++ b/arch/x86/include/asm/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __ASM_X86_PMEM_H__ +#define __ASM_X86_PMEM_H__ + +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/special_insns.h> + +#ifdef CONFIG_ARCH_HAS_PMEM_API +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + int unwritten; + + /* + * We are copying between two kernel buffers, if + * __copy_from_user_inatomic_nocache() returns an error (page + * fault) we would have already reported a general protection fault + * before the WARN+BUG. 
+ */ + unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, + (void __user *) src, n); + if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, unwritten)) + BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ + /* + * wmb() to 'sfence' all previous writes such that they are + * architecturally visible to 'pcommit'. Note, that we've + * already arranged for pmem writes to avoid the cache via + * arch_memcpy_to_pmem(). + */ + wmb(); + pcommit_sfence(); +} + +/** + * __arch_wb_cache_pmem - write back a cache range with CLWB + * @vaddr: virtual start address + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction. This function requires explicit ordering with an + * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + */ +static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +{ + u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; + unsigned long clflush_mask = x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; + + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += x86_clflush_size) + clwb(p); +} + +/* + * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec + * iterators, so for other types (bvec & kvec) we must do a cache write-back. 
+ */ +static inline bool __iter_needs_pmem_wb(struct iov_iter *i) +{ + return iter_is_iovec(i) == false; +} + +/** + * arch_copy_from_iter_pmem - copy data from an iterator to PMEM + * @addr: PMEM destination address + * @bytes: number of bytes to copy + * @i: iterator with source data + * + * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. + * This function requires explicit ordering with an arch_wmb_pmem() call. + */ +static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, + struct iov_iter *i) +{ + void *vaddr = (void __force *)addr; + size_t len; + + /* TODO: skip the write-back by always using non-temporal stores */ + len = copy_from_iter_nocache(vaddr, bytes, i); + + if (__iter_needs_pmem_wb(i)) + __arch_wb_cache_pmem(vaddr, bytes); + + return len; +} + +/** + * arch_clear_pmem - zero a PMEM memory range + * @addr: virtual start address + * @size: number of bytes to zero + * + * Write zeros into the memory range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with an arch_wmb_pmem() call. 
+ */ +static inline void arch_clear_pmem(void __pmem *addr, size_t size) +{ + void *vaddr = (void __force *)addr; + + /* TODO: implement the zeroing via non-temporal writes */ + if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) + clear_page(vaddr); + else + memset(vaddr, 0, size); + + __arch_wb_cache_pmem(vaddr, size); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + /* + * We require that wmb() be an 'sfence', that is only guaranteed on + * 64-bit builds + */ + return static_cpu_has(X86_FEATURE_PCOMMIT); +} +#endif /* CONFIG_ARCH_HAS_PMEM_API */ +#endif /* __ASM_X86_PMEM_H__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 944f1785e..19577dd32 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -6,8 +6,8 @@ /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; +struct vm86; -#include <asm/vm86.h> #include <asm/math_emu.h> #include <asm/segment.h> #include <asm/types.h> @@ -400,15 +400,9 @@ struct thread_struct { unsigned long cr2; unsigned long trap_nr; unsigned long error_code; -#ifdef CONFIG_X86_32 +#ifdef CONFIG_VM86 /* Virtual 86 mode info */ - struct vm86_struct __user *vm86_info; - unsigned long screen_bitmap; - unsigned long v86flags; - unsigned long v86mask; - unsigned long saved_sp0; - unsigned int saved_fs; - unsigned int saved_gs; + struct vm86 *vm86; #endif /* IO permissions: */ unsigned long *io_bitmap_ptr; @@ -651,14 +645,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) extern void set_task_blockstep(struct task_struct *task, bool on); -/* - * from system description table in BIOS. 
Mostly for MCA use, but - * others may find it useful: - */ -extern unsigned int machine_id; -extern unsigned int machine_submodel_id; -extern unsigned int BIOS_revision; - /* Boot loader type from the setup header: */ extern int bootloader_type; extern int bootloader_version; @@ -720,7 +706,6 @@ static inline void spin_lock_prefetch(const void *x) #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ - .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ } diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 5fabf1362..6271281f9 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -88,7 +88,6 @@ extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, unsigned long phase1_result); extern long syscall_trace_enter(struct pt_regs *); -extern void syscall_trace_leave(struct pt_regs *); static inline unsigned long regs_return_value(struct pt_regs *regs) { diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 655e07a48..67f082301 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -41,6 +41,7 @@ struct pvclock_wall_clock { #define PVCLOCK_TSC_STABLE_BIT (1 << 0) #define PVCLOCK_GUEST_STOPPED (1 << 1) +/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. 
*/ #define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 628954cee..7a6bed5c0 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) static __always_inline u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) { - u64 delta = __native_read_tsc() - src->tsc_timestamp; + u64 delta = rdtsc_ordered() - src->tsc_timestamp; return pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); } @@ -76,13 +76,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u8 ret_flags; version = src->version; - /* Note: emulated platforms which do not advertise SSE2 support - * result in kvmclock not using the necessary RDTSC barriers. - * Without barriers, it is possible that RDTSC instruction reads from - * the time stamp counter outside rdtsc_barrier protected section - * below, resulting in violation of monotonicity. 
- */ - rdtsc_barrier(); + offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; ret_flags = src->flags; diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h index ae0e241e2..c537cbb03 100644 --- a/arch/x86/include/asm/qrwlock.h +++ b/arch/x86/include/asm/qrwlock.h @@ -2,16 +2,6 @@ #define _ASM_X86_QRWLOCK_H #include <asm-generic/qrwlock_types.h> - -#ifndef CONFIG_X86_PPRO_FENCE -#define queue_write_unlock queue_write_unlock -static inline void queue_write_unlock(struct qrwlock *lock) -{ - barrier(); - ACCESS_ONCE(*(u8 *)&lock->cnts) = 0; -} -#endif - #include <asm-generic/qrwlock.h> #endif /* _ASM_X86_QRWLOCK_H */ diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 7c7c27c97..1f3175bb9 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -4,6 +4,7 @@ #include <asm/sigcontext.h> #include <asm/siginfo.h> #include <asm/ucontext.h> +#include <linux/compat.h> #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe @@ -69,6 +70,15 @@ struct rt_sigframe { #ifdef CONFIG_X86_X32_ABI +struct ucontext_x32 { + unsigned int uc_flags; + unsigned int uc_link; + compat_stack_t uc_stack; + unsigned int uc__pad0; /* needed for alignment */ + struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ + compat_sigset_t uc_sigmask; /* mask last for extensibility */ +}; + struct rt_sigframe_x32 { u64 pretcode; struct ucontext_x32 uc; diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 31eab867e..c481be78f 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -30,7 +30,7 @@ typedef sigset_t compat_sigset_t; #endif /* __ASSEMBLY__ */ #include <uapi/asm/signal.h> #ifndef __ASSEMBLY__ -extern void do_notify_resume(struct pt_regs *, void *, __u32); +extern void do_signal(struct pt_regs *regs); #define __ARCH_HAS_SA_RESTORER diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 
c2e00bb2a..58505f019 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void) * on during the bootup the random pool has true entropy too. */ get_random_bytes(&canary, sizeof(canary)); - tsc = __native_read_tsc(); + tsc = rdtsc(); canary += tsc + (tsc << 32UL); current->stack_canary = canary; diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index e46611969..ff8b9a17d 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -27,12 +27,11 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. */ #define __HAVE_ARCH_MEMCPY 1 +extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #ifndef CONFIG_KMEMCHECK -#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 -extern void *memcpy(void *to, const void *from, size_t len); -#else +#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ ({ \ size_t __len = (len); \ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 592a6a672..91dfcafe2 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *); asmlinkage unsigned long sys_sigreturn(void); /* kernel/vm86_32.c */ +struct vm86_struct; asmlinkage long sys_vm86old(struct vm86_struct __user *); asmlinkage long sys_vm86(unsigned long, unsigned long); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 225ee545e..8afdc3e44 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -27,14 +27,17 @@ * Without this offset, that can result in a page fault. (We are * careful that, in this case, the value we read doesn't matter.) 
* - * In vm86 mode, the hardware frame is much longer still, but we neither - * access the extra members from NMI context, nor do we write such a - * frame at sp0 at all. + * In vm86 mode, the hardware frame is much longer still, so add 16 + * bytes to make room for the real-mode segments. * * x86_64 has a fixed-length stack frame. */ #ifdef CONFIG_X86_32 -# define TOP_OF_KERNEL_STACK_PADDING 8 +# ifdef CONFIG_VM86 +# define TOP_OF_KERNEL_STACK_PADDING 16 +# else +# define TOP_OF_KERNEL_STACK_PADDING 8 +# endif #else # define TOP_OF_KERNEL_STACK_PADDING 0 #endif @@ -140,27 +143,11 @@ struct thread_info { _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) -/* work to do in syscall_trace_leave() */ -#define _TIF_WORK_SYSCALL_EXIT \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ) - -/* work to do on interrupt/exception return */ -#define _TIF_WORK_MASK \ - (0x0000FFFF & \ - ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ - _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) - /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) -/* Only used for 64 bit */ -#define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ - _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) - /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cd791948b..6df202940 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ +/* Not inlined due to inc_irq_stat not being defined yet */ +#define flush_tlb_local() { \ + inc_irq_stat(irq_tlb_count); \ + local_flush_tlb(); \ +} + #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) 
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c5380bea2..c34966197 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void); asmlinkage void smp_deferred_error_interrupt(void); #endif -extern enum ctx_state ist_enter(struct pt_regs *regs); -extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); +extern void ist_enter(struct pt_regs *regs); +extern void ist_exit(struct pt_regs *regs); extern void ist_begin_non_atomic(struct pt_regs *regs); extern void ist_end_non_atomic(void); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 94605c0e9..6d7c5479b 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -21,28 +21,12 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { - unsigned long long ret = 0; - #ifndef CONFIG_X86_TSC if (!cpu_has_tsc) return 0; #endif - rdtscll(ret); - - return ret; -} -static __always_inline cycles_t vget_cycles(void) -{ - /* - * We only do VDSOs on TSC capable CPUs, so this shouldn't - * access boot_cpu_data (which is not VDSO-safe): - */ -#ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) - return 0; -#endif - return (cycles_t)__native_read_tsc(); + return rdtsc(); } extern void tsc_init(void); @@ -51,6 +35,7 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern int check_tsc_disabled(void); extern unsigned long native_calibrate_tsc(void); +extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 1d8de3f3f..1e491f3af 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_VM86_H #define _ASM_X86_VM86_H - #include <asm/ptrace.h> #include <uapi/asm/vm86.h> @@ -28,43 +27,49 @@ struct kernel_vm86_regs { unsigned short gs, __gsh; }; -struct 
kernel_vm86_struct { - struct kernel_vm86_regs regs; -/* - * the below part remains on the kernel stack while we are in VM86 mode. - * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we - * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above - * 'struct kernel_vm86_regs' with the then actual values. - * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct' - * in kernelspace, hence we need not reget the data from userspace. - */ -#define VM86_TSS_ESP0 flags +struct vm86 { + struct vm86plus_struct __user *user_vm86; + struct pt_regs regs32; + unsigned long veflags; + unsigned long veflags_mask; + unsigned long saved_sp0; + unsigned long flags; unsigned long screen_bitmap; unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; struct vm86plus_info_struct vm86plus; - struct pt_regs *regs32; /* here we save the pointer to the old regs */ -/* - * The below is not part of the structure, but the stack layout continues - * this way. In front of 'return-eip' may be some data, depending on - * compilation, so we don't rely on this and save the pointer to 'oldregs' - * in 'regs32' above. 
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this: - - long return-eip; from call to vm86() - struct pt_regs oldregs; user space registers as saved by syscall - */ }; #ifdef CONFIG_VM86 void handle_vm86_fault(struct kernel_vm86_regs *, long); int handle_vm86_trap(struct kernel_vm86_regs *, long, int); -struct pt_regs *save_v86_state(struct kernel_vm86_regs *); +void save_v86_state(struct kernel_vm86_regs *, int); struct task_struct; + +#define free_vm86(t) do { \ + struct thread_struct *__t = (t); \ + if (__t->vm86 != NULL) { \ + kfree(__t->vm86); \ + __t->vm86 = NULL; \ + } \ +} while (0) + +/* + * Support for VM86 programs to request interrupts for + * real mode hardware drivers: + */ +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 + +static inline int invalid_vm86_irq(int irq) +{ + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} + void release_vm86_irqs(struct task_struct *); #else @@ -77,6 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) return 0; } +static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { } + +#define free_vm86(t) do { } while(0) + #endif /* CONFIG_VM86 */ #endif /* _ASM_X86_VM86_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index da772edd1..448b7ca61 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -47,6 +47,7 @@ #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000 #define CPU_BASED_USE_MSR_BITMAPS 0x10000000 #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 @@ -367,29 +368,29 @@ enum vmcs_field { #define TYPE_PHYSICAL_APIC_EVENT (10 << 12) #define TYPE_PHYSICAL_APIC_INST (15 << 12) -/* segment AR */ -#define SEGMENT_AR_L_MASK (1 << 13) - -#define AR_TYPE_ACCESSES_MASK 1 -#define AR_TYPE_READABLE_MASK (1 << 1) -#define 
AR_TYPE_WRITEABLE_MASK (1 << 2) -#define AR_TYPE_CODE_MASK (1 << 3) -#define AR_TYPE_MASK 0x0f -#define AR_TYPE_BUSY_64_TSS 11 -#define AR_TYPE_BUSY_32_TSS 11 -#define AR_TYPE_BUSY_16_TSS 3 -#define AR_TYPE_LDT 2 - -#define AR_UNUSABLE_MASK (1 << 16) -#define AR_S_MASK (1 << 4) -#define AR_P_MASK (1 << 7) -#define AR_L_MASK (1 << 13) -#define AR_DB_MASK (1 << 14) -#define AR_G_MASK (1 << 15) -#define AR_DPL_SHIFT 5 -#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) - -#define AR_RESERVD_MASK 0xfffe0f00 +/* segment AR in VMCS -- these are different from what LAR reports */ +#define VMX_SEGMENT_AR_L_MASK (1 << 13) + +#define VMX_AR_TYPE_ACCESSES_MASK 1 +#define VMX_AR_TYPE_READABLE_MASK (1 << 1) +#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2) +#define VMX_AR_TYPE_CODE_MASK (1 << 3) +#define VMX_AR_TYPE_MASK 0x0f +#define VMX_AR_TYPE_BUSY_64_TSS 11 +#define VMX_AR_TYPE_BUSY_32_TSS 11 +#define VMX_AR_TYPE_BUSY_16_TSS 3 +#define VMX_AR_TYPE_LDT 2 + +#define VMX_AR_UNUSABLE_MASK (1 << 16) +#define VMX_AR_S_MASK (1 << 4) +#define VMX_AR_P_MASK (1 << 7) +#define VMX_AR_L_MASK (1 << 13) +#define VMX_AR_DB_MASK (1 << 14) +#define VMX_AR_G_MASK (1 << 15) +#define VMX_AR_DPL_SHIFT 5 +#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3) + +#define VMX_AR_RESERVD_MASK 0xfffe0f00 #define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0) #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1) diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 608a79d5a..e6911caf5 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. 
+ */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index ca08a27b9..4c20dd333 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -336,10 +336,10 @@ HYPERVISOR_update_descriptor(u64 ma, u64 desc) return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); } -static inline int +static inline long HYPERVISOR_memory_op(unsigned int cmd, void *arg) { - return _hypercall2(int, memory_op, cmd, arg); + return _hypercall2(long, memory_op, cmd, arg); } static inline int @@ -465,6 +465,12 @@ HYPERVISOR_tmem_op( return _hypercall1(int, tmem_op, op); } +static inline int +HYPERVISOR_xenpmu_op(unsigned int op, void *arg) +{ + return _hypercall2(int, xenpmu_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 3400dbaec..62ca03ef5 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -3,12 +3,38 @@ * * Guest OS interface to x86 Xen. * - * Copyright (c) 2004, K A Fraser + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser */ #ifndef _ASM_X86_XEN_INTERFACE_H #define _ASM_X86_XEN_INTERFACE_H +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. + * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. + */ #ifdef __XEN__ #define __DEFINE_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name @@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. + * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* - * Send an array of these to HYPERVISOR_set_trap_table() + * Send an array of these to HYPERVISOR_set_trap_table(). + * Terminate the array with a sentinel entry, with traps[].address==0. * The privilege level specifies which modes may enter a trap via a software * interrupt. 
On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: @@ -118,10 +147,41 @@ struct trap_info { DEFINE_GUEST_HANDLE_STRUCT(trap_info); struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ - unsigned long pfn_to_mfn_frame_list_list; - unsigned long nmi_reason; + /* + * Number of valid entries in the p2m table(s) anchored at + * pfn_to_mfn_frame_list_list and/or p2m_vaddr. + */ + unsigned long max_pfn; + /* + * Frame containing list of mfns containing list of mfns containing p2m. + * A value of 0 indicates it has not yet been set up, ~0 indicates it + * has been set to invalid e.g. due to the p2m being too large for the + * 3-level p2m tree. In this case the linear mapper p2m list anchored + * at p2m_vaddr is to be used. + */ + xen_pfn_t pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; + /* + * Following three fields are valid if p2m_cr3 contains a value + * different from 0. + * p2m_cr3 is the root of the address space where p2m_vaddr is valid. + * p2m_cr3 is in the same format as a cr3 value in the vcpu register + * state and holds the folded machine frame number (via xen_pfn_to_cr3) + * of a L3 or L4 page table. + * p2m_vaddr holds the virtual address of the linear p2m list. All + * entries in the range [0...max_pfn[ are accessible via this pointer. + * p2m_generation will be incremented by the guest before and after each + * change of the mappings of the p2m list. p2m_generation starts at 0 + * and a value with the least significant bit set indicates that a + * mapping update is in progress. This allows guest external software + * (e.g. in Dom0) to verify that read mappings are consistent and + * whether they have changed since the last check. + * Modifying a p2m element in the linear p2m list is allowed via an + * atomic write only. 
+ */ + unsigned long p2m_cr3; /* cr3 value of the p2m address space */ + unsigned long p2m_vaddr; /* virtual address of the p2m list */ + unsigned long p2m_generation; /* generation count of p2m mapping */ }; #endif /* !__ASSEMBLY__ */ @@ -137,13 +197,31 @@ struct arch_shared_info { /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0. */ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ -#define VGCF_I387_VALID (1<<0) -#define VGCF_HVM_GUEST (1<<1) -#define VGCF_IN_KERNEL (1<<2) +#define VGCF_I387_VALID (1<<0) +#define VGCF_IN_KERNEL (1<<2) +#define _VGCF_i387_valid 0 +#define VGCF_i387_valid (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel 2 +#define VGCF_in_kernel (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events 4 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) +#define _VGCF_online 5 +#define VGCF_online (1<<_VGCF_online) unsigned long flags; /* VGCF_* flags */ struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ @@ -172,6 +250,129 @@ struct vcpu_guest_context { #endif }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { + /* + * Offsets to counter and control 
MSRs (relative to xen_pmu_arch.c.amd). + * For PV(H) guests these fields are RO. + */ + uint32_t counters; + uint32_t ctrls; + + /* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { + uint64_t counter; + uint64_t control; +}; + +struct xen_pmu_intel_ctxt { + /* + * Offsets to fixed and architectural counter MSRs (relative to + * xen_pmu_arch.c.intel). + * For PV(H) guests these fields are RO. + */ + uint32_t fixed_counters; + uint32_t arch_counters; + + /* PMU registers */ + uint64_t global_ctrl; + uint64_t global_ovf_ctrl; + uint64_t global_status; + uint64_t fixed_ctrl; + uint64_t ds_area; + uint64_t pebs_enable; + uint64_t debugctl; + + /* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Sampled domain's registers */ +struct xen_pmu_regs { + uint64_t ip; + uint64_t sp; + uint64_t flags; + uint16_t cs; + uint16_t ss; + uint8_t cpl; + uint8_t pad[3]; +}; + +/* PMU flags */ +#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { + union { + /* + * Processor's registers at the time of interrupt. + * WO for hypervisor, RO for guests. 
+ */ + struct xen_pmu_regs regs; + /* + * Padding for adding new registers to xen_pmu_regs in + * the future + */ +#define XENPMU_REGS_PAD_SZ 64 + uint8_t pad[XENPMU_REGS_PAD_SZ]; + } r; + + /* WO for hypervisor, RO for guest */ + uint64_t pmu_flags; + + /* + * APIC LVTPC register. + * RW for both hypervisor and guest. + * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware + * during XENPMU_flush or XENPMU_lvtpc_set. + */ + union { + uint32_t lapic_lvtpc; + uint64_t pad; + } l; + + /* + * Vendor-specific PMU registers. + * RW for both hypervisor and guest (see exceptions above). + * Guest's updates to this field are verified and then loaded by the + * hypervisor into hardware during XENPMU_flush + */ + union { + struct xen_pmu_amd_ctxt amd; + struct xen_pmu_intel_ctxt intel; + + /* + * Padding for contexts (fixed parts only, does not include + * MSR banks that are specified by offsets) + */ +#define XENPMU_CTXT_PAD_SZ 128 + uint8_t pad[XENPMU_CTXT_PAD_SZ]; + } c; +}; + #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c44a5d53e..0679e11d2 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -35,9 +35,7 @@ typedef struct xpaddr { #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) -/* Maximum amount of memory we can handle in a domain in pages */ -#define MAX_DOMAIN_PAGES \ - ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; @@ -48,8 +46,8 @@ extern unsigned long xen_max_p2m_pfn; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern unsigned long 
set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, @@ -103,6 +101,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; + /* + * Some x86 code are still using pfn_to_mfn instead of + * pfn_to_mfn. This will have to be removed when we figured + * out which call. + */ if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; @@ -149,6 +152,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) { unsigned long pfn; + /* + * Some x86 code are still using mfn_to_pfn instead of + * gfn_to_pfn. This will have to be removed when we figure + * out which call. + */ if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; @@ -178,6 +186,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); } +/* Pseudo-physical <-> Guest conversion */ +static inline unsigned long pfn_to_gfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + else + return pfn_to_mfn(pfn); +} + +static inline unsigned long gfn_to_pfn(unsigned long gfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return gfn; + else + return mfn_to_pfn(gfn); +} + +/* Pseudo-physical <-> Bus conversion */ +#define pfn_to_bfn(pfn) pfn_to_gfn(pfn) +#define bfn_to_pfn(bfn) gfn_to_pfn(bfn) + /* * We detect special mappings in one of two ways: * 1. If the MFN is an I/O page then Xen will set the m2p entry @@ -198,7 +227,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) * require. In all the cases we care about, the FOREIGN_FRAME bit is * masked (e.g., pfn_to_mfn()) so behaviour there is correct. 
*/ -static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +static inline unsigned long bfn_to_local_pfn(unsigned long mfn) { unsigned long pfn; @@ -217,6 +246,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) +/* VIRT <-> GUEST conversion */ +#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v))) +#define gfn_to_virt(g) (__va(gfn_to_pfn(g) << PAGE_SHIFT)) + static inline unsigned long pte_mfn(pte_t pte) { return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; @@ -264,7 +297,7 @@ void make_lowmem_page_readwrite(void *vaddr); static inline bool xen_arch_need_swiotlb(struct device *dev, unsigned long pfn, - unsigned long mfn) + unsigned long bfn) { return false; } diff --git a/arch/x86/include/uapi/asm/bitsperlong.h b/arch/x86/include/uapi/asm/bitsperlong.h index b0ae1c4dc..217909b4d 100644 --- a/arch/x86/include/uapi/asm/bitsperlong.h +++ b/arch/x86/include/uapi/asm/bitsperlong.h @@ -1,7 +1,7 @@ #ifndef __ASM_X86_BITSPERLONG_H #define __ASM_X86_BITSPERLONG_H -#ifdef __x86_64__ +#if defined(__x86_64__) && !defined(__ILP32__) # define __BITS_PER_LONG 64 #else # define __BITS_PER_LONG 32 diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index ab456dc23..329254373 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -120,7 +120,7 @@ struct boot_params { __u8 _pad3[16]; /* 0x070 */ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ - struct sys_desc_table sys_desc_table; /* 0x0a0 */ + struct sys_desc_table sys_desc_table; /* obsolete! 
*/ /* 0x0a0 */ struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */ __u32 ext_ramdisk_image; /* 0x0c0 */ __u32 ext_ramdisk_size; /* 0x0c4 */ diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 0f457e6ea..9dafe59cf 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -37,7 +37,7 @@ /* * This is a non-standardized way to represent ADR or NVDIMM regions that * persist over a reboot. The kernel will ignore their special capabilities - * unless the CONFIG_X86_PMEM_LEGACY=y option is set. + * unless the CONFIG_X86_PMEM_LEGACY option is set. * * ( Note that older platforms also used 6 for the same type of memory, * but newer versions switched to 12 as 6 was assigned differently. Some diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f36d56bd7..f0412c50c 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -27,6 +27,8 @@ #define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ #define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* Partition reference TSC MSR is available */ +#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) /* A partition's reference time stamp counter (TSC) page */ #define HV_X64_MSR_REFERENCE_TSC 0x40000021 diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index a0eab85ce..76880ede9 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -15,7 +15,8 @@ struct mce { __u64 time; /* wall time_t when error was detected */ __u8 cpuvendor; /* cpu vendor as encoded in system.h */ __u8 inject_flags; /* software inject flags */ - __u16 pad; + __u8 severity; + __u8 usable_addr; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 
180a0c3c2..79887abcb 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -37,8 +37,6 @@ #define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT) #define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ #define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) -#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ -#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) #define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */ #define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT) #define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index b5d7640ab..8a4add8e4 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -100,6 +100,7 @@ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ { SVM_EXIT_INTR, "interrupt" }, \ { SVM_EXIT_NMI, "nmi" }, \ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 1fe92181e..37fee2726 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -58,6 +58,7 @@ #define EXIT_REASON_INVALID_STATE 33 #define EXIT_REASON_MSR_LOAD_FAIL 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_TRAP_FLAG 37 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 @@ -106,6 +107,7 @@ { EXIT_REASON_MSR_READ, "MSR_READ" }, \ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, 
\ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0f15af41b..b1b78ffe0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,8 +23,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o +obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o +obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o @@ -69,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o -obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o -obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ @@ -92,7 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o -obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o +obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o @@ -107,8 +109,6 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o -obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o -obj-$(CONFIG_PMC_ATOM) += pmc_atom.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 939389671..ded848c20 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -711,7 +711,7 @@ static void 
acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) #endif } -static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) { int cpu; @@ -727,12 +727,6 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) *pcpu = cpu; return 0; } - -/* wrapper to silence section mismatch warning */ -int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) -{ - return _acpi_map_lsapic(handle, physid, pcpu); -} EXPORT_SYMBOL(acpi_map_cpu); int acpi_unmap_cpu(int cpu) diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index ede92c336..222a57076 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -263,7 +263,7 @@ static int apbt_clocksource_register(void) /* Verify whether apbt counter works */ t1 = dw_apb_clocksource_read(clocksource_apbt); - rdtscll(start); + start = rdtsc(); /* * We don't know the TSC frequency yet, but waiting for @@ -273,7 +273,7 @@ static int apbt_clocksource_register(void) */ do { rep_nop(); - rdtscll(now); + now = rdtsc(); } while ((now - start) < 200000UL); /* APBT is the only always on clocksource, it has to work! 
*/ @@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void) old = dw_apb_clocksource_read(clocksource_apbt); old += loop; - t1 = __native_read_tsc(); + t1 = rdtsc(); do { new = dw_apb_clocksource_read(clocksource_apbt); } while (new < old); - t2 = __native_read_tsc(); + t2 = rdtsc(); shift = 5; if (unlikely(loop >> shift == 0)) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 307a49828..24e94ce45 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -464,45 +464,45 @@ static int lapic_next_deadline(unsigned long delta, { u64 tsc; - rdtscll(tsc); + tsc = rdtsc(); wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; } -/* - * Setup the lapic timer in periodic or oneshot mode - */ -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) +static int lapic_timer_shutdown(struct clock_event_device *evt) { - unsigned long flags; unsigned int v; /* Lapic used as dummy for broadcast ? */ if (evt->features & CLOCK_EVT_FEAT_DUMMY) - return; + return 0; - local_irq_save(flags); + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + apic_write(APIC_TMICT, 0); + return 0; +} - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(lapic_timer_frequency, - mode != CLOCK_EVT_MODE_PERIODIC, 1); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - v = apic_read(APIC_LVTT); - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0); - break; - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } +static inline int +lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot) +{ + /* Lapic used as dummy for broadcast ? 
*/ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return 0; - local_irq_restore(flags); + __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1); + return 0; +} + +static int lapic_timer_set_periodic(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, false); +} + +static int lapic_timer_set_oneshot(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, true); } /* @@ -520,15 +520,18 @@ static void lapic_timer_broadcast(const struct cpumask *mask) * The local apic timer can be used for any function which is CPU local. */ static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP + | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_state_shutdown = lapic_timer_shutdown, + .set_state_periodic = lapic_timer_set_periodic, + .set_state_oneshot = lapic_timer_set_oneshot, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); @@ -599,7 +602,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) unsigned long pm = acpi_pm_read_early(); if (cpu_has_tsc) - rdtscll(tsc); + tsc = rdtsc(); switch (lapic_cal_loops++) { case 0: @@ -785,7 +788,7 @@ static int __init calibrate_APIC_clock(void) * Setup the apic timer manually */ levt->event_handler = lapic_cal_handler; - lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_timer_set_periodic(levt); lapic_cal_loops = -1; /* Let the interrupts run */ @@ -795,7 +798,8 @@ static int __init calibrate_APIC_clock(void) cpu_relax(); /* Stop the lapic timer */ - 
lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + local_irq_disable(); + lapic_timer_shutdown(levt); /* Jiffies delta */ deltaj = lapic_cal_j2 - lapic_cal_j1; @@ -806,8 +810,8 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); else levt->features |= CLOCK_EVT_FEAT_DUMMY; - } else - local_irq_enable(); + } + local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { pr_warning("APIC timer disabled due to verification failure\n"); @@ -885,7 +889,7 @@ static void local_apic_timer_interrupt(void) if (!evt->event_handler) { pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + lapic_timer_shutdown(evt); return; } @@ -1216,7 +1220,7 @@ void setup_local_APIC(void) long long max_loops = cpu_khz ? cpu_khz : 1000000; if (cpu_has_tsc) - rdtscll(tsc); + tsc = rdtsc(); if (disable_apic) { disable_ioapic_support(); @@ -1300,7 +1304,7 @@ void setup_local_APIC(void) } if (queued) { if (cpu_has_tsc && cpu_khz) { - rdtscll(ntsc); + ntsc = rdtsc(); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else max_loops--; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index de918c410..f92ab3697 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -191,7 +191,6 @@ static struct apic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, - .wait_for_init_deassert = false, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, @@ -299,7 +298,6 @@ static struct apic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, - .wait_for_init_deassert = false, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b205cdbdb..0d96749cf 100644 --- a/arch/x86/kernel/apic/apic_noop.c 
+++ b/arch/x86/kernel/apic/apic_noop.c @@ -152,7 +152,6 @@ struct apic apic_noop = { .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, .read = noop_apic_read, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 017149cde..b548fd3b7 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -92,7 +92,6 @@ static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); - atomic_set(&init_deasserted, 1); return 0; } @@ -235,7 +234,6 @@ static const struct apic apic_numachip __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index c4a8d63f8..971cf8875 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -186,7 +186,6 @@ static struct apic apic_bigsmp = { .send_IPI_all = bigsmp_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .wait_for_init_deassert = true, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 6873ab925..045e424fb 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #endif #ifdef arch_trigger_all_cpu_backtrace -/* For reliability, we're prepared to waste bits here. 
*/ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); - -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ -static unsigned long backtrace_flag; - -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +static void nmi_raise_cpu_backtrace(cpumask_t *mask) { - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); + apic->send_IPI_mask(mask, NMI_VECTOR); } void arch_trigger_all_cpu_backtrace(bool include_self) { - struct nmi_seq_buf *s; - int len; - int cpu; - int i; - int this_cpu = get_cpu(); - - if (test_and_set_bit(0, &backtrace_flag)) { - /* - * If there is already a trigger_all_cpu_backtrace() in progress - * (backtrace_flag == 1), don't output double cpu dump infos. - */ - put_cpu(); - return; - } - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) - cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - - if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("sending NMI to %s CPUs:\n", - (include_self ? "all" : "other")); - apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); - } - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - touch_softlockup_watchdog(); - } - - /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. 
- */ - for_each_cpu(cpu, &printtrace_mask) { - int last_i = 0; - - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); - put_cpu(); -} - -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. 
- */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; + nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace); } static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { - int cpu; - - cpu = smp_processor_id(); - - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - this_cpu_write(printk_func, printk_func_save); - - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + if (nmi_cpu_backtrace(regs)) return NMI_HANDLED; - } return NMI_DONE; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5880b482d..4f2821527 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2542,12 +2542,14 @@ void __init setup_ioapic_dest(void) * Honour affinities which have been set in early boot */ if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) - mask = idata->affinity; + mask = irq_data_get_affinity_mask(idata); else mask = apic->target_cpus(); chip = irq_data_get_irq_chip(idata); - chip->irq_set_affinity(idata, mask, false); + /* Might be lapic_chip for irq 0 */ + if (chip->irq_set_affinity) + chip->irq_set_affinity(idata, mask, false); } } #endif @@ -2907,6 +2909,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, struct irq_data *irq_data; struct mp_chip_data *data; struct irq_alloc_info *info = arg; + unsigned long flags; if (!info || nr_irqs > 1) return -EINVAL; @@ -2939,11 +2942,14 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, cfg = irqd_cfg(irq_data); add_pin_to_irq_node(data, 
ioapic_alloc_attr_node(info), ioapic, pin); + + local_irq_save(flags); if (info->ioapic_entry) mp_setup_entry(cfg, data, info->ioapic_entry); mp_register_handler(virq, data->trigger); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); + local_irq_restore(flags); apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 1a9d735e0..5f1feb685 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -264,7 +264,7 @@ static inline int hpet_dev_id(struct irq_domain *domain) static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { - hpet_msi_write(data->handler_data, msg); + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); } static struct irq_chip hpet_msi_controller = { diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index bda488680..7694ae6c1 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -111,7 +111,6 @@ static struct apic apic_default = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .wait_for_init_deassert = true, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 2683f36e4..861bc59c8 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -169,8 +169,7 @@ next: goto next; for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) { - if (per_cpu(vector_irq, new_cpu)[vector] > - VECTOR_UNDEFINED) + if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector])) goto next; } /* Found one! 
*/ @@ -182,7 +181,7 @@ next: cpumask_intersects(d->old_domain, cpu_online_mask); } for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; + per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq); d->cfg.vector = vector; cpumask_copy(d->domain, vector_cpumask); err = 0; @@ -224,15 +223,16 @@ static int assign_irq_vector_policy(int irq, int node, static void clear_irq_vector(int irq, struct apic_chip_data *data) { - int cpu, vector; + struct irq_desc *desc; unsigned long flags; + int cpu, vector; raw_spin_lock_irqsave(&vector_lock, flags); BUG_ON(!data->cfg.vector); vector = data->cfg.vector; for_each_cpu_and(cpu, data->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; data->cfg.vector = 0; cpumask_clear(data->domain); @@ -242,12 +242,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data) return; } + desc = irq_to_desc(irq); for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - if (per_cpu(vector_irq, cpu)[vector] != irq) + if (per_cpu(vector_irq, cpu)[vector] != desc) continue; - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; break; } } @@ -296,7 +297,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, struct irq_alloc_info *info = arg; struct apic_chip_data *data; struct irq_data *irq_data; - int i, err; + int i, err, node; if (disable_apic) return -ENXIO; @@ -308,12 +309,13 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); + node = irq_data_get_node(irq_data); #ifdef CONFIG_X86_IO_APIC if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) data = legacy_irq_data[virq + i]; else #endif - data = 
alloc_apic_chip_data(irq_data->node); + data = alloc_apic_chip_data(node); if (!data) { err = -ENOMEM; goto error; @@ -322,8 +324,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector_policy(virq + i, irq_data->node, data, - info); + err = assign_irq_vector_policy(virq + i, node, data, info); if (err) goto error; } @@ -360,7 +361,11 @@ int __init arch_probe_nr_irqs(void) if (nr < nr_irqs) nr_irqs = nr; - return nr_legacy_irqs(); + /* + * We don't know if PIC is present at this point so we need to do + * probe() to get the right number of legacy IRQs. + */ + return legacy_pic->probe(); } #ifdef CONFIG_X86_IO_APIC @@ -403,32 +408,32 @@ int __init arch_early_irq_init(void) return arch_early_ioapic_init(); } +/* Initialize vector_irq on a new cpu */ static void __setup_vector_irq(int cpu) { - /* Initialize vector_irq on a new cpu */ - int irq, vector; struct apic_chip_data *data; + struct irq_desc *desc; + int irq, vector; /* Mark the inuse vectors */ - for_each_active_irq(irq) { - data = apic_chip_data(irq_get_irq_data(irq)); - if (!data) - continue; + for_each_irq_desc(irq, desc) { + struct irq_data *idata = irq_desc_get_irq_data(desc); - if (!cpumask_test_cpu(cpu, data->domain)) + data = apic_chip_data(idata); + if (!data || !cpumask_test_cpu(cpu, data->domain)) continue; vector = data->cfg.vector; - per_cpu(vector_irq, cpu)[vector] = irq; + per_cpu(vector_irq, cpu)[vector] = desc; } /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq <= VECTOR_UNDEFINED) + desc = per_cpu(vector_irq, cpu)[vector]; + if (IS_ERR_OR_NULL(desc)) continue; - data = apic_chip_data(irq_get_irq_data(irq)); + data = apic_chip_data(irq_desc_get_irq_data(desc)); if (!cpumask_test_cpu(cpu, data->domain)) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + 
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; } } @@ -448,7 +453,7 @@ void setup_vector_irq(int cpu) * legacy vector to irq mapping: */ for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq; + per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq); __setup_vector_irq(cpu); } @@ -488,9 +493,8 @@ static int apic_set_affinity(struct irq_data *irq_data, err = assign_irq_vector(irq, data, dest); if (err) { - struct irq_data *top = irq_get_irq_data(irq); - - if (assign_irq_vector(irq, data, top->affinity)) + if (assign_irq_vector(irq, data, + irq_data_get_affinity_mask(irq_data))) pr_err("Failed to recover vector for irq %d\n", irq); return err; } @@ -538,27 +542,30 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) entering_ack_irq(); + /* Prevent vectors vanishing under us */ + raw_spin_lock(&vector_lock); + me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - int irq; - unsigned int irr; - struct irq_desc *desc; struct apic_chip_data *data; + struct irq_desc *desc; + unsigned int irr; - irq = __this_cpu_read(vector_irq[vector]); - - if (irq <= VECTOR_UNDEFINED) + retry: + desc = __this_cpu_read(vector_irq[vector]); + if (IS_ERR_OR_NULL(desc)) continue; - desc = irq_to_desc(irq); - if (!desc) - continue; + if (!raw_spin_trylock(&desc->lock)) { + raw_spin_unlock(&vector_lock); + cpu_relax(); + raw_spin_lock(&vector_lock); + goto retry; + } - data = apic_chip_data(&desc->irq_data); + data = apic_chip_data(irq_desc_get_irq_data(desc)); if (!data) - continue; - - raw_spin_lock(&desc->lock); + goto unlock; /* * Check if the irq migration is in progress. 
If so, we @@ -583,11 +590,13 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); goto unlock; } - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); unlock: raw_spin_unlock(&desc->lock); } + raw_spin_unlock(&vector_lock); + exiting_irq(); } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index ab3219b3f..cc8311c4d 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -182,7 +182,7 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) return notifier_from_errno(err); } -static struct notifier_block __refdata x2apic_cpu_notifier = { +static struct notifier_block x2apic_cpu_notifier = { .notifier_call = update_clusterinfo, }; @@ -272,7 +272,6 @@ static struct apic apic_x2apic_cluster = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 3ffd92565..662e9150e 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -128,7 +128,6 @@ static struct apic apic_x2apic_phys = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c8d92950b..4a139465f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -248,7 +248,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) APIC_DM_STARTUP; uv_write_global_mmr64(pnode, UVH_IPI_INT, val); - atomic_set(&init_deasserted, 1); return 0; } @@ -414,7 +413,6 @@ static struct apic __refdata 
apic_x2apic_uv_x = { .send_IPI_self = uv_send_IPI_self, .wakeup_secondary_cpu = uv_wakeup_secondary, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 927ec9235..052c9c302 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -919,7 +919,7 @@ recalc: } else if (jiffies_since_last_check > idle_period) { unsigned int idle_percentage; - idle_percentage = stime - last_stime; + idle_percentage = cputime_to_jiffies(stime - last_stime); idle_percentage *= 100; idle_percentage /= jiffies_since_last_check; use_apm_idle = (idle_percentage > idle_threshold); diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 58118e207..145863d4d 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/init.h> #include <linux/sched.h> #include <linux/kthread.h> #include <linux/workqueue.h> @@ -163,6 +163,5 @@ static int start_periodic_check_for_corruption(void) schedule_delayed_work(&bios_check_work, 0); return 0; } - -module_init(start_periodic_check_for_corruption); +device_initcall(start_periodic_check_for_corruption); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9bff68798..4eb065c6b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ perf_event_intel_uncore_snbep.o \ perf_event_intel_uncore_nhmex.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dd3a4baff..4a70fc6d4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -11,6 +11,7 @@ #include <asm/cpu.h> #include <asm/smp.h> #include <asm/pci-direct.h> +#include <asm/delay.h> #ifdef CONFIG_X86_64 # 
include <asm/mmconfig.h> @@ -114,7 +115,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) const int K6_BUG_LOOP = 1000000; int n; void (*f_vide)(void); - unsigned long d, d2; + u64 d, d2; printk(KERN_INFO "AMD K6 stepping B detected - "); @@ -125,10 +126,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c) n = K6_BUG_LOOP; f_vide = vide; - rdtscl(d); + d = rdtsc(); while (n--) f_vide(); - rdtscl(d2); + d2 = rdtsc(); d = d2-d; if (d > 20*K6_BUG_LOOP) @@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) /* A random value per boot for bit slice [12:upper_bit) */ va_align.bits = get_random_int() & va_align.mask; } + + if (cpu_has(c, X86_FEATURE_MWAITX)) + use_mwaitx_delay(); } static void early_init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb9e5df42..1a292573d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include <linux/kgdb.h> #include <linux/smp.h> #include <linux/io.h> +#include <linux/syscore_ops.h> #include <asm/stackprotector.h> #include <asm/perf_event.h> @@ -272,10 +273,9 @@ __setup("nosmap", setup_disable_smap); static __always_inline void setup_smap(struct cpuinfo_x86 *c) { - unsigned long eflags; + unsigned long eflags = native_save_fl(); /* This should have been cleared long ago */ - raw_local_save_flags(eflags); BUG_ON(eflags & X86_EFLAGS_AC); if (cpu_has(c, X86_FEATURE_SMAP)) { @@ -1109,10 +1109,10 @@ void print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "%d86", c->x86); - printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); + printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model); if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); + printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask); else printk(KERN_CONT ")\n"); @@ -1185,10 +1185,10 @@ void syscall_init(void) * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. 
*/ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. @@ -1199,7 +1199,7 @@ void syscall_init(void) wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else - wrmsrl(MSR_CSTAR, ignore_sysret); + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); @@ -1488,3 +1488,20 @@ inline bool __static_cpu_has_safe(u16 bit) return boot_cpu_has(bit); } EXPORT_SYMBOL_GPL(__static_cpu_has_safe); + +static void bsp_resume(void) +{ + if (this_cpu->c_bsp_resume) + this_cpu->c_bsp_resume(&boot_cpu_data); +} + +static struct syscore_ops cpu_syscore_ops = { + .resume = bsp_resume, +}; + +static int __init init_cpu_syscore(void) +{ + register_syscore_ops(&cpu_syscore_ops); + return 0; +} +core_initcall(init_cpu_syscore); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c37dc37e8..2584265d4 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -13,6 +13,7 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); + void (*c_bsp_resume)(struct cpuinfo_x86 *); int c_x86_vendor; #ifdef CONFIG_X86_32 /* Optional vendor specific routine to obtain the cache size. 
*/ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 50163fa90..98a13db5f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -371,6 +371,36 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) } } +static void init_intel_energy_perf(struct cpuinfo_x86 *c) +{ + u64 epb; + + /* + * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. + * (x86_energy_perf_policy(8) is available to change it at run-time.) + */ + if (!cpu_has(c, X86_FEATURE_EPB)) + return; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) + return; + + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); + epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); +} + +static void intel_bsp_resume(struct cpuinfo_x86 *c) +{ + /* + * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, + * so reinitialize it properly like during bootup: + */ + init_intel_energy_perf(c); +} + static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -478,21 +508,7 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); - /* - * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. 
- * x86_energy_perf_policy(8) is available to change it at run-time - */ - if (cpu_has(c, X86_FEATURE_EPB)) { - u64 epb; - - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) { - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); - epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - } - } + init_intel_energy_perf(c); } #ifdef CONFIG_X86_32 @@ -747,6 +763,7 @@ static const struct cpu_dev intel_cpu_dev = { .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, + .c_bsp_resume = intel_bsp_resume, .c_x86_vendor = X86_VENDOR_INTEL, }; diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h index 1c338b0eb..336878a5d 100644 --- a/arch/x86/kernel/cpu/intel_pt.h +++ b/arch/x86/kernel/cpu/intel_pt.h @@ -25,32 +25,11 @@ */ #define TOPA_PMI_MARGIN 512 -/* - * Table of Physical Addresses bits - */ -enum topa_sz { - TOPA_4K = 0, - TOPA_8K, - TOPA_16K, - TOPA_32K, - TOPA_64K, - TOPA_128K, - TOPA_256K, - TOPA_512K, - TOPA_1MB, - TOPA_2MB, - TOPA_4MB, - TOPA_8MB, - TOPA_16MB, - TOPA_32MB, - TOPA_64MB, - TOPA_128MB, - TOPA_SZ_END, -}; +#define TOPA_SHIFT 12 -static inline unsigned int sizes(enum topa_sz tsz) +static inline unsigned int sizes(unsigned int tsz) { - return 1 << (tsz + 12); + return 1 << (tsz + TOPA_SHIFT); }; struct topa_entry { @@ -66,20 +45,26 @@ struct topa_entry { u64 rsvd4 : 16; }; -#define TOPA_SHIFT 12 -#define PT_CPUID_LEAVES 2 +#define PT_CPUID_LEAVES 2 +#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, + PT_CAP_psb_cyc, + PT_CAP_mtc, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, + PT_CAP_single_range_output, PT_CAP_payloads_lip, + PT_CAP_mtc_periods, + PT_CAP_cycle_thresholds, + PT_CAP_psb_periods, }; struct pt_pmu { struct 
pmu pmu; - u32 caps[4 * PT_CPUID_LEAVES]; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; }; /** diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index bb34b03af..a3311c886 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,4 +1,4 @@ -obj-y = mce.o mce-severity.o +obj-y = mce.o mce-severity.o mce-genpool.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index a1aef9533..34c89a3e8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) m.addr = mem_err->physical_addr; mce_log(&m); - mce_notify_irq(); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c new file mode 100644 index 000000000..0a850100c --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -0,0 +1,99 @@ +/* + * MCE event pool management in MCE context + * + * Copyright (C) 2015 Intel Corp. + * Author: Chen, Gong <gong.chen@linux.intel.com> + * + * This file is licensed under GPLv2. + */ +#include <linux/smp.h> +#include <linux/mm.h> +#include <linux/genalloc.h> +#include <linux/llist.h> +#include "mce-internal.h" + +/* + * printk() is not safe in MCE context. This is a lock-less memory allocator + * used to save error information organized in a lock-less list. + * + * This memory pool is only to be used to save MCE records in MCE context. + * MCE events are rare, so a fixed size memory pool should be enough. Use + * 2 pages to save MCE events for now (~80 MCE records at most). 
+ */ +#define MCE_POOLSZ (2 * PAGE_SIZE) + +static struct gen_pool *mce_evt_pool; +static LLIST_HEAD(mce_event_llist); +static char gen_pool_buf[MCE_POOLSZ]; + +void mce_gen_pool_process(void) +{ + struct llist_node *head; + struct mce_evt_llist *node; + struct mce *mce; + + head = llist_del_all(&mce_event_llist); + if (!head) + return; + + head = llist_reverse_order(head); + llist_for_each_entry(node, head, llnode) { + mce = &node->mce; + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); + } +} + +bool mce_gen_pool_empty(void) +{ + return llist_empty(&mce_event_llist); +} + +int mce_gen_pool_add(struct mce *mce) +{ + struct mce_evt_llist *node; + + if (!mce_evt_pool) + return -EINVAL; + + node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); + if (!node) { + pr_warn_ratelimited("MCE records pool full!\n"); + return -ENOMEM; + } + + memcpy(&node->mce, mce, sizeof(*mce)); + llist_add(&node->llnode, &mce_event_llist); + + return 0; +} + +static int mce_gen_pool_create(void) +{ + struct gen_pool *tmpp; + int ret = -ENOMEM; + + tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); + if (!tmpp) + goto out; + + ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); + if (ret) { + gen_pool_destroy(tmpp); + goto out; + } + + mce_evt_pool = tmpp; + +out: + return ret; +} + +int mce_gen_pool_init(void) +{ + /* Just init mce_gen_pool once. 
*/ + if (mce_evt_pool) + return 0; + + return mce_gen_pool_create(); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fe32074b8..547720efd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,6 +13,8 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +extern struct atomic_notifier_head x86_mce_decoder_chain; + #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ @@ -24,6 +26,16 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; +struct mce_evt_llist { + struct llist_node llnode; + struct mce mce; +}; + +void mce_gen_pool_process(void); +bool mce_gen_pool_empty(void); +int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_init(void); + extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); @@ -67,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id) return -EINVAL; } #endif + +void mce_inject_log(struct mce *m); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index df919ff10..9d014b82a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -52,11 +52,11 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); -#define rcu_dereference_check_mce(p) \ +#define mce_log_get_idx_check(p) \ ({ \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious rcu_dereference_check_mce() usage"); \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious mce_log_get_idx_check() usage"); \ smp_load_acquire(&(p)); \ }) @@ -110,22 +110,24 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { */ mce_banks_t mce_banks_ce_disabled; -static DEFINE_PER_CPU(struct work_struct, mce_work); +static struct work_struct mce_work; +static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, 
struct mce *m, struct pt_regs *regs); +static int mce_usable_address(struct mce *m); /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. */ -static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - rdtscll(m->tsc); + m->tsc = rdtsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -157,12 +159,13 @@ void mce_log(struct mce *mce) /* Emit the trace record: */ trace_mce_record(mce); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (!mce_gen_pool_add(mce)) + irq_work_queue(&mce_irq_work); mce->finished = 0; wmb(); for (;;) { - entry = rcu_dereference_check_mce(mcelog.next); + entry = mce_log_get_idx_check(mcelog.next); for (;;) { /* @@ -196,48 +199,23 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } -static void drain_mcelog_buffer(void) +void mce_inject_log(struct mce *m) { - unsigned int next, i, prev = 0; - - next = ACCESS_ONCE(mcelog.next); - - do { - struct mce *m; - - /* drain what was logged during boot */ - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - unsigned retries = 1; - - m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2*retries)) - retries++; - - cpu_relax(); - - if (!m->finished && retries >= 4) { - pr_err("skipping error being logged currently!\n"); - break; - } - } - smp_rmb(); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - } - - memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); + mutex_lock(&mce_chrdev_read_mutex); + mce_log(m); + mutex_unlock(&mce_chrdev_read_mutex); } +EXPORT_SYMBOL_GPL(mce_inject_log); +static struct 
notifier_block mce_srao_nb; void mce_register_decode_chain(struct notifier_block *nb) { + /* Ensure SRAO notifier has the highest priority in the decode chain. */ + if (nb != &mce_srao_nb && nb->priority == INT_MAX) + nb->priority -= 1; + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); - drain_mcelog_buffer(); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -461,61 +439,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) } } -/* - * Simple lockless ring to communicate PFNs from the exception handler with the - * process context work function. This is vastly simplified because there's - * only a single reader and a single writer. - */ -#define MCE_RING_SIZE 16 /* we use one entry less */ - -struct mce_ring { - unsigned short start; - unsigned short end; - unsigned long ring[MCE_RING_SIZE]; -}; -static DEFINE_PER_CPU(struct mce_ring, mce_ring); - -/* Runs with CPU affinity in workqueue */ -static int mce_ring_empty(void) -{ - struct mce_ring *r = this_cpu_ptr(&mce_ring); - - return r->start == r->end; -} - -static int mce_ring_get(unsigned long *pfn) -{ - struct mce_ring *r; - int ret = 0; - - *pfn = 0; - get_cpu(); - r = this_cpu_ptr(&mce_ring); - if (r->start == r->end) - goto out; - *pfn = r->ring[r->start]; - r->start = (r->start + 1) % MCE_RING_SIZE; - ret = 1; -out: - put_cpu(); - return ret; -} - -/* Always runs in MCE context with preempt off */ -static int mce_ring_add(unsigned long pfn) -{ - struct mce_ring *r = this_cpu_ptr(&mce_ring); - unsigned next; - - next = (r->end + 1) % MCE_RING_SIZE; - if (next == r->start) - return -1; - r->ring[r->end] = pfn; - wmb(); - r->end = next; - return 0; -} - int mce_available(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) @@ -525,12 +448,10 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { - if (!mce_ring_empty()) - schedule_work(this_cpu_ptr(&mce_work)); + if (!mce_gen_pool_empty() && keventd_up()) + schedule_work(&mce_work); } -static 
DEFINE_PER_CPU(struct irq_work, mce_irq_work); - static void mce_irq_work_cb(struct irq_work *entry) { mce_notify_irq(); @@ -551,8 +472,29 @@ static void mce_report_event(struct pt_regs *regs) return; } - irq_work_queue(this_cpu_ptr(&mce_irq_work)); + irq_work_queue(&mce_irq_work); +} + +static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned long pfn; + + if (!mce) + return NOTIFY_DONE; + + if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) { + pfn = mce->addr >> PAGE_SHIFT; + memory_failure(pfn, MCE_VECTOR, 0); + } + + return NOTIFY_OK; } +static struct notifier_block mce_srao_nb = { + .notifier_call = srao_decode_notifier, + .priority = INT_MAX, +}; /* * Read ADDR and MISC registers. @@ -672,8 +614,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { if (m.status & MCI_STATUS_ADDRV) { - mce_ring_add(m.addr >> PAGE_SHIFT); - mce_schedule_work(); + m.severity = severity; + m.usable_addr = mce_usable_address(&m); + + if (!mce_gen_pool_add(&m)) + mce_schedule_work(); } } @@ -1029,7 +974,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) { struct mca_config *cfg = &mca_cfg; struct mce m, *final; - enum ctx_state prev_state; int i; int worst = 0; int severity; @@ -1055,7 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) int flags = MF_ACTION_REQUIRED; int lmce = 0; - prev_state = ist_enter(regs); + ist_enter(regs); this_cpu_inc(mce_exception_count); @@ -1143,15 +1087,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_read_aux(&m, i); - /* - * Action optional error. Queue address for later processing. - * When the ring overflows we just ignore the AO error. - * RED-PEN add some logging mechanism when - * usable_address or mce_add_ring fails. 
- * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 - */ - if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) - mce_ring_add(m.addr >> PAGE_SHIFT); + /* assuming valid severity level != 0 */ + m.severity = severity; + m.usable_addr = mce_usable_address(&m); mce_log(&m); @@ -1227,7 +1165,7 @@ out: local_irq_disable(); ist_end_non_atomic(); done: - ist_exit(regs, prev_state); + ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1247,14 +1185,11 @@ int memory_failure(unsigned long pfn, int vector, int flags) /* * Action optional processing happens here (picking up * from the list of faulting pages that do_machine_check() - * placed into the "ring"). + * placed into the genpool). */ static void mce_process_work(struct work_struct *dummy) { - unsigned long pfn; - - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR, 0); + mce_gen_pool_process(); } #ifdef CONFIG_X86_MCE_INTEL @@ -1678,6 +1613,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } +static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_clear(c); + break; + default: + break; + } +} + static void mce_start_timer(unsigned int cpu, struct timer_list *t) { unsigned long iv = check_interval * HZ; @@ -1731,13 +1677,36 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; } + if (mce_gen_pool_init()) { + mca_cfg.disabled = true; + pr_emerg("Couldn't allocate MCE records pool!\n"); + return; + } + machine_check_vector = do_machine_check; __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_timer(); - INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work); - init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb); +} + +/* + * Called for each booted CPU to clear some machine checks opt-ins + */ +void mcheck_cpu_clear(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (!mce_available(c)) + return; + + /* + * Possibly to clear general settings 
generic to x86 + * __mcheck_cpu_clear_generic(c); + */ + __mcheck_cpu_clear_vendor(c); + } /* @@ -1784,7 +1753,7 @@ static void collect_tscs(void *data) { unsigned long *cpu_tsc = (unsigned long *)data; - rdtscll(cpu_tsc[smp_processor_id()]); + cpu_tsc[smp_processor_id()] = rdtsc(); } static int mce_apei_read_done; @@ -1850,7 +1819,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, goto out; } - next = rcu_dereference_check_mce(mcelog.next); + next = mce_log_get_idx_check(mcelog.next); /* Only supports full reads right now */ err = -EINVAL; @@ -2056,8 +2025,12 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&mce_srao_nb); mcheck_vendor_init_severity(); + INIT_WORK(&mce_work, mce_process_work); + init_irq_work(&mce_irq_work, mce_irq_work_cb); + return 0; } @@ -2591,5 +2564,20 @@ static int __init mcheck_debugfs_init(void) return 0; } -late_initcall(mcheck_debugfs_init); +#else +static int __init mcheck_debugfs_init(void) { return -EINVAL; } #endif + +static int __init mcheck_late_init(void) +{ + mcheck_debugfs_init(); + + /* + * Flush out everything that has been logged during early boot, now that + * everything has been initialized (workqueues, decoders, ...). 
+ */ + mce_schedule_work(); + + return 0; +} +late_initcall(mcheck_late_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index c93c27df9..1e8bb6c94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -251,7 +251,6 @@ static void intel_threshold_interrupt(void) return; machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); - mce_notify_irq(); } /* @@ -440,7 +439,7 @@ static void intel_init_cmci(void) cmci_recheck(); } -void intel_init_lmce(void) +static void intel_init_lmce(void) { u64 val; @@ -453,9 +452,26 @@ void intel_init_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } +static void intel_clear_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + val &= ~MCG_EXT_CTL_LMCE_EN; + wrmsrl(MSR_IA32_MCG_EXT_CTL, val); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); } + +void mce_intel_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 737b0ad4e..12402e10a 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; u32 loaddr, hi, lotype; - prev_state = ist_enter(regs); + ist_enter(regs); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs, prev_state); + ist_exit(regs); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c 
b/arch/x86/kernel/cpu/mcheck/winchip.c index 44f138296..01dd87028 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,12 +15,12 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state = ist_enter(regs); + ist_enter(regs); printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs, prev_state); + ist_exit(regs); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 6236a54a6..9e3f3c7dd 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -377,17 +377,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif) return err; } -static int mc_device_remove(struct device *dev, struct subsys_interface *sif) +static void mc_device_remove(struct device *dev, struct subsys_interface *sif) { int cpu = dev->id; if (!cpu_online(cpu)) - return 0; + return; pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); sysfs_remove_group(&dev->kobj, &mc_attr_group); - return 0; } static struct subsys_interface mc_cpu_interface = { @@ -460,7 +459,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __refdata mc_cpu_notifier = { +static struct notifier_block mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 6b5021fc3..66ed31939 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -390,7 +390,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) } #ifdef DEBUG -static void __ref show_saved_mc(void) +static void show_saved_mc(void) { int i, j; unsigned int 
sig, pf, rev, total_size, data_size, date; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index aad4bd84b..20e242ea1 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include <linux/efi.h> #include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/kexec.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv.h> @@ -28,12 +29,15 @@ #include <asm/i8259.h> #include <asm/apic.h> #include <asm/timer.h> +#include <asm/reboot.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); #if IS_ENABLED(CONFIG_HYPERV) static void (*vmbus_handler)(void); +static void (*hv_kexec_handler)(void); +static void (*hv_crash_handler)(struct pt_regs *regs); void hyperv_vector_handler(struct pt_regs *regs) { @@ -67,7 +71,47 @@ void hv_remove_vmbus_irq(void) } EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); -#endif + +void hv_setup_kexec_handler(void (*handler)(void)) +{ + hv_kexec_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); + +void hv_remove_kexec_handler(void) +{ + hv_kexec_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); + +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) +{ + hv_crash_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_crash_handler); + +void hv_remove_crash_handler(void) +{ + hv_crash_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_crash_handler); + +#ifdef CONFIG_KEXEC_CORE +static void hv_machine_shutdown(void) +{ + if (kexec_in_progress && hv_kexec_handler) + hv_kexec_handler(); + native_machine_shutdown(); +} + +static void hv_machine_crash_shutdown(struct pt_regs *regs) +{ + if (hv_crash_handler) + hv_crash_handler(regs); + native_machine_crash_shutdown(regs); +} +#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) { @@ -114,6 +158,7 @@ static void __init ms_hyperv_init_platform(void) * Extract the features 
and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", @@ -141,6 +186,11 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif +#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) + machine_ops.shutdown = hv_machine_shutdown; + machine_ops.crash_shutdown = hv_machine_crash_shutdown; +#endif + mark_tsc_unstable("running on Hyper-V"); } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index e7ed0d8eb..f891b4750 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -448,7 +448,6 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type, return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } -EXPORT_SYMBOL(mtrr_add); /** * mtrr_del_page - delete a memory type region @@ -537,7 +536,6 @@ int mtrr_del(int reg, unsigned long base, unsigned long size) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } -EXPORT_SYMBOL(mtrr_del); /** * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9469dfa55..66dd3fe99 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1551,7 +1551,7 @@ static void __init filter_events(struct attribute **attrs) } /* Merge two pointer arrays */ -static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) { struct attribute **new; int j, i; @@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment) int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == 
SEGMENT_LDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; if (idx > LDT_ENTRIES) @@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment) return 0; desc = &ldt->entries[idx]; +#else + return 0; +#endif } else { if (idx > GDT_ENTRIES) return 0; @@ -2200,7 +2204,7 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_IA32_EMULATION #include <asm/compat.h> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3e7fd27df..165be83a7 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -47,6 +47,7 @@ enum extra_reg_type { EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ EXTRA_REG_LBR = 2, /* lbr_select */ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ + EXTRA_REG_FE = 4, /* fe_* */ EXTRA_REG_MAX /* number of entries needed */ }; @@ -165,7 +166,7 @@ struct intel_excl_cntrs { unsigned core_id; /* per-core: core id */ }; -#define MAX_LBR_ENTRIES 16 +#define MAX_LBR_ENTRIES 32 enum { X86_PERF_KFREE_SHARED = 0, @@ -594,6 +595,7 @@ struct x86_pmu { struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); int max_pebs_events; + unsigned long free_running_flags; /* * Intel LBR @@ -624,6 +626,7 @@ struct x86_pmu { struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; + u64 lbr_info[MAX_LBR_ENTRIES]; int lbr_callstack_users; int lbr_stack_state; }; @@ -793,6 +796,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); +struct attribute **merge_attr(struct attribute **a, struct attribute **b); + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); @@ -808,20 +813,6 @@ static inline int amd_pmu_init(void) #ifdef CONFIG_CPU_SUP_INTEL -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) 
-{ - /* user explicitly requested branch sampling */ - if (has_branch_stack(event)) - return true; - - /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) - return true; - - return false; -} - static inline bool intel_pmu_has_bts(struct perf_event *event) { if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && @@ -873,6 +864,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[]; extern struct event_constraint intel_hsw_pebs_event_constraints[]; +extern struct event_constraint intel_skl_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_enable(struct perf_event *event); @@ -911,6 +904,8 @@ void intel_pmu_lbr_init_snb(void); void intel_pmu_lbr_init_hsw(void); +void intel_pmu_lbr_init_skl(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); @@ -934,6 +929,7 @@ static inline int is_ht_workaround_enabled(void) { return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); } + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 1b09c420c..f63360be2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -12,7 +12,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/export.h> -#include <linux/watchdog.h> +#include <linux/nmi.h> #include <asm/cpufeature.h> #include <asm/hardirq.h> @@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +struct event_constraint intel_skl_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + 
INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + EVENT_CONSTRAINT_END +}; + static struct extra_reg intel_snb_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), @@ -193,6 +201,18 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_skl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + /* + * Note the low 8 bits eventsel code is not a continuous field, containing + * some #GPing bits. These are masked out. + */ + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -235,7 +255,7 @@ struct event_constraint intel_bdw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */ + INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ EVENT_CONSTRAINT_END }; @@ -244,6 +264,200 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts. 
+ * - icache miss does not include decoded icache + */ + +#define SKL_DEMAND_DATA_RD BIT_ULL(0) +#define SKL_DEMAND_RFO BIT_ULL(1) +#define SKL_ANY_RESPONSE BIT_ULL(16) +#define SKL_SUPPLIER_NONE BIT_ULL(17) +#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26) +#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27) +#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28) +#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29) +#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \ + SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) +#define SKL_SPL_HIT BIT_ULL(30) +#define SKL_SNOOP_NONE BIT_ULL(31) +#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32) +#define SKL_SNOOP_MISS BIT_ULL(33) +#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define SKL_SNOOP_HITM BIT_ULL(36) +#define SKL_SNOOP_NON_DRAM BIT_ULL(37) +#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) +#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD +#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SPL_HIT) +#define SKL_DEMAND_WRITE SKL_DEMAND_RFO +#define SKL_LLC_ACCESS SKL_ANY_RESPONSE +#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) + +static __initconst const u64 skl_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I 
) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, 
/* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 skl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + #define SNB_DMND_DATA_RD (1ULL << 0) #define SNB_DMND_RFO (1ULL << 1) #define SNB_DMND_IFETCH (1ULL << 2) @@ -1114,7 +1328,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1), EVENT_EXTRA_END }; @@ -1594,6 +1808,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: + intel_pmu_lbr_read(); intel_pmu_ack_status(status); if (++loops > 100) { static bool 
warned = false; @@ -1608,16 +1823,16 @@ again: inc_irq_stat(apic_perf_irqs); - intel_pmu_lbr_read(); /* - * CondChgd bit 63 doesn't mean any overflow status. Ignore - * and clear the bit. + * Ignore a range of extra bits in status that do not indicate + * overflow by themselves. */ - if (__test_and_clear_bit(63, (unsigned long *)&status)) { - if (!status) - goto done; - } + status &= ~(GLOBAL_STATUS_COND_CHG | + GLOBAL_STATUS_ASIF | + GLOBAL_STATUS_LBRS_FROZEN); + if (!status) + goto done; /* * PEBS overflow sets bit 62 in the global status register @@ -1699,18 +1914,22 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static int intel_alt_er(int idx) +static int intel_alt_er(int idx, u64 config) { + int alt_idx = idx; /* stays idx unless swapped to the other RSP reg below */ if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) return idx; if (idx == EXTRA_REG_RSP_0) - return EXTRA_REG_RSP_1; + alt_idx = EXTRA_REG_RSP_1; if (idx == EXTRA_REG_RSP_1) - return EXTRA_REG_RSP_0; + alt_idx = EXTRA_REG_RSP_0; + + if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) + return idx; - return idx; + return alt_idx; } static void intel_fixup_er(struct perf_event *event, int idx) @@ -1799,7 +2018,7 @@ again: */ c = NULL; } else { - idx = intel_alt_er(idx); + idx = intel_alt_er(idx, reg->config); if (idx != reg->idx) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; @@ -2256,6 +2475,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event) } } +static unsigned long intel_pmu_free_running_flags(struct perf_event *event) +{ + unsigned long flags = x86_pmu.free_running_flags; + + if (event->attr.use_clockid) + flags &= ~PERF_SAMPLE_TIME; + return flags; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -2266,7 +2494,8 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip) { if (!event->attr.freq) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; - if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) + if (!(event->attr.sample_type 
& + ~intel_pmu_free_running_flags(event))) event->hw.flags |= PERF_X86_EVENT_FREERUNNING; } if (x86_pmu.pebs_aliases) @@ -2667,6 +2896,8 @@ PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); +PMU_FORMAT_ATTR(frontend, "config1:0-23"); + static struct attribute *intel_arch3_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -2683,6 +2914,11 @@ static struct attribute *intel_arch3_formats_attr[] = { NULL, }; +static struct attribute *skl_format_attr[] = { + &format_attr_frontend.attr, + NULL, +}; + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -2697,6 +2933,8 @@ static __initconst const struct x86_pmu core_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, + /* * Intel PMCs cannot be accessed sanely above 32-bit width, * so we install an artificial 1<<31 period regardless of @@ -2735,6 +2973,7 @@ static __initconst const struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -3272,6 +3511,30 @@ __init int intel_pmu_init(void) pr_cont("Broadwell events, "); break; + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_skl_event_constraints; + x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; + x86_pmu.extra_regs = intel_skl_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + 
x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + skl_format_attr); + WARN_ON(!x86_pmu.format_attrs); + x86_pmu.cpu_events = hsw_events_attrs; + pr_cont("Skylake events, "); + break; + default: switch (x86_pmu.version) { case 1: @@ -3341,7 +3604,7 @@ __init int intel_pmu_init(void) */ if (x86_pmu.extra_regs) { for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x1ffUL); + er->extra_msr_access = check_msr(er->msr, 0x11UL); /* Disable LBR select mapping */ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) x86_pmu.lbr_sel_map = NULL; @@ -3380,7 +3643,10 @@ static __init int fixup_ht_bug(void) return 0; } - watchdog_nmi_disable_all(); + if (lockup_detector_suspend() != 0) { + pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); + return 0; + } x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -3388,7 +3654,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - watchdog_nmi_enable_all(); + lockup_detector_resume(); get_online_cpus(); diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index 43dd672d7..d1c0f254a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -62,9 +62,6 @@ struct bts_buffer { struct pmu bts_pmu; -void intel_pmu_enable_bts(u64 config); -void intel_pmu_disable_bts(void); - static size_t buf_size(struct page *page) { return 1 << (PAGE_SHIFT + page_private(page)); @@ -225,6 +222,7 @@ static void __bts_event_start(struct perf_event *event) if (!buf || bts_buffer_is_full(buf, bts)) return; + event->hw.itrace_started = 1; event->hw.state = 0; if (!buf->snapshot) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c 
b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 71fc40238..84f236ab9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -224,6 +224,19 @@ union hsw_tsx_tuning { #define PEBS_HSW_TSX_FLAGS 0xff00000000ULL +/* Same as HSW, plus TSC */ + +struct pebs_record_skl { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; + u64 tsc; +}; + void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -675,6 +688,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_skl_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + 
INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -754,6 +789,11 @@ void intel_pmu_pebs_disable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; + bool large_pebs = ds->pebs_interrupt_threshold > + ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + if (large_pebs) + intel_pmu_drain_pebs_buffer(); cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -762,12 +802,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); - if (ds->pebs_interrupt_threshold > - ds->pebs_buffer_base + x86_pmu.pebs_record_size) { - intel_pmu_drain_pebs_buffer(); - if (!pebs_is_enabled(cpuc)) - perf_sched_cb_dec(event->ctx->pmu); - } + if (large_pebs && !pebs_is_enabled(cpuc)) + perf_sched_cb_dec(event->ctx->pmu); if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -885,7 +921,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) { if (pebs->tsx_tuning) { union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; @@ -894,7 +930,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) return 0; } -static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) { u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; @@ -918,7 +954,7 @@ static void setup_pebs_sample_data(struct perf_event *event, * unconditionally access the 'extra' entries. 
*/ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct pebs_record_hsw *pebs = __pebs; + struct pebs_record_skl *pebs = __pebs; u64 sample_type; int fll, fst, dsrc; int fl = event->hw.flags; @@ -1016,6 +1052,16 @@ static void setup_pebs_sample_data(struct perf_event *event, data->txn = intel_hsw_transaction(pebs); } + /* + * v3 supplies an accurate time stamp, so we use that + * for the time stamp. + * + * We can only do this for the default trace clock. + */ + if (x86_pmu.intel_cap.pebs_format >= 3 && + event->attr.use_clockid == 0) + data->time = native_sched_clock_from_tsc(pebs->tsc); + if (has_branch_stack(event)) data->br_stack = &cpuc->lbr_stack; } @@ -1142,6 +1188,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; + u64 pebs_status; /* PEBS v3 has accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { @@ -1152,12 +1199,17 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; } - bit = find_first_bit((unsigned long *)&p->status, + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + + bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); - if (bit >= x86_pmu.max_pebs_events) - continue; - if (!test_bit(bit, cpuc->active_mask)) + if (WARN(bit >= x86_pmu.max_pebs_events, + "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx", + (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled, + *(unsigned long long *)cpuc->active_mask)) continue; + /* * The PEBS hardware does not deal well with the situation * when events happen near to each other and multiple bits @@ -1172,27 +1224,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * one, and it's not possible to reconstruct all events * that caused the PEBS record. It's called collision. * If collision happened, the record will be dropped. 
- * */ - if (p->status != (1 << bit)) { - u64 pebs_status; - - /* slow path */ - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; - if (pebs_status != (1 << bit)) { - for_each_set_bit(i, (unsigned long *)&pebs_status, - MAX_PEBS_EVENTS) - error[i]++; - continue; - } + if (p->status != (1ULL << bit)) { + for_each_set_bit(i, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) + error[i]++; + continue; } + counts[bit]++; } for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { if ((counts[bit] == 0) && (error[bit] == 0)) continue; + event = cpuc->events[bit]; WARN_ON_ONCE(!event); WARN_ON_ONCE(!event->attr.precise_ip); @@ -1245,6 +1291,14 @@ void __init intel_ds_init(void) x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; + case 3: + pr_cont("PEBS fmt3%c, ", pebs_type); + x86_pmu.pebs_record_size = + sizeof(struct pebs_record_skl); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; + break; + default: printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 452a7bd2d..b2c9475b7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -13,7 +13,8 @@ enum { LBR_FORMAT_EIP = 0x02, LBR_FORMAT_EIP_FLAGS = 0x03, LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, }; static enum { @@ -140,6 +141,13 @@ static void __intel_pmu_lbr_enable(bool pmi) u64 debugctl, lbr_select = 0, orig_debugctl; /* + * No need to unfreeze manually, as v4 can do that as part + * of the GLOBAL_STATUS ack. + */ + if (pmi && x86_pmu.version >= 4) + return; + + /* * No need to reprogram LBR_SELECT in a PMI, as it * did not change. 
*/ @@ -186,6 +194,8 @@ static void intel_pmu_lbr_reset_64(void) for (i = 0; i < x86_pmu.lbr_nr; i++) { wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + i, 0); } } @@ -230,10 +240,12 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { + for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->lbr_stack_state = LBR_NONE; } @@ -251,10 +263,12 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { + for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->lbr_stack_state = LBR_VALID; } @@ -411,16 +425,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; + int num = x86_pmu.lbr_nr; - for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (cpuc->lbr_sel->config & LBR_CALL_STACK) + num = tos; + + for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; int skip = 0; + u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + if (lbr_format == LBR_FORMAT_INFO) { + u64 info; + + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + mis = !!(info & LBR_INFO_MISPRED); + pred = !mis; + in_tx = 
!!(info & LBR_INFO_IN_TX); + abort = !!(info & LBR_INFO_ABORT); + cycles = (info & LBR_INFO_CYCLES); + } if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; @@ -450,6 +479,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_entries[out].predicted = pred; cpuc->lbr_entries[out].in_tx = in_tx; cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].cycles = cycles; cpuc->lbr_entries[out].reserved = 0; out++; } @@ -947,6 +977,26 @@ void intel_pmu_lbr_init_hsw(void) pr_cont("16-deep LBR, "); } +/* skylake */ +__init void intel_pmu_lbr_init_skl(void) +{ + x86_pmu.lbr_nr = 32; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("32-deep LBR, "); +} + /* atom */ void __init intel_pmu_lbr_init_atom(void) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 183de7196..421692834 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -65,15 +65,21 @@ static struct pt_cap_desc { } pt_caps[] = { PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), + PT_CAP(mtc, 0, CR_EBX, BIT(3)), PT_CAP(topa_output, 0, CR_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), + PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), + PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), }; static u32 pt_cap_get(enum pt_capabilities cap) { struct pt_cap_desc *cd = &pt_caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg]; + u32 c = 
pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; @@ -94,12 +100,22 @@ static struct attribute_group pt_cap_group = { .name = "caps", }; +PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); +PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); +PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { + &format_attr_cyc.attr, + &format_attr_mtc.attr, &format_attr_tsc.attr, &format_attr_noretcomp.attr, + &format_attr_mtc_period.attr, + &format_attr_cyc_thresh.attr, + &format_attr_psb_period.attr, NULL, }; @@ -129,10 +145,10 @@ static int __init pt_pmu_hw_init(void) for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*4], - &pt_pmu.caps[CR_EBX + i*4], - &pt_pmu.caps[CR_ECX + i*4], - &pt_pmu.caps[CR_EDX + i*4]); + &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); } ret = -ENOMEM; @@ -170,15 +186,65 @@ fail: return ret; } -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC) +#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ + RTIT_CTL_CYC_THRESH | \ + RTIT_CTL_PSB_FREQ) + +#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ + RTIT_CTL_MTC_RANGE) + +#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ + RTIT_CTL_DISRETC | \ + RTIT_CTL_CYC_PSB | \ + RTIT_CTL_MTC) static bool pt_event_valid(struct perf_event *event) { u64 config = event->attr.config; + u64 allowed, requested; if ((config & PT_CONFIG_MASK) != config) return false; + if (config & RTIT_CTL_CYC_PSB) { + if (!pt_cap_get(PT_CAP_psb_cyc)) + return false; + + allowed = pt_cap_get(PT_CAP_psb_periods); + requested = (config & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + 
return false; + + allowed = pt_cap_get(PT_CAP_cycle_thresholds); + requested = (config & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + } + + if (config & RTIT_CTL_MTC) { + /* + * In the unlikely case that CPUID lists valid mtc periods, + * but not the mtc capability, drop out here. + * + * Spec says that setting mtc period bits while mtc bit in + * CPUID is 0 will #GP, so better safe than sorry. + */ + if (!pt_cap_get(PT_CAP_mtc)) + return false; + + allowed = pt_cap_get(PT_CAP_mtc_periods); + if (!allowed) + return false; + + requested = (config & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET; + + if (!(allowed & BIT(requested))) + return false; + } + return true; } @@ -191,6 +257,11 @@ static void pt_config(struct perf_event *event) { u64 reg; + if (!event->hw.itrace_started) { + event->hw.itrace_started = 1; + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + } + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; if (!event->attr.exclude_kernel) @@ -910,7 +981,6 @@ void intel_pt_interrupt(void) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } } @@ -934,7 +1004,6 @@ static void pt_event_start(struct perf_event *event, int mode) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 5cbd4e64f..81431c0f0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -86,6 +86,10 @@ static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 1<<RAPL_IDX_RAM_NRG_STAT|\ 1<<RAPL_IDX_PP1_NRG_STAT) +/* Knights Landing has PKG, RAM */ +#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT) + /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ 
-486,6 +490,18 @@ static struct attribute *rapl_events_hsw_attr[] = { NULL, }; +static struct attribute *rapl_events_knl_attr[] = { + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = NULL, /* patched at runtime */ @@ -730,6 +746,10 @@ static int __init rapl_pmu_init(void) rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; + case 87: /* Knights Landing */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_KNL; + rapl_pmu_events_group.attrs = rapl_events_knl_attr; default: /* unsupported */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 21b5e38c9..560e5255b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -911,6 +911,9 @@ static int __init uncore_pci_init(void) case 63: /* Haswell-EP */ ret = hswep_uncore_pci_init(); break; + case 86: /* BDX-DE */ + ret = bdx_uncore_pci_init(); + break; case 42: /* Sandy Bridge */ ret = snb_uncore_pci_init(); break; @@ -1209,6 +1212,11 @@ static int __init uncore_cpu_init(void) break; case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + case 69: /* Haswell */ + case 70: /* Haswell */ + case 61: /* Broadwell */ + case 71: /* Broadwell */ snb_uncore_cpu_init(); break; case 45: /* Sandy Bridge-EP */ @@ -1224,6 +1232,9 @@ static int __init uncore_cpu_init(void) case 63: /* Haswell-EP */ hswep_uncore_cpu_init(); break; + case 86: /* BDX-DE */ + bdx_uncore_cpu_init(); + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 0f77f0a19..72c54c2e5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -336,6 +336,8 @@ int ivbep_uncore_pci_init(void); void ivbep_uncore_cpu_init(void); int hswep_uncore_pci_init(void); void hswep_uncore_cpu_init(void); +int bdx_uncore_pci_init(void); +void bdx_uncore_cpu_init(void); /* perf_event_intel_uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index b005a78c7..f78574b3c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -45,6 +45,11 @@ #define SNB_UNC_CBO_0_PER_CTR0 0x706 #define SNB_UNC_CBO_MSR_OFFSET 0x10 +/* SNB ARB register */ +#define SNB_UNC_ARB_PER_CTR0 0x3b0 +#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2 +#define SNB_UNC_ARB_MSR_OFFSET 0x10 + /* NHM global control register */ #define NHM_UNC_PERF_GLOBAL_CTL 0x391 #define NHM_UNC_FIXED_CTR 0x394 @@ -115,7 +120,7 @@ static struct intel_uncore_ops snb_uncore_msr_ops = { .read_counter = uncore_msr_read_counter, }; -static struct event_constraint snb_uncore_cbox_constraints[] = { +static struct event_constraint snb_uncore_arb_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x80, 0x1), UNCORE_EVENT_CONSTRAINT(0x83, 0x1), EVENT_CONSTRAINT_END @@ -134,14 +139,28 @@ static struct intel_uncore_type snb_uncore_cbox = { .single_fixed = 1, .event_mask = SNB_UNC_RAW_EVENT_MASK, .msr_offset = SNB_UNC_CBO_MSR_OFFSET, - .constraints = snb_uncore_cbox_constraints, .ops = &snb_uncore_msr_ops, .format_group = &snb_uncore_format_group, .event_descs = snb_uncore_events, }; +static struct intel_uncore_type snb_uncore_arb = { + .name = "arb", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = SNB_UNC_ARB_PER_CTR0, + .event_ctl = SNB_UNC_ARB_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_ARB_MSR_OFFSET, + .constraints = snb_uncore_arb_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = 
&snb_uncore_format_group, +}; + static struct intel_uncore_type *snb_msr_uncores[] = { &snb_uncore_cbox, + &snb_uncore_arb, NULL, }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 6d6e85dd5..694510a88 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -2215,7 +2215,7 @@ static struct intel_uncore_type *hswep_pci_uncores[] = { NULL, }; -static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = { +static const struct pci_device_id hswep_uncore_pci_ids[] = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), @@ -2321,3 +2321,167 @@ int hswep_uncore_pci_init(void) return 0; } /* end of Haswell-EP uncore support */ + +/* BDX-DE uncore support */ + +static struct intel_uncore_type bdx_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct event_constraint bdx_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = bdx_uncore_cbox_constraints, + .ops = 
&hswep_uncore_cbox_ops, + .format_group = &hswep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type *bdx_msr_uncores[] = { + &bdx_uncore_ubox, + &bdx_uncore_cbox, + &hswep_uncore_pcu, + NULL, +}; + +void bdx_uncore_cpu_init(void) +{ + if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = bdx_msr_uncores; +} + +static struct intel_uncore_type bdx_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &hswep_uncore_irp_ops, + .format_group = &snbep_uncore_format_group, +}; + + +static struct event_constraint bdx_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + BDX_PCI_UNCORE_HA, + BDX_PCI_UNCORE_IMC, + BDX_PCI_UNCORE_IRP, + BDX_PCI_UNCORE_R2PCIE, +}; + +static struct intel_uncore_type *bdx_pci_uncores[] = { + [BDX_PCI_UNCORE_HA] = 
&bdx_uncore_ha, + [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, + [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, + [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver bdx_uncore_pci_driver = { + .name = "bdx_uncore", + .id_table = bdx_uncore_pci_ids, +}; + +int bdx_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x6f1e); + + if (ret) + return ret; + uncore_pci_uncores = bdx_pci_uncores; + uncore_pci_driver = &bdx_uncore_pci_driver; + return 0; +} + +/* end of BDX-DE uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c new file mode 100644 index 000000000..f32ac1393 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -0,0 +1,242 @@ +#include <linux/perf_event.h> + +enum perf_msr_id { + PERF_MSR_TSC = 0, + PERF_MSR_APERF = 1, + PERF_MSR_MPERF = 2, + PERF_MSR_PPERF = 3, + PERF_MSR_SMI = 4, + + PERF_MSR_EVENT_MAX, +}; + +static bool test_aperfmperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_APERFMPERF); +} + +static bool test_intel(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: 
/* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_MSR_SMI) + return true; + break; + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) + return true; + break; + } + + return false; +} + +struct perf_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); + +static struct perf_msr msr[] = { + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, +}; + +static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group events_attr_group = { + .name = "events", + .attrs = events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); +static struct attribute 
*format_attrs[] = { + &format_attr_event.attr, + NULL, +}; +static struct attribute_group format_attr_group = { + .name = "format", + .attrs = format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { + &events_attr_group, + &format_attr_group, + NULL, +}; + +static int msr_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (cfg >= PERF_MSR_EVENT_MAX) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (!msr[cfg].attr) + return -EINVAL; + + event->hw.idx = -1; + event->hw.event_base = msr[cfg].msr; + event->hw.config = cfg; + + return 0; +} + +static inline u64 msr_read_counter(struct perf_event *event) +{ + u64 now; + + if (event->hw.event_base) + rdmsrl(event->hw.event_base, now); + else + rdtscll(now); + + return now; +} +static void msr_event_update(struct perf_event *event) +{ + u64 prev, now; + s64 delta; + + /* Careful, an NMI might modify the previous event value. 
*/ +again: + prev = local64_read(&event->hw.prev_count); + now = msr_read_counter(event); + + if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { + delta <<= 32; + delta >>= 32; /* sign extend */ + } + local64_add(now - prev, &event->count); +} + +static void msr_event_start(struct perf_event *event, int flags) +{ + u64 now; + + now = msr_read_counter(event); + local64_set(&event->hw.prev_count, now); +} + +static void msr_event_stop(struct perf_event *event, int flags) +{ + msr_event_update(event); +} + +static void msr_event_del(struct perf_event *event, int flags) +{ + msr_event_stop(event, PERF_EF_UPDATE); +} + +static int msr_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + msr_event_start(event, flags); + + return 0; +} + +static struct pmu pmu_msr = { + .task_ctx_nr = perf_sw_context, + .attr_groups = attr_groups, + .event_init = msr_event_init, + .add = msr_event_add, + .del = msr_event_del, + .start = msr_event_start, + .stop = msr_event_stop, + .read = msr_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static int __init msr_init(void) +{ + int i, j = 0; + + if (!boot_cpu_has(X86_FEATURE_TSC)) { + pr_cont("no MSR PMU driver.\n"); + return 0; + } + + /* Probe the MSRs. */ + for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { + u64 val; + + /* + * Virt sucks arse; you cannot tell if a R/O MSR is present :/ + */ + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining MSRs in the sysfs attrs. 
*/ + for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + perf_pmu_register(&pmu_msr, "msr", -1); + + return 0; +} +device_initcall(msr_init); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 3d423a101..608fb26c7 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -37,7 +37,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 }, - { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 }, + { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 }, { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 83741a715..bd3507da3 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -170,7 +170,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, return notifier_from_errno(err); } -static struct notifier_block __refdata cpuid_class_cpu_notifier = +static struct notifier_block cpuid_class_cpu_notifier = { .notifier_call = cpuid_class_cpu_callback, }; diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 61059c4d0..a457bce46 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -110,7 +110,7 @@ static void init_espfix_random(void) */ if (!arch_get_random_long(&rand)) { /* The constant is an arbitrary large prime */ - rdtscll(rand); + rand = rdtsc(); rand *= 0xc345c6b72fd16123UL; } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 50ec9af1b..6545e6ddb 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -385,20 +385,19 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, 
*/ void fpu__init_prepare_fx_sw_frame(void) { - int fsave_header_size = sizeof(struct fregs_state); int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; fx_sw_reserved.xstate_size = xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION)) { + if (config_enabled(CONFIG_IA32_EMULATION) || + config_enabled(CONFIG_X86_32)) { + int fsave_header_size = sizeof(struct fregs_state); + fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; + fx_sw_reserved_ia32.extended_size = size + fsave_header_size; } } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 62fc001c7..2c4ac072a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -402,7 +402,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; - xsave = &current->thread.fpu.state.xsave; /* * We should not ever be requesting features that we * have not enabled. Remember that pcntxt_mask is diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d40ca8a7..ffdc0e860 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,6 +65,9 @@ startup_64: * tables and then reload them. */ + /* Sanitize CPU configuration */ + call verify_cpu + /* * Compute the delta between the address I am compiled to run at and the * address I am actually running at. @@ -174,6 +177,9 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + /* Sanitize CPU configuration */ + call verify_cpu + movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: @@ -288,6 +294,8 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq +#include "verify_cpu.S" + #ifdef CONFIG_HOTPLUG_CPU /* * Boot CPU0 entry point. It's called from play_dead(). 
Everything has been set diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 10757d0a3..88b4da373 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -226,22 +226,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { } */ static unsigned long hpet_freq; -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt); -static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt); - -/* - * The hpet clock event device - */ -static struct clock_event_device hpet_clockevent = { - .name = "hpet", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = hpet_legacy_set_mode, - .set_next_event = hpet_legacy_next_event, - .irq = 0, - .rating = 50, -}; +static struct clock_event_device hpet_clockevent; static void hpet_stop_counter(void) { @@ -306,64 +291,74 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static void hpet_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt, int timer) +static int hpet_set_periodic(struct clock_event_device *evt, int timer) { unsigned int cfg, cmp, now; uint64_t delta; - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - hpet_stop_counter(); - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; - delta >>= evt->shift; - now = hpet_readl(HPET_COUNTER); - cmp = now + (unsigned int) delta; - cfg = hpet_readl(HPET_Tn_CFG(timer)); - cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | - HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - hpet_writel(cmp, HPET_Tn_CMP(timer)); - udelay(1); - /* - * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL - * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL - * bit is automatically cleared after the first write. 
- * (See AMD-8111 HyperTransport I/O Hub Data Sheet, - * Publication # 24674) - */ - hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer)); - hpet_start_counter(); - hpet_print_config(); - break; + hpet_stop_counter(); + delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult; + delta >>= evt->shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned int)delta; + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + hpet_writel(cmp, HPET_Tn_CMP(timer)); + udelay(1); + /* + * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL + * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL + * bit is automatically cleared after the first write. + * (See AMD-8111 HyperTransport I/O Hub Data Sheet, + * Publication # 24674) + */ + hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer)); + hpet_start_counter(); + hpet_print_config(); - case CLOCK_EVT_MODE_ONESHOT: - cfg = hpet_readl(HPET_Tn_CFG(timer)); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - break; + return 0; +} - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - cfg = hpet_readl(HPET_Tn_CFG(timer)); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - break; +static int hpet_set_oneshot(struct clock_event_device *evt, int timer) +{ + unsigned int cfg; - case CLOCK_EVT_MODE_RESUME: - if (timer == 0) { - hpet_enable_legacy_int(); - } else { - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); - disable_irq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); - enable_irq(hdev->irq); - } - hpet_print_config(); - break; + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + + return 0; +} + +static int hpet_shutdown(struct clock_event_device *evt, int timer) +{ 
+ unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + + return 0; +} + +static int hpet_resume(struct clock_event_device *evt, int timer) +{ + if (!timer) { + hpet_enable_legacy_int(); + } else { + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + + irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); + disable_irq(hdev->irq); + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); + enable_irq(hdev->irq); } + hpet_print_config(); + + return 0; } static int hpet_next_event(unsigned long delta, @@ -403,10 +398,24 @@ static int hpet_next_event(unsigned long delta, return res < HPET_MIN_CYCLES ? -ETIME : 0; } -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int hpet_legacy_shutdown(struct clock_event_device *evt) +{ + return hpet_shutdown(evt, 0); +} + +static int hpet_legacy_set_oneshot(struct clock_event_device *evt) +{ + return hpet_set_oneshot(evt, 0); +} + +static int hpet_legacy_set_periodic(struct clock_event_device *evt) { - hpet_set_mode(mode, evt, 0); + return hpet_set_periodic(evt, 0); +} + +static int hpet_legacy_resume(struct clock_event_device *evt) +{ + return hpet_resume(evt, 0); } static int hpet_legacy_next_event(unsigned long delta, @@ -416,6 +425,22 @@ static int hpet_legacy_next_event(unsigned long delta, } /* + * The hpet clock event device + */ +static struct clock_event_device hpet_clockevent = { + .name = "hpet", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_periodic = hpet_legacy_set_periodic, + .set_state_oneshot = hpet_legacy_set_oneshot, + .set_state_shutdown = hpet_legacy_shutdown, + .tick_resume = hpet_legacy_resume, + .set_next_event = hpet_legacy_next_event, + .irq = 0, + .rating = 50, +}; + +/* * HPET MSI Support */ #ifdef CONFIG_PCI_MSI @@ -426,7 +451,7 @@ static struct irq_domain *hpet_domain; void hpet_msi_unmask(struct irq_data *data) { - struct hpet_dev *hdev = 
data->handler_data; + struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); unsigned int cfg; /* unmask it */ @@ -437,7 +462,7 @@ void hpet_msi_unmask(struct irq_data *data) void hpet_msi_mask(struct irq_data *data) { - struct hpet_dev *hdev = data->handler_data; + struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); unsigned int cfg; /* mask it */ @@ -459,11 +484,32 @@ void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg) msg->address_hi = 0; } -static void hpet_msi_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int hpet_msi_shutdown(struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + + return hpet_shutdown(evt, hdev->num); +} + +static int hpet_msi_set_oneshot(struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + + return hpet_set_oneshot(evt, hdev->num); +} + +static int hpet_msi_set_periodic(struct clock_event_device *evt) { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - hpet_set_mode(mode, evt, hdev->num); + + return hpet_set_periodic(evt, hdev->num); +} + +static int hpet_msi_resume(struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + + return hpet_resume(evt, hdev->num); } static int hpet_msi_next_event(unsigned long delta, @@ -523,10 +569,14 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) evt->rating = 110; evt->features = CLOCK_EVT_FEAT_ONESHOT; - if (hdev->flags & HPET_DEV_PERI_CAP) + if (hdev->flags & HPET_DEV_PERI_CAP) { evt->features |= CLOCK_EVT_FEAT_PERIODIC; + evt->set_state_periodic = hpet_msi_set_periodic; + } - evt->set_mode = hpet_msi_set_mode; + evt->set_state_shutdown = hpet_msi_shutdown; + evt->set_state_oneshot = hpet_msi_set_oneshot; + evt->tick_resume = hpet_msi_resume; evt->set_next_event = hpet_msi_next_event; evt->cpumask = cpumask_of(hdev->cpu); @@ -735,7 +785,7 @@ static int hpet_clocksource_register(void) /* Verify whether hpet counter works */ t1 = 
hpet_readl(HPET_COUNTER); - rdtscll(start); + start = rdtsc(); /* * We don't know the TSC frequency yet, but waiting for @@ -745,7 +795,7 @@ static int hpet_clocksource_register(void) */ do { rep_nop(); - rdtscll(now); + now = rdtsc(); } while ((now - start) < 200000UL); if (t1 == hpet_readl(HPET_COUNTER)) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 7114ba220..50a3fad5b 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -32,6 +32,7 @@ #include <linux/irqflags.h> #include <linux/notifier.h> #include <linux/kallsyms.h> +#include <linux/kprobes.h> #include <linux/percpu.h> #include <linux/kdebug.h> #include <linux/kernel.h> @@ -179,7 +180,11 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) va = info->address; len = bp->attr.bp_len; - return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); + /* + * We don't need to worry about va + len - 1 overflowing: + * we already require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } int arch_bp_generic_fields(int x86_len, int x86_type, @@ -243,6 +248,20 @@ static int arch_build_bp_info(struct perf_event *bp) info->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: + /* + * We don't allow kernel breakpoints in places that are not + * acceptable for kprobes. On non-kprobes kernels, we don't + * allow kernel breakpoints at all. + */ + if (bp->attr.bp_addr >= TASK_SIZE_MAX) { +#ifdef CONFIG_KPROBES + if (within_kprobe_blacklist(bp->attr.bp_addr)) + return -EINVAL; +#else + return -EINVAL; +#endif + } + info->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. 
@@ -276,8 +295,18 @@ static int arch_build_bp_info(struct perf_event *bp) break; #endif default: + /* AMD range breakpoint */ if (!is_power_of_2(bp->attr.bp_len)) return -EINVAL; + if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) + return -EINVAL; + /* + * It's impossible to use a range breakpoint to fake out + * user vs kernel detection because bp_len - 1 can't + * have the high bit set. If we ever allow range instruction + * breakpoints, then we'll have to check for kprobe-blacklisted + * addresses anywhere in the range. + */ if (!cpu_has_bpext) return -EOPNOTSUPP; info->mask = bp->attr.bp_len - 1; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index f2b96de3c..efb82f07b 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -34,7 +34,7 @@ static int __init init_pit_clocksource(void) * - when local APIC timer is active (PIT is switched off) */ if (num_possible_cpus() > 1 || is_hpet_enabled() || - i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) + !clockevent_state_periodic(&i8253_clockevent)) return 0; return clocksource_i8253_init(); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 16cb827a5..be22f5a21 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -295,16 +295,11 @@ static void unmask_8259A(void) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -static void init_8259A(int auto_eoi) +static int probe_8259A(void) { unsigned long flags; unsigned char probe_val = ~(1 << PIC_CASCADE_IR); unsigned char new_val; - - i8259A_auto_eoi = auto_eoi; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - /* * Check to see if we have a PIC. * Mask all except the cascade and read @@ -312,16 +307,28 @@ static void init_8259A(int auto_eoi) * have a PIC, we will read 0xff as opposed to the * value we wrote. 
*/ + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ outb(probe_val, PIC_MASTER_IMR); new_val = inb(PIC_MASTER_IMR); if (new_val != probe_val) { printk(KERN_INFO "Using NULL legacy PIC\n"); legacy_pic = &null_legacy_pic; - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return; } + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return nr_legacy_irqs(); +} + +static void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ /* @@ -379,6 +386,10 @@ static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; } +static int legacy_pic_probe(void) +{ + return 0; +} struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, @@ -388,6 +399,7 @@ struct legacy_pic null_legacy_pic = { .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, + .probe = legacy_pic_probe, .irq_pending = legacy_pic_irq_pending_noop, .make_irq = legacy_pic_uint_noop, }; @@ -400,6 +412,7 @@ struct legacy_pic default_legacy_pic = { .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, + .probe = probe_8259A, .irq_pending = i8259A_irq_pending, .make_irq = make_8259A_irq, }; diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 37dae792d..816f81544 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -28,8 +28,18 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; +#ifdef CONFIG_SCHED_BFS_AUTOISO + if (turn_on) { + struct sched_param param = { .sched_priority = 0 }; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + /* Start X as SCHED_ISO */ + sched_setscheduler_nocheck(current, SCHED_ISO, &param); + } +#else if (turn_on && !capable(CAP_SYS_RAWIO)) return -EPERM; +#endif /* * If it's the first ioperm() call in this 
thread's lifetime, set the @@ -103,8 +113,15 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) return -EINVAL; /* Trying to gain more privileges? */ if (level > old) { +#ifdef CONFIG_SCHED_BFS_AUTOISO + struct sched_param param = { .sched_priority = 0 }; +#endif if (!capable(CAP_SYS_RAWIO)) return -EPERM; +#ifdef CONFIG_SCHED_BFS_AUTOISO + /* Start X as SCHED_ISO */ + sched_setscheduler_nocheck(current, SCHED_ISO, &param); +#endif } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); t->iopl = level << 12; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c7dfe1be7..f8062aaf5 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -139,10 +139,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) - seq_printf(p, "%*s: ", prec, "HYP"); - for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); - seq_puts(p, " Hypervisor callback interrupts\n"); + if (test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) { + seq_printf(p, "%*s: ", prec, "HYP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_callback_count); + seq_puts(p, " Hypervisor callback interrupts\n"); + } #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) @@ -211,24 +214,38 @@ u64 arch_irq_stat(void) __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - + struct irq_desc * desc; /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; - unsigned irq; + + /* + * NB: Unlike exception entries, IRQ entries do not reliably + * handle context tracking in the low-level entry code. This is + * because syscall entries execute briefly with IRQs on before + * updating context tracking state, so we can take an IRQ from + * kernel mode with CONTEXT_USER. 
The low-level entry code only + * updates the context if we came from user mode, so we won't + * switch to CONTEXT_KERNEL. We'll fix that once the syscall + * code is cleaned up enough that we can cleanly defer enabling + * IRQs. + */ entering_irq(); - irq = __this_cpu_read(vector_irq[vector]); + /* entering_irq() tells RCU that we're not quiescent. Check it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + + desc = __this_cpu_read(vector_irq[vector]); - if (!handle_irq(irq, regs)) { + if (!handle_irq(desc, regs)) { ack_APIC_irq(); - if (irq != VECTOR_RETRIGGERED) { - pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n", + if (desc != VECTOR_RETRIGGERED) { + pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", __func__, smp_processor_id(), - vector, irq); + vector); } else { - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } @@ -330,10 +347,10 @@ static struct cpumask affinity_new, online_new; */ int check_irq_vectors_for_cpu_disable(void) { - int irq, cpu; unsigned int this_cpu, vector, this_count, count; struct irq_desc *desc; struct irq_data *data; + int cpu; this_cpu = smp_processor_id(); cpumask_copy(&online_new, cpu_online_mask); @@ -341,47 +358,43 @@ int check_irq_vectors_for_cpu_disable(void) this_count = 0; for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - irq = __this_cpu_read(vector_irq[vector]); - if (irq >= 0) { - desc = irq_to_desc(irq); - if (!desc) - continue; - - /* - * Protect against concurrent action removal, - * affinity changes etc. - */ - raw_spin_lock(&desc->lock); - data = irq_desc_get_irq_data(desc); - cpumask_copy(&affinity_new, data->affinity); - cpumask_clear_cpu(this_cpu, &affinity_new); - - /* Do not count inactive or per-cpu irqs. 
*/ - if (!irq_has_action(irq) || irqd_is_per_cpu(data)) { - raw_spin_unlock(&desc->lock); - continue; - } + desc = __this_cpu_read(vector_irq[vector]); + if (IS_ERR_OR_NULL(desc)) + continue; + /* + * Protect against concurrent action removal, affinity + * changes etc. + */ + raw_spin_lock(&desc->lock); + data = irq_desc_get_irq_data(desc); + cpumask_copy(&affinity_new, + irq_data_get_affinity_mask(data)); + cpumask_clear_cpu(this_cpu, &affinity_new); + /* Do not count inactive or per-cpu irqs. */ + if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) { raw_spin_unlock(&desc->lock); - /* - * A single irq may be mapped to multiple - * cpu's vector_irq[] (for example IOAPIC cluster - * mode). In this case we have two - * possibilities: - * - * 1) the resulting affinity mask is empty; that is - * this the down'd cpu is the last cpu in the irq's - * affinity mask, or - * - * 2) the resulting affinity mask is no longer - * a subset of the online cpus but the affinity - * mask is not zero; that is the down'd cpu is the - * last online cpu in a user set affinity mask. - */ - if (cpumask_empty(&affinity_new) || - !cpumask_subset(&affinity_new, &online_new)) - this_count++; + continue; } + + raw_spin_unlock(&desc->lock); + /* + * A single irq may be mapped to multiple cpu's + * vector_irq[] (for example IOAPIC cluster mode). In + * this case we have two possibilities: + * + * 1) the resulting affinity mask is empty; that is + * this the down'd cpu is the last cpu in the irq's + * affinity mask, or + * + * 2) the resulting affinity mask is no longer a + * subset of the online cpus but the affinity mask is + * not zero; that is the down'd cpu is the last online + * cpu in a user set affinity mask. 
+ */ + if (cpumask_empty(&affinity_new) || + !cpumask_subset(&affinity_new, &online_new)) + this_count++; } count = 0; @@ -400,8 +413,8 @@ int check_irq_vectors_for_cpu_disable(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < first_system_vector; vector++) { if (!test_bit(vector, used_vectors) && - per_cpu(vector_irq, cpu)[vector] < 0) - count++; + IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) + count++; } } @@ -437,7 +450,7 @@ void fixup_irqs(void) raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); - affinity = data->affinity; + affinity = irq_data_get_affinity_mask(data); if (!irq_has_action(irq) || irqd_is_per_cpu(data) || cpumask_subset(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); @@ -505,14 +518,13 @@ void fixup_irqs(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; - if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED) + if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); if (irr & (1 << (vector % 32))) { - irq = __this_cpu_read(vector_irq[vector]); + desc = __this_cpu_read(vector_irq[vector]); - desc = irq_to_desc(irq); raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); @@ -523,7 +535,7 @@ void fixup_irqs(void) raw_spin_unlock(&desc->lock); } if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index cd74f5978..38da8f29a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -68,11 +68,10 @@ static inline void *current_stack(void) return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); } -static inline int -execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) +static inline int execute_on_irq_stack(int overflow, struct irq_desc 
*desc) { struct irq_stack *curstk, *irqstk; - u32 *isp, *prev_esp, arg1, arg2; + u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); irqstk = __this_cpu_read(hardirq_stack); @@ -98,8 +97,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) asm volatile("xchgl %%ebx,%%esp \n" "call *%%edi \n" "movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (isp) - : "0" (irq), "1" (desc), "2" (isp), + : "=a" (arg1), "=b" (isp) + : "0" (desc), "1" (isp), "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; @@ -148,21 +147,17 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } -bool handle_irq(unsigned irq, struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - struct irq_desc *desc; - int overflow; + int overflow = check_stack_overflow(); - overflow = check_stack_overflow(); - - desc = irq_to_desc(irq); - if (unlikely(!desc)) + if (IS_ERR_OR_NULL(desc)) return false; - if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) { + if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { if (unlikely(overflow)) print_stack_overflow(); - desc->handle_irq(irq, desc); + generic_handle_irq_desc(desc); } return true; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index bc4604e50..c767cf2bc 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -68,16 +68,13 @@ static inline void stack_overflow_check(struct pt_regs *regs) #endif } -bool handle_irq(unsigned irq, struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - struct irq_desc *desc; - stack_overflow_check(regs); - desc = irq_to_desc(irq); - if (unlikely(!desc)) + if (unlikely(IS_ERR_OR_NULL(desc))) return false; - generic_handle_irq_desc(irq, desc); + generic_handle_irq_desc(desc); return true; } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a3a5e158e..1423ab1b0 100644 --- a/arch/x86/kernel/irqinit.c +++ 
b/arch/x86/kernel/irqinit.c @@ -52,7 +52,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED, + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector) int cpu; for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED) + if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) return 1; } @@ -94,7 +94,7 @@ void __init init_IRQ(void) * irq's migrate etc. */ for (i = 0; i < nr_legacy_irqs(); i++) - per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i; + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); x86_init.irqs.intr_init(); } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 26d5a55a2..e565e0e4d 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -45,7 +45,7 @@ static void __jump_label_transform(struct jump_entry *entry, const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - if (type == JUMP_LABEL_ENABLE) { + if (type == JUMP_LABEL_JMP) { if (init) { /* * Jump label is enabled for the first time. 
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index ca83f7ac3..0f8a6bbaa 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -223,9 +223,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, memset(&params->hd0_info, 0, sizeof(params->hd0_info)); memset(&params->hd1_info, 0, sizeof(params->hd1_info)); - /* Default sysdesc table */ - params->sys_desc_table.length = 0; - if (image->type == KEXEC_TYPE_CRASH) { ret = crash_setup_memmap_entries(image, params); if (ret) @@ -536,7 +533,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) int ret; ret = verify_pefile_signature(kernel, kernel_len, - system_trusted_keyring, &trusted); + system_trusted_keyring, + VERIFYING_KEXEC_PE_SIGNATURE, + &trusted); if (ret < 0) return ret; if (!trusted) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 49487b488..2c7aafa70 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void) * kind of shutdown from our side, we unregister the clock by writting anything * that does not have the 'enable' bit set in the msr */ -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE static void kvm_crash_shutdown(struct pt_regs *regs) { native_write_msr(msr_kvm_system_time, 0, 0); @@ -259,7 +259,7 @@ void __init kvmclock_init(void) x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; machine_ops.shutdown = kvm_shutdown; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 2bcc0525f..6acc9dd91 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -58,7 +58,7 @@ static struct ldt_struct *alloc_ldt_struct(int size) if (alloc_size > PAGE_SIZE) new_ldt->entries = 
vzalloc(alloc_size); else - new_ldt->entries = kzalloc(PAGE_SIZE, GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); if (!new_ldt->entries) { kfree(new_ldt); @@ -95,7 +95,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(ldt->entries); else - kfree(ldt->entries); + free_page((unsigned long)ldt->entries); kfree(ldt); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index d05bd2e2e..697f90db0 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w) a->handler, whole_msecs, decimal_msecs); } -static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int nmi_handle(unsigned int type, struct pt_regs *regs) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -213,7 +213,7 @@ static void pci_serr_error(unsigned char reason, struct pt_regs *regs) { /* check to see if anyone registered against these types of errors */ - if (nmi_handle(NMI_SERR, regs, false)) + if (nmi_handle(NMI_SERR, regs)) return; pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", @@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) unsigned long i; /* check to see if anyone registered against these types of errors */ - if (nmi_handle(NMI_IO_CHECK, regs, false)) + if (nmi_handle(NMI_IO_CHECK, regs)) return; pr_emerg( @@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) * as only the first one is ever run (unless it can actually determine * if it caused the NMI) */ - handled = nmi_handle(NMI_UNKNOWN, regs, false); + handled = nmi_handle(NMI_UNKNOWN, regs); if (handled) { __this_cpu_add(nmi_stats.unknown, handled); return; @@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); - handled = nmi_handle(NMI_LOCAL, regs, b2b); + handled = nmi_handle(NMI_LOCAL, regs); 
__this_cpu_add(nmi_stats.normal, handled); if (handled) { /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ebb5657ee..c2130aef3 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -359,9 +359,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .write_msr = native_write_msr_safe, - .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index e1b013696..c89f50a76 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); -DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); @@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); - PATCH_SITE(pv_cpu_ops, read_tsc); #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 353972c19..cd99433b8 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev); /* Number of entries preallocated for DMA-API debugging */ #define PREALLOC_DMA_DEBUG_ENTRIES 65536 -int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - 
*dev->dma_mask = mask; - - return 0; -} -EXPORT_SYMBOL(dma_set_mask); - void __init pci_iommu_alloc(void) { struct iommu_table_entry *p; @@ -140,50 +129,20 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } -void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - void *memory; - - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) - dev = &x86_dma_fallback_dev; - - if (!is_device_dma_capable(dev)) - return NULL; - - if (!ops->alloc) - return NULL; - - memory = ops->alloc(dev, size, dma_handle, - dma_alloc_coherent_gfp_flags(dev, gfp), attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, memory); - - return memory; -} -EXPORT_SYMBOL(dma_alloc_attrs); - -void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus, - struct dma_attrs *attrs) +bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) { - struct dma_map_ops *ops = get_dma_ops(dev); + if (!*dev) + *dev = &x86_dma_fallback_dev; - WARN_ON(irqs_disabled()); /* for portability */ + *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp); - if (dma_release_from_coherent(dev, get_order(size), vaddr)) - return; + if (!is_device_dma_capable(*dev)) + return false; + return true; - debug_dma_free_coherent(dev, size, vaddr, bus); - if (ops->free) - ops->free(dev, size, vaddr, bus, attrs); } -EXPORT_SYMBOL(dma_free_attrs); +EXPORT_SYMBOL(arch_dma_alloc_attrs); /* * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 64f90f53b..4f00b63d7 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -3,80 +3,17 @@ * Copyright (c) 2015, Intel Corporation. 
*/ #include <linux/platform_device.h> -#include <linux/libnvdimm.h> #include <linux/module.h> -#include <asm/e820.h> - -static void e820_pmem_release(struct device *dev) -{ - struct nvdimm_bus *nvdimm_bus = dev->platform_data; - - if (nvdimm_bus) - nvdimm_bus_unregister(nvdimm_bus); -} - -static struct platform_device e820_pmem = { - .name = "e820_pmem", - .id = -1, - .dev = { - .release = e820_pmem_release, - }, -}; - -static const struct attribute_group *e820_pmem_attribute_groups[] = { - &nvdimm_bus_attribute_group, - NULL, -}; - -static const struct attribute_group *e820_pmem_region_attribute_groups[] = { - &nd_region_attribute_group, - &nd_device_attribute_group, - NULL, -}; static __init int register_e820_pmem(void) { - static struct nvdimm_bus_descriptor nd_desc; - struct device *dev = &e820_pmem.dev; - struct nvdimm_bus *nvdimm_bus; - int rc, i; - - rc = platform_device_register(&e820_pmem); - if (rc) - return rc; - - nd_desc.attr_groups = e820_pmem_attribute_groups; - nd_desc.provider_name = "e820"; - nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); - if (!nvdimm_bus) - goto err; - dev->platform_data = nvdimm_bus; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - struct resource res = { - .flags = IORESOURCE_MEM, - .start = ei->addr, - .end = ei->addr + ei->size - 1, - }; - struct nd_region_desc ndr_desc; - - if (ei->type != E820_PRAM) - continue; - - memset(&ndr_desc, 0, sizeof(ndr_desc)); - ndr_desc.res = &res; - ndr_desc.attr_groups = e820_pmem_region_attribute_groups; - ndr_desc.numa_node = NUMA_NO_NODE; - if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) - goto err; - } - - return 0; - - err: - dev_err(dev, "failed to register legacy persistent memory ranges\n"); - platform_device_unregister(&e820_pmem); - return -ENXIO; + struct platform_device *pdev; + + /* + * See drivers/nvdimm/e820.c for the implementation, this is + * simply here to trigger the module to load on demand. 
+ */ + pdev = platform_device_alloc("e820_pmem", -1); + return platform_device_add(pdev); } device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c27cad726..9f7c21c22 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -29,6 +29,8 @@ #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> +#include <asm/mce.h> +#include <asm/vm86.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -82,6 +84,9 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { memcpy(dst, src, arch_task_struct_size); +#ifdef CONFIG_VM86 + dst->thread.vm86 = NULL; +#endif return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } @@ -110,6 +115,8 @@ void exit_thread(void) kfree(bp); } + free_vm86(t); + fpu__drop(fpu); } @@ -319,6 +326,7 @@ void stop_this_cpu(void *dummy) */ set_cpu_online(smp_processor_id(), false); disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); for (;;) halt(); @@ -501,3 +509,58 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) return randomize_range(mm->brk, range_end, 0) ? : mm->brk; } +/* + * Called from fs/proc with a reference on @p to find the function + * which called into schedule(). This needs to be done carefully + * because the task might wake up and we might look at a stack + * changing under us. 
+ */ +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long start, bottom, top, sp, fp, ip; + int count = 0; + + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + + start = (unsigned long)task_stack_page(p); + if (!start) + return 0; + + /* + * Layout of the stack page: + * + * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) + * PADDING + * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING + * stack + * ----------- bottom = start + sizeof(thread_info) + * thread_info + * ----------- start + * + * The tasks stack pointer points at the location where the + * framepointer is stored. The data on the stack is: + * ... IP FP ... IP FP + * + * We need to read FP and IP, so we need to adjust the upper + * bound by another unsigned long. + */ + top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; + top -= 2 * sizeof(unsigned long); + bottom = start + sizeof(struct thread_info); + + sp = READ_ONCE(p->thread.sp); + if (sp < bottom || sp > top) + return 0; + + fp = READ_ONCE_NOCHECK(*(unsigned long *)sp); + do { + if (fp < bottom || fp > top) + return 0; + ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); + if (!in_sched_functions(ip)) + return ip; + fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); + } while (count++ < 16 && p->state != TASK_RUNNING); + return 0; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f73c962fe..737527b40 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -53,6 +53,7 @@ #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> +#include <asm/vm86.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); @@ -323,31 +324,3 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } - -#define top_esp (THREAD_SIZE - sizeof(unsigned long)) -#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned 
long)) - -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long bp, sp, ip; - unsigned long stack_page; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - stack_page = (unsigned long)task_stack_page(p); - sp = p->thread.sp; - if (!stack_page || sp < stack_page || sp > top_esp+stack_page) - return 0; - /* include/asm-i386/system.h:switch_to() pushes bp last. */ - bp = *(unsigned long *) sp; - do { - if (bp < stack_page || bp > top_ebp+stack_page) - return 0; - ip = *(unsigned long *) (bp+4); - if (!in_sched_functions(ip)) - return ip; - bp = *(unsigned long *) bp; - } while (count++ < 16); - return 0; -} - diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a90ac9556..b35921a67 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all) void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL if (dead_task->mm->context.ldt) { pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, @@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task) dead_task->mm->context.ldt->size); BUG(); } +#endif } } @@ -248,8 +250,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) __USER_CS, __USER_DS, 0); } -#ifdef CONFIG_IA32_EMULATION -void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +#ifdef CONFIG_COMPAT +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) { start_thread_common(regs, new_ip, new_sp, test_thread_flag(TIF_X32) @@ -497,62 +499,6 @@ void set_personality_ia32(bool x32) } EXPORT_SYMBOL_GPL(set_personality_ia32); -/* - * Called from fs/proc with a reference on @p to find the function - * which called into schedule(). This needs to be done carefully - * because the task might wake up and we might look at a stack - * changing under us. 
- */ -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long start, bottom, top, sp, fp, ip; - int count = 0; - - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - - start = (unsigned long)task_stack_page(p); - if (!start) - return 0; - - /* - * Layout of the stack page: - * - * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) - * PADDING - * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING - * stack - * ----------- bottom = start + sizeof(thread_info) - * thread_info - * ----------- start - * - * The tasks stack pointer points at the location where the - * framepointer is stored. The data on the stack is: - * ... IP FP ... IP FP - * - * We need to read FP and IP, so we need to adjust the upper - * bound by another unsigned long. - */ - top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; - top -= 2 * sizeof(unsigned long); - bottom = start + sizeof(struct thread_info); - - sp = READ_ONCE(p->thread.sp); - if (sp < bottom || sp > top) - return 0; - - fp = READ_ONCE(*(unsigned long *)sp); - do { - if (fp < bottom || fp > top) - return 0; - ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) - return ip; - fp = READ_ONCE(*(unsigned long *)fp); - } while (count++ < 16 && p->state != TASK_RUNNING); - return 0; -} - long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) { int ret = 0; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 9be72bc36..558f50ede 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -37,12 +37,10 @@ #include <asm/proto.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> +#include <asm/syscall.h> #include "tls.h" -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - enum x86_regset { REGSET_GENERAL, REGSET_FP, @@ -1123,6 +1121,73 @@ static int genregs32_set(struct task_struct *target, return ret; } +static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, 
+ compat_ulong_t caddr, compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + + switch (request) { + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct32), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_GENERAL, 0, + sizeof(struct user_regs_struct32), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_FP, 0, + sizeof(struct user_i387_ia32_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user( + child, &user_x86_32_view, REGSET_FP, + 0, sizeof(struct user_i387_ia32_struct), datap); + + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. 
*/ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_GET_THREAD_AREA: + case PTRACE_SET_THREAD_AREA: + return arch_ptrace(child, request, addr, data); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif /* CONFIG_IA32_EMULATION */ + #ifdef CONFIG_X86_X32_ABI static long x32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, @@ -1211,78 +1276,21 @@ static long x32_arch_ptrace(struct task_struct *child, } #endif +#ifdef CONFIG_COMPAT long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { - unsigned long addr = caddr; - unsigned long data = cdata; - void __user *datap = compat_ptr(data); - int ret; - __u32 val; - #ifdef CONFIG_X86_X32_ABI if (!is_ia32_task()) return x32_arch_ptrace(child, request, caddr, cdata); #endif - - switch (request) { - case PTRACE_PEEKUSR: - ret = getreg32(child, addr, &val); - if (ret == 0) - ret = put_user(val, (__u32 __user *)datap); - break; - - case PTRACE_POKEUSR: - ret = putreg32(child, addr, data); - break; - - case PTRACE_GETREGS: /* Get all gp regs from the child. */ - return copy_regset_to_user(child, &user_x86_32_view, - REGSET_GENERAL, - 0, sizeof(struct user_regs_struct32), - datap); - - case PTRACE_SETREGS: /* Set all gp regs in the child. */ - return copy_regset_from_user(child, &user_x86_32_view, - REGSET_GENERAL, 0, - sizeof(struct user_regs_struct32), - datap); - - case PTRACE_GETFPREGS: /* Get the child FPU state. */ - return copy_regset_to_user(child, &user_x86_32_view, - REGSET_FP, 0, - sizeof(struct user_i387_ia32_struct), - datap); - - case PTRACE_SETFPREGS: /* Set the child FPU state. */ - return copy_regset_from_user( - child, &user_x86_32_view, REGSET_FP, - 0, sizeof(struct user_i387_ia32_struct), datap); - - case PTRACE_GETFPXREGS: /* Get the child extended FPU state. 
*/ - return copy_regset_to_user(child, &user_x86_32_view, - REGSET_XFP, 0, - sizeof(struct user32_fxsr_struct), - datap); - - case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ - return copy_regset_from_user(child, &user_x86_32_view, - REGSET_XFP, 0, - sizeof(struct user32_fxsr_struct), - datap); - - case PTRACE_GET_THREAD_AREA: - case PTRACE_SET_THREAD_AREA: - return arch_ptrace(child, request, addr, data); - - default: - return compat_ptrace_request(child, request, addr, data); - } - - return ret; +#ifdef CONFIG_IA32_EMULATION + return ia32_arch_ptrace(child, request, caddr, cdata); +#else + return 0; +#endif } - -#endif /* CONFIG_IA32_EMULATION */ +#endif /* CONFIG_COMPAT */ #ifdef CONFIG_X86_64 @@ -1434,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); } - -static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) -{ -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - audit_syscall_entry(regs->orig_ax, regs->di, - regs->si, regs->dx, regs->r10); - } else -#endif - { - audit_syscall_entry(regs->orig_ax, regs->bx, - regs->cx, regs->dx, regs->si); - } -} - -/* - * We can return 0 to resume the syscall or anything else to go to phase - * 2. If we resume the syscall, we need to put something appropriate in - * regs->orig_ax. - * - * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax - * are fully functional. 
- * - * For phase 2's benefit, our return value is: - * 0: resume the syscall - * 1: go to phase 2; no seccomp phase 2 needed - * anything else: go to phase 2; pass return value to seccomp - */ -unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) -{ - unsigned long ret = 0; - u32 work; - - BUG_ON(regs != task_pt_regs(current)); - - work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; - - /* - * If TIF_NOHZ is set, we are required to call user_exit() before - * doing anything that could touch RCU. - */ - if (work & _TIF_NOHZ) { - user_exit(); - work &= ~_TIF_NOHZ; - } - -#ifdef CONFIG_SECCOMP - /* - * Do seccomp first -- it should minimize exposure of other - * code, and keeping seccomp fast is probably more valuable - * than the rest of this. - */ - if (work & _TIF_SECCOMP) { - struct seccomp_data sd; - - sd.arch = arch; - sd.nr = regs->orig_ax; - sd.instruction_pointer = regs->ip; -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - sd.args[0] = regs->di; - sd.args[1] = regs->si; - sd.args[2] = regs->dx; - sd.args[3] = regs->r10; - sd.args[4] = regs->r8; - sd.args[5] = regs->r9; - } else -#endif - { - sd.args[0] = regs->bx; - sd.args[1] = regs->cx; - sd.args[2] = regs->dx; - sd.args[3] = regs->si; - sd.args[4] = regs->di; - sd.args[5] = regs->bp; - } - - BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); - BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); - - ret = seccomp_phase1(&sd); - if (ret == SECCOMP_PHASE1_SKIP) { - regs->orig_ax = -1; - ret = 0; - } else if (ret != SECCOMP_PHASE1_OK) { - return ret; /* Go directly to phase 2 */ - } - - work &= ~_TIF_SECCOMP; - } -#endif - - /* Do our best to finish without phase 2. */ - if (work == 0) - return ret; /* seccomp and/or nohz only (ret == 0 here) */ - -#ifdef CONFIG_AUDITSYSCALL - if (work == _TIF_SYSCALL_AUDIT) { - /* - * If there is no more work to be done except auditing, - * then audit in phase 1. Phase 2 always audits, so, if - * we audit here, then we can't go on to phase 2. 
- */ - do_audit_syscall_entry(regs, arch); - return 0; - } -#endif - - return 1; /* Something is enabled that we can't handle in phase 1 */ -} - -/* Returns the syscall nr to run (which should match regs->orig_ax). */ -long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, - unsigned long phase1_result) -{ - long ret = 0; - u32 work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; - - BUG_ON(regs != task_pt_regs(current)); - - /* - * If we stepped into a sysenter/syscall insn, it trapped in - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. - * If user-mode had set TF itself, then it's still clear from - * do_debug() and we need to set it again to restore the user - * state. If we entered on the slow path, TF was already set. - */ - if (work & _TIF_SINGLESTEP) - regs->flags |= X86_EFLAGS_TF; - -#ifdef CONFIG_SECCOMP - /* - * Call seccomp_phase2 before running the other hooks so that - * they can see any changes made by a seccomp tracer. - */ - if (phase1_result > 1 && seccomp_phase2(phase1_result)) { - /* seccomp failures shouldn't expose any additional code. */ - return -1; - } -#endif - - if (unlikely(work & _TIF_SYSCALL_EMU)) - ret = -1L; - - if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && - tracehook_report_syscall_entry(regs)) - ret = -1L; - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->orig_ax); - - do_audit_syscall_entry(regs, arch); - - return ret ?: regs->orig_ax; -} - -long syscall_trace_enter(struct pt_regs *regs) -{ - u32 arch = is_ia32_task() ? 
AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); - - if (phase1_result == 0) - return regs->orig_ax; - else - return syscall_trace_enter_phase2(regs, arch, phase1_result); -} - -void syscall_trace_leave(struct pt_regs *regs) -{ - bool step; - - /* - * We may come here right after calling schedule_user() - * or do_notify_resume(), in which case we can be in RCU - * user mode. - */ - user_exit(); - - audit_syscall_exit(regs); - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_exit(regs, regs->ax); - - /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). - * We already reported this syscall instruction in - * syscall_trace_enter(). - */ - step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && - !test_thread_flag(TIF_SYSCALL_EMU); - if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); - - user_enter(); -} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 86db4bcd7..02693dd9a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -673,7 +673,7 @@ struct machine_ops machine_ops = { .emergency_restart = native_machine_emergency_restart, .restart = native_machine_restart, .halt = native_machine_halt, -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE .crash_shutdown = native_machine_crash_shutdown, #endif }; @@ -703,7 +703,7 @@ void machine_halt(void) machine_ops.halt(); } -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE void machine_crash_shutdown(struct pt_regs *regs) { machine_ops.crash_shutdown(regs); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80f874bf9..37c8ea8e7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void) return ramdisk_size; } -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { /* Assume only end 
is not page aligned */ u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); - unsigned long slop, clen, mapaddr; - char *p, *q; /* We need to move the initrd down into directly mapped mem */ relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), @@ -343,25 +340,8 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); - q = (char *)initrd_start; - - /* Copy the initrd */ - while (ramdisk_size) { - slop = ramdisk_image & ~PAGE_MASK; - clen = ramdisk_size; - if (clen > MAX_MAP_CHUNK-slop) - clen = MAX_MAP_CHUNK-slop; - mapaddr = ramdisk_image & PAGE_MASK; - p = early_memremap(mapaddr, clen+slop); - memcpy(q, p+slop, clen); - early_memunmap(p, clen+slop); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } + copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); - ramdisk_image = get_ramdisk_image(); - ramdisk_size = get_ramdisk_size(); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, @@ -498,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) * --------- Crashkernel reservation ------------------------------ */ -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE /* * Keep the crash kernel below this limit. 
On 32 bits earlier kernels @@ -916,11 +896,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 apm_info.bios = boot_params.apm_bios_info; ist_info = boot_params.ist_info; - if (boot_params.sys_desc_table.length != 0) { - machine_id = boot_params.sys_desc_table.table[0]; - machine_submodel_id = boot_params.sys_desc_table.table[1]; - BIOS_revision = boot_params.sys_desc_table.table[2]; - } #endif saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; @@ -1198,6 +1173,14 @@ void __init setup_arch(char **cmdline_p) clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. + */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); #endif tboot_probe(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 71820c42b..da52e6bb5 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -31,11 +31,11 @@ #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> +#include <asm/vm86.h> #ifdef CONFIG_X86_64 #include <asm/proto.h> #include <asm/ia32_unistd.h> -#include <asm/sys_ia32.h> #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> @@ -632,6 +632,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) bool stepping, failed; struct fpu *fpu = ¤t->thread.fpu; + if (v8086_mode(regs)) + save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); + /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ @@ -697,7 +700,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. 
*/ -static void do_signal(struct pt_regs *regs) +void do_signal(struct pt_regs *regs) { struct ksignal ksig; @@ -732,32 +735,6 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } -/* - * notification of userspace execution resumption - * - triggered by the TIF_WORK_MASK flags - */ -__visible void -do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) -{ - user_exit(); - - if (thread_info_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } - if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) - fire_user_return_notifiers(); - - user_enter(); -} - void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c new file mode 100644 index 000000000..dc3c0b1c8 --- /dev/null +++ b/arch/x86/kernel/signal_compat.c @@ -0,0 +1,95 @@ +#include <linux/compat.h> +#include <linux/uaccess.h> + +int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +{ + int err = 0; + bool ia32 = test_thread_flag(TIF_IA32); + + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) + return -EFAULT; + + put_user_try { + /* If you change siginfo_t structure, please make sure that + this code is fixed accordingly. + It should never copy any pad contained in the structure + to avoid security leaks, but must copy the generic + 3 ints plus the relevant union member. 
*/ + put_user_ex(from->si_signo, &to->si_signo); + put_user_ex(from->si_errno, &to->si_errno); + put_user_ex((short)from->si_code, &to->si_code); + + if (from->si_code < 0) { + put_user_ex(from->si_pid, &to->si_pid); + put_user_ex(from->si_uid, &to->si_uid); + put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); + } else { + /* + * First 32bits of unions are always present: + * si_pid === si_band === si_tid === si_addr(LS half) + */ + put_user_ex(from->_sifields._pad[0], + &to->_sifields._pad[0]); + switch (from->si_code >> 16) { + case __SI_FAULT >> 16: + break; + case __SI_SYS >> 16: + put_user_ex(from->si_syscall, &to->si_syscall); + put_user_ex(from->si_arch, &to->si_arch); + break; + case __SI_CHLD >> 16: + if (ia32) { + put_user_ex(from->si_utime, &to->si_utime); + put_user_ex(from->si_stime, &to->si_stime); + } else { + put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); + put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); + } + put_user_ex(from->si_status, &to->si_status); + /* FALL THROUGH */ + default: + case __SI_KILL >> 16: + put_user_ex(from->si_uid, &to->si_uid); + break; + case __SI_POLL >> 16: + put_user_ex(from->si_fd, &to->si_fd); + break; + case __SI_TIMER >> 16: + put_user_ex(from->si_overrun, &to->si_overrun); + put_user_ex(ptr_to_compat(from->si_ptr), + &to->si_ptr); + break; + /* This is not generated by the kernel as of now. 
*/ + case __SI_RT >> 16: + case __SI_MESGQ >> 16: + put_user_ex(from->si_uid, &to->si_uid); + put_user_ex(from->si_int, &to->si_int); + break; + } + } + } put_user_catch(err); + + return err; +} + +int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) +{ + int err = 0; + u32 ptr32; + + if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) + return -EFAULT; + + get_user_try { + get_user_ex(to->si_signo, &from->si_signo); + get_user_ex(to->si_errno, &from->si_errno); + get_user_ex(to->si_code, &from->si_code); + + get_user_ex(to->si_pid, &from->si_pid); + get_user_ex(to->si_uid, &from->si_uid); + get_user_ex(ptr32, &from->si_ptr); + to->si_ptr = compat_ptr(ptr32); + } get_user_catch(err); + + return err; +} diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 15aaa69bb..12c828620 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -30,6 +30,7 @@ #include <asm/proto.h> #include <asm/apic.h> #include <asm/nmi.h> +#include <asm/mce.h> #include <asm/trace/irq_vectors.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -243,6 +244,7 @@ static void native_stop_other_cpus(int wait) finish: local_irq_save(flags); disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); local_irq_restore(flags); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b1f3ed9c7..892ee2e5e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -97,8 +97,6 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); -atomic_t init_deasserted; - static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -146,16 +144,11 @@ static void smp_callin(void) /* * If waken up by an INIT in an 82489DX configuration - * we may get here before an INIT-deassert IPI reaches - * our local APIC. 
We have to wait for the IPI or we'll - * lock up on an APIC access. - * - * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI. + * cpu_callout_mask guarantees we don't get here before + * an INIT_deassert IPI reaches our local APIC, so it is + * now safe to touch our local APIC. */ cpuid = smp_processor_id(); - if (apic->wait_for_init_deassert && cpuid) - while (!atomic_read(&init_deasserted)) - cpu_relax(); /* * (This works even if the APIC is not enabled.) @@ -516,7 +509,7 @@ void __inquire_remote_apic(int apicid) */ #define UDELAY_10MS_DEFAULT 10000 -static unsigned int init_udelay = UDELAY_10MS_DEFAULT; +static unsigned int init_udelay = INT_MAX; static int __init cpu_init_udelay(char *str) { @@ -529,13 +522,16 @@ early_param("cpu_init_udelay", cpu_init_udelay); static void __init smp_quirk_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ - if (init_udelay != UDELAY_10MS_DEFAULT) + if (init_udelay != INT_MAX) return; /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) init_udelay = 0; + + /* else, use legacy delay */ + init_udelay = UDELAY_10MS_DEFAULT; } /* @@ -620,7 +616,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) send_status = safe_apic_wait_icr_idle(); mb(); - atomic_set(&init_deasserted, 1); /* * Should we send STARTUP IPIs ? @@ -665,7 +660,10 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Give the other CPU some time to accept the IPI. */ - udelay(300); + if (init_udelay == 0) + udelay(10); + else + udelay(300); pr_debug("Startup point 1\n"); @@ -675,7 +673,10 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Give the other CPU some time to accept the IPI. 
*/ - udelay(200); + if (init_udelay == 0) + udelay(10); + else + udelay(200); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -859,8 +860,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) * the targeted processor. */ - atomic_set(&init_deasserted, 0); - if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { pr_debug("Setting warm reset code and vector.\n"); @@ -898,7 +897,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) if (!boot_error) { /* - * Wait 10s total for a response from AP + * Wait 10s total for first sign of life from AP */ boot_error = -1; timeout = jiffies + 10*HZ; @@ -911,7 +910,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) boot_error = 0; break; } - udelay(100); schedule(); } } @@ -927,7 +925,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) * for the MTRR work(triggered by the AP coming online) * to be completed in the stop machine context. */ - udelay(100); schedule(); } } @@ -1358,7 +1355,7 @@ static void remove_siblinginfo(int cpu) cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); } -static void __ref remove_cpu_from_maps(int cpu) +static void remove_cpu_from_maps(int cpu) { set_cpu_online(cpu, false); cpumask_clear_cpu(cpu, cpu_callout_mask); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 0ccb53a9f..c9a073866 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re return addr; } +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * We'll assume that the code segments in the GDT * are all zero-based. 
That is largely true: the @@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re } mutex_unlock(&child->mm->context.lock); } +#endif return addr; } diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 649b010da..12cbe2b88 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -57,7 +57,7 @@ __setup("cpu0_hotplug", enable_cpu0_hotplug); * * This is only called for debugging CPU offline/online feature. */ -int __ref _debug_hotplug_cpu(int cpu, int action) +int _debug_hotplug_cpu(int cpu, int action) { struct device *dev = get_cpu_device(cpu); int ret; @@ -104,7 +104,7 @@ static int __init debug_hotplug_cpu(void) late_initcall_sync(debug_hotplug_cpu); #endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */ -int __ref arch_register_cpu(int num) +int arch_register_cpu(int num) { struct cpuinfo_x86 *c = &cpu_data(num); diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c index 25b993729..80bb24d9b 100644 --- a/arch/x86/kernel/trace_clock.c +++ b/arch/x86/kernel/trace_clock.c @@ -12,10 +12,5 @@ */ u64 notrace trace_clock_x86_tsc(void) { - u64 ret; - - rdtsc_barrier(); - rdtscll(ret); - - return ret; + return rdtsc_ordered(); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f5791927a..346eec73f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -62,6 +62,7 @@ #include <asm/fpu/xstate.h> #include <asm/trace/mpx.h> #include <asm/mpx.h> +#include <asm/vm86.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -108,13 +109,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) preempt_count_dec(); } -enum ctx_state ist_enter(struct pt_regs *regs) +void ist_enter(struct pt_regs *regs) { - enum ctx_state prev_state; - if (user_mode(regs)) { - /* Other than that, we're just an exception. 
*/ - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); } else { /* * We might have interrupted pretty much anything. In @@ -123,32 +121,25 @@ enum ctx_state ist_enter(struct pt_regs *regs) * but we need to notify RCU. */ rcu_nmi_enter(); - prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */ } /* - * We are atomic because we're on the IST stack (or we're on x86_32, - * in which case we still shouldn't schedule). - * - * This must be after exception_enter(), because exception_enter() - * won't do anything if in_interrupt() returns true. + * We are atomic because we're on the IST stack; or we're on + * x86_32, in which case we still shouldn't schedule; or we're + * on x86_64 and entered from user mode, in which case we're + * still atomic unless ist_begin_non_atomic is called. */ preempt_count_add(HARDIRQ_OFFSET); /* This code is a bit fragile. Test it. */ - rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work"); - - return prev_state; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); } -void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) +void ist_exit(struct pt_regs *regs) { - /* Must be before exception_exit. */ preempt_count_sub(HARDIRQ_OFFSET); - if (user_mode(regs)) - return exception_exit(prev_state); - else + if (!user_mode(regs)) rcu_nmi_exit(); } @@ -162,7 +153,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) * a double fault, it can be safe to schedule. ist_begin_non_atomic() * begins a non-atomic section within an ist_enter()/ist_exit() region. * Callers are responsible for enabling interrupts themselves inside - * the non-atomic section, and callers must call is_end_non_atomic() + * the non-atomic section, and callers must call ist_end_non_atomic() * before ist_exit(). 
*/ void ist_begin_non_atomic(struct pt_regs *regs) @@ -289,17 +280,16 @@ NOKPROBE_SYMBOL(do_trap); static void do_error_trap(struct pt_regs *regs, long error_code, char *str, unsigned long trapnr, int signr) { - enum ctx_state prev_state = exception_enter(); siginfo_t info; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { conditional_sti(regs); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } - - exception_exit(prev_state); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -351,7 +341,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) } #endif - ist_enter(regs); /* Discard prev_state because we won't return. */ + ist_enter(regs); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -371,14 +361,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; const struct bndcsr *bndcsr; siginfo_t *info; - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) - goto exit; + return; conditional_sti(regs); if (!user_mode(regs)) @@ -435,9 +424,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) die("bounds", regs, error_code); } -exit: - exception_exit(prev_state); return; + exit_trap: /* * This path out is for all the cases where we could not @@ -447,35 +435,33 @@ exit_trap: * time.. 
*/ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); - exception_exit(prev_state); } dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; - enum ctx_state prev_state; - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); conditional_sti(regs); if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - goto exit; + return; } tsk = current; if (!user_mode(regs)) { if (fixup_exception(regs)) - goto exit; + return; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) die("general protection fault", regs, error_code); - goto exit; + return; } tsk->thread.error_code = error_code; @@ -491,16 +477,12 @@ do_general_protection(struct pt_regs *regs, long error_code) } force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); -exit: - exception_exit(prev_state); } NOKPROBE_SYMBOL(do_general_protection); /* May run on IST stack. */ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - #ifdef CONFIG_DYNAMIC_FTRACE /* * ftrace must be first, everything else may cause a recursive crash. 
@@ -513,7 +495,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) if (poke_int3_handler(regs)) return; - prev_state = ist_enter(regs); + ist_enter(regs); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -539,7 +522,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); debug_stack_usage_dec(); exit: - ist_exit(regs, prev_state); + ist_exit(regs); } NOKPROBE_SYMBOL(do_int3); @@ -615,12 +598,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret); dotraplinkage void do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - enum ctx_state prev_state; int user_icebp = 0; unsigned long dr6; int si_code; - prev_state = ist_enter(regs); + ist_enter(regs); get_debugreg(dr6, 6); @@ -695,7 +677,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - ist_exit(regs, prev_state); + ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); @@ -747,21 +729,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); math_error(regs, error_code, X86_TRAP_MF); - exception_exit(prev_state); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); math_error(regs, error_code, X86_TRAP_XF); - exception_exit(prev_state); } dotraplinkage void @@ -773,9 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { - enum 
ctx_state prev_state; - - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION @@ -786,7 +760,6 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); - exception_exit(prev_state); return; } #endif @@ -794,7 +767,6 @@ do_device_not_available(struct pt_regs *regs, long error_code) #ifdef CONFIG_X86_32 conditional_sti(regs); #endif - exception_exit(prev_state); } NOKPROBE_SYMBOL(do_device_not_available); @@ -802,9 +774,8 @@ NOKPROBE_SYMBOL(do_device_not_available); dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; - enum ctx_state prev_state; - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); info.si_signo = SIGILL; @@ -816,7 +787,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, &info); } - exception_exit(prev_state); } #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0e8151b4b..f86453801 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -40,7 +40,7 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; -static struct static_key __use_tsc = STATIC_KEY_INIT; +static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; @@ -254,7 +254,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) data = cyc2ns_write_begin(cpu); - rdtscll(tsc_now); + tsc_now = rdtsc(); ns_now = cycles_2_ns(tsc_now); /* @@ -280,7 +280,12 @@ done: */ u64 native_sched_clock(void) { - u64 tsc_now; + if (static_branch_likely(&__use_tsc)) { + u64 tsc_now = rdtsc(); + + /* return the value in ns */ + return cycles_2_ns(tsc_now); + } /* * Fall back to jiffies if there's no TSC available: @@ -290,16 +295,17 
@@ u64 native_sched_clock(void) * very important for it to be as fast as the platform * can achieve it. ) */ - if (!static_key_false(&__use_tsc)) { - /* No locking but a rare wrong value is not a big deal: */ - return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); - } - /* read the Time Stamp Counter: */ - rdtscll(tsc_now); + /* No locking but a rare wrong value is not a big deal: */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); +} - /* return the value in ns */ - return cycles_2_ns(tsc_now); +/* + * Generate a sched_clock if you already have a TSC value. + */ +u64 native_sched_clock_from_tsc(u64 tsc) +{ + return cycles_2_ns(tsc); } /* We need to define a real function for sched_clock, to override the @@ -314,12 +320,6 @@ unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); #endif -unsigned long long native_read_tsc(void) -{ - return __native_read_tsc(); -} -EXPORT_SYMBOL(native_read_tsc); - int check_tsc_unstable(void) { return tsc_unstable; @@ -982,7 +982,7 @@ static struct clocksource clocksource_tsc; */ static cycle_t read_tsc(struct clocksource *cs) { - return (cycle_t)get_cycles(); + return (cycle_t)rdtsc_ordered(); } /* @@ -1218,7 +1218,7 @@ void __init tsc_init(void) /* now allow native_sched_clock() to use rdtsc */ tsc_disabled = 0; - static_key_slow_inc(&__use_tsc); + static_branch_enable(&__use_tsc); if (!no_sched_irq_time) enable_sched_clock_irqtime(); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index dd8d0791d..78083bf23 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -39,16 +39,15 @@ static cycles_t max_warp; static int nr_warps; /* - * TSC-warp measurement loop running on both CPUs: + * TSC-warp measurement loop running on both CPUs. This is not called + * if there is no TSC. 
*/ static void check_tsc_warp(unsigned int timeout) { cycles_t start, now, prev, end; int i; - rdtsc_barrier(); - start = get_cycles(); - rdtsc_barrier(); + start = rdtsc_ordered(); /* * The measurement runs for 'timeout' msecs: */ @@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout) */ arch_spin_lock(&sync_lock); prev = last_tsc; - rdtsc_barrier(); - now = get_cycles(); - rdtsc_barrier(); + now = rdtsc_ordered(); last_tsc = now; arch_spin_unlock(&sync_lock); @@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu) /* * No need to check if we already know that the TSC is not - * synchronized: + * synchronized or if we have no TSC. */ if (unsynchronized_tsc()) return; @@ -190,6 +187,7 @@ void check_tsc_sync_target(void) { int cpus = 2; + /* Also aborts if there is no TSC. */ if (unsynchronized_tsc() || tsc_clocksource_reliable) return; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 664762447..bf4db6eae 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -985,3 +985,12 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return -1; } + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */ + return regs->sp < ret->stack; + else + return regs->sp <= ret->stack; +} diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index b9242bacb..4cf401f58 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -34,10 +34,11 @@ #include <asm/msr-index.h> verify_cpu: - pushfl # Save caller passed flags - pushl $0 # Kill any dangerous flags - popfl + pushf # Save caller passed flags + push $0 # Kill any dangerous flags + popf +#ifndef __x86_64__ pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx @@ -48,6 +49,7 @@ verify_cpu: popl %eax cmpl %eax,%ebx jz verify_cpu_no_longmode # cpu has no cpuid +#endif movl 
$0x0,%eax # See if cpuid 1 is implemented cpuid @@ -130,10 +132,10 @@ verify_cpu_sse_test: jmp verify_cpu_sse_test # try again verify_cpu_no_longmode: - popfl # Restore caller passed flags + popf # Restore caller passed flags movl $1,%eax ret verify_cpu_sse_ok: - popfl # Restore caller passed flags + popf # Restore caller passed flags xorl %eax, %eax ret diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index fc9db6ef2..524619351 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -44,11 +44,15 @@ #include <linux/ptrace.h> #include <linux/audit.h> #include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/security.h> #include <asm/uaccess.h> #include <asm/io.h> #include <asm/tlbflush.h> #include <asm/irq.h> +#include <asm/traps.h> +#include <asm/vm86.h> /* * Known problems: @@ -66,10 +70,6 @@ */ -#define KVM86 ((struct kernel_vm86_struct *)regs) -#define VMPI KVM86->vm86plus - - /* * 8- and 16-bit register defines.. */ @@ -81,8 +81,8 @@ /* * virtual flags (16 and 32-bit versions) */ -#define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) -#define VEFLAGS (current->thread.v86flags) +#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags)) +#define VEFLAGS (current->thread.vm86->veflags) #define set_flags(X, new, mask) \ ((X) = ((X) & ~(mask)) | ((new) & (mask))) @@ -90,46 +90,13 @@ #define SAFE_MASK (0xDD5) #define RETURN_MASK (0xDFF) -/* convert kernel_vm86_regs to vm86_regs */ -static int copy_vm86_regs_to_user(struct vm86_regs __user *user, - const struct kernel_vm86_regs *regs) -{ - int ret = 0; - - /* - * kernel_vm86_regs is missing gs, so copy everything up to - * (but not including) orig_eax, and then rest including orig_eax. 
- */ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); - ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax, - sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_ax)); - - return ret; -} - -/* convert vm86_regs to kernel_vm86_regs */ -static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, - const struct vm86_regs __user *user, - unsigned extra) -{ - int ret = 0; - - /* copy ax-fs inclusive */ - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); - /* copy orig_ax-__gsh+extra */ - ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax, - sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_ax) + - extra); - return ret; -} - -struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) +void save_v86_state(struct kernel_vm86_regs *regs, int retval) { struct tss_struct *tss; - struct pt_regs *ret; - unsigned long tmp; + struct task_struct *tsk = current; + struct vm86plus_struct __user *user; + struct vm86 *vm86 = current->thread.vm86; + long err = 0; /* * This gets called from entry.S with interrupts disabled, but @@ -138,31 +105,57 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) */ local_irq_enable(); - if (!current->thread.vm86_info) { - pr_alert("no vm86_info: BAD\n"); + if (!vm86 || !vm86->user_vm86) { + pr_alert("no user_vm86: BAD\n"); + do_exit(SIGSEGV); + } + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); + user = vm86->user_vm86; + + if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
+ sizeof(struct vm86plus_struct) : + sizeof(struct vm86_struct))) { + pr_alert("could not access userspace vm86 info\n"); do_exit(SIGSEGV); } - set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); - tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs); - tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap); - if (tmp) { - pr_alert("could not access userspace vm86_info\n"); + + put_user_try { + put_user_ex(regs->pt.bx, &user->regs.ebx); + put_user_ex(regs->pt.cx, &user->regs.ecx); + put_user_ex(regs->pt.dx, &user->regs.edx); + put_user_ex(regs->pt.si, &user->regs.esi); + put_user_ex(regs->pt.di, &user->regs.edi); + put_user_ex(regs->pt.bp, &user->regs.ebp); + put_user_ex(regs->pt.ax, &user->regs.eax); + put_user_ex(regs->pt.ip, &user->regs.eip); + put_user_ex(regs->pt.cs, &user->regs.cs); + put_user_ex(regs->pt.flags, &user->regs.eflags); + put_user_ex(regs->pt.sp, &user->regs.esp); + put_user_ex(regs->pt.ss, &user->regs.ss); + put_user_ex(regs->es, &user->regs.es); + put_user_ex(regs->ds, &user->regs.ds); + put_user_ex(regs->fs, &user->regs.fs); + put_user_ex(regs->gs, &user->regs.gs); + + put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); + } put_user_catch(err); + if (err) { + pr_alert("could not access userspace vm86 info\n"); do_exit(SIGSEGV); } tss = &per_cpu(cpu_tss, get_cpu()); - current->thread.sp0 = current->thread.saved_sp0; - current->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, &current->thread); - current->thread.saved_sp0 = 0; + tsk->thread.sp0 = vm86->saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; + load_sp0(tss, &tsk->thread); + vm86->saved_sp0 = 0; put_cpu(); - ret = KVM86->regs32; + memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs)); - ret->fs = current->thread.saved_fs; - set_user_gs(ret, current->thread.saved_gs); + lazy_load_gs(vm86->regs32.gs); - return ret; + regs->pt.ax = retval; } static void mark_screen_rdonly(struct mm_struct *mm) @@ -200,45 +193,16 @@ out:
static int do_vm86_irq_handling(int subfunction, int irqnumber); -static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); +static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); -SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) +SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86) { - struct kernel_vm86_struct info; /* declare this _on top_, - * this avoids wasting of stack space. - * This remains on the stack until we - * return to 32 bit user space. - */ - struct task_struct *tsk = current; - int tmp; - - if (tsk->thread.saved_sp0) - return -EPERM; - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, - offsetof(struct kernel_vm86_struct, vm86plus) - - sizeof(info.regs)); - if (tmp) - return -EFAULT; - memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); - info.regs32 = current_pt_regs(); - tsk->thread.vm86_info = v86; - do_sys_vm86(&info, tsk); - return 0; /* we never return here */ + return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false); } SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) { - struct kernel_vm86_struct info; /* declare this _on top_, - * this avoids wasting of stack space. - * This remains on the stack until we - * return to 32 bit user space. 
- */ - struct task_struct *tsk; - int tmp; - struct vm86plus_struct __user *v86; - - tsk = current; switch (cmd) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: @@ -256,114 +220,159 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) } /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ - if (tsk->thread.saved_sp0) - return -EPERM; - v86 = (struct vm86plus_struct __user *)arg; - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, - offsetof(struct kernel_vm86_struct, regs32) - - sizeof(info.regs)); - if (tmp) - return -EFAULT; - info.regs32 = current_pt_regs(); - info.vm86plus.is_vm86pus = 1; - tsk->thread.vm86_info = (struct vm86_struct __user *)v86; - do_sys_vm86(&info, tsk); - return 0; /* we never return here */ + return do_sys_vm86((struct vm86plus_struct __user *) arg, true); } -static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) +static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) { struct tss_struct *tss; -/* - * make sure the vm86() system call doesn't try to do anything silly - */ - info->regs.pt.ds = 0; - info->regs.pt.es = 0; - info->regs.pt.fs = 0; -#ifndef CONFIG_X86_32_LAZY_GS - info->regs.pt.gs = 0; -#endif + struct task_struct *tsk = current; + struct vm86 *vm86 = tsk->thread.vm86; + struct kernel_vm86_regs vm86regs; + struct pt_regs *regs = current_pt_regs(); + unsigned long err = 0; + + err = security_mmap_addr(0); + if (err) { + /* + * vm86 cannot virtualize the address space, so vm86 users + * need to manage the low 1MB themselves using mmap. Given + * that BIOS places important data in the first page, vm86 + * is essentially useless if mmap_min_addr != 0. DOSEMU, + * for example, won't even bother trying to use vm86 if it + * can't map a page at virtual address 0. + * + * To reduce the available kernel attack surface, simply + * disallow vm86(old) for users who cannot mmap at va 0. 
+ * + * The implementation of security_mmap_addr will allow + * suitably privileged users to map va 0 even if + * vm.mmap_min_addr is set above 0, and we want this + * behavior for vm86 as well, as it ensures that legacy + * tools like vbetool will not fail just because of + * vm.mmap_min_addr. + */ + pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d). Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n", + current->comm, task_pid_nr(current), + from_kuid_munged(&init_user_ns, current_uid())); + return -EPERM; + } + + if (!vm86) { + if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL))) + return -ENOMEM; + tsk->thread.vm86 = vm86; + } + if (vm86->saved_sp0) + return -EPERM; + + if (!access_ok(VERIFY_READ, user_vm86, plus ? + sizeof(struct vm86_struct) : + sizeof(struct vm86plus_struct))) + return -EFAULT; + + memset(&vm86regs, 0, sizeof(vm86regs)); + get_user_try { + unsigned short seg; + get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); + get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); + get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); + get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); + get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); + get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); + get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); + get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); + get_user_ex(seg, &user_vm86->regs.cs); + vm86regs.pt.cs = seg; + get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); + get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); + get_user_ex(seg, &user_vm86->regs.ss); + vm86regs.pt.ss = seg; + get_user_ex(vm86regs.es, &user_vm86->regs.es); + get_user_ex(vm86regs.ds, &user_vm86->regs.ds); + get_user_ex(vm86regs.fs, &user_vm86->regs.fs); + get_user_ex(vm86regs.gs, &user_vm86->regs.gs); + + get_user_ex(vm86->flags, &user_vm86->flags); + get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); + get_user_ex(vm86->cpu_type, 
&user_vm86->cpu_type); + } get_user_catch(err); + if (err) + return err; + + if (copy_from_user(&vm86->int_revectored, + &user_vm86->int_revectored, + sizeof(struct revectored_struct))) + return -EFAULT; + if (copy_from_user(&vm86->int21_revectored, + &user_vm86->int21_revectored, + sizeof(struct revectored_struct))) + return -EFAULT; + if (plus) { + if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus, + sizeof(struct vm86plus_info_struct))) + return -EFAULT; + vm86->vm86plus.is_vm86pus = 1; + } else + memset(&vm86->vm86plus, 0, + sizeof(struct vm86plus_info_struct)); + + memcpy(&vm86->regs32, regs, sizeof(struct pt_regs)); + vm86->user_vm86 = user_vm86; /* * The flags register is also special: we cannot trust that the user * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. */ - VEFLAGS = info->regs.pt.flags; - info->regs.pt.flags &= SAFE_MASK; - info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; - info->regs.pt.flags |= X86_VM_MASK; + VEFLAGS = vm86regs.pt.flags; + vm86regs.pt.flags &= SAFE_MASK; + vm86regs.pt.flags |= regs->flags & ~SAFE_MASK; + vm86regs.pt.flags |= X86_VM_MASK; + + vm86regs.pt.orig_ax = regs->orig_ax; - switch (info->cpu_type) { + switch (vm86->cpu_type) { case CPU_286: - tsk->thread.v86mask = 0; + vm86->veflags_mask = 0; break; case CPU_386: - tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; case CPU_486: - tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; default: - tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; } /* - * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) + * Save old state */ - info->regs32->ax = VM86_SIGNAL; - tsk->thread.saved_sp0 = tsk->thread.sp0; - 
tsk->thread.saved_fs = info->regs32->fs; - tsk->thread.saved_gs = get_user_gs(info->regs32); + vm86->saved_sp0 = tsk->thread.sp0; + lazy_save_gs(vm86->regs32.gs); tss = &per_cpu(cpu_tss, get_cpu()); - tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; + /* make room for real-mode segments */ + tsk->thread.sp0 += 16; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); put_cpu(); - tsk->thread.screen_bitmap = info->screen_bitmap; - if (info->flags & VM86_SCREEN_BITMAP) + if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call __audit_syscall_exit since we do not exit via the normal paths */ -#ifdef CONFIG_AUDITSYSCALL - if (unlikely(current->audit_context)) - __audit_syscall_exit(1, 0); -#endif - - __asm__ __volatile__( - "movl %0,%%esp\n\t" - "movl %1,%%ebp\n\t" -#ifdef CONFIG_X86_32_LAZY_GS - "mov %2, %%gs\n\t" -#endif - "jmp resume_userspace" - : /* no outputs */ - :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); - /* we never return here */ -} - -static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval) -{ - struct pt_regs *regs32; - - regs32 = save_v86_state(regs16); - regs32->ax = retval; - __asm__ __volatile__("movl %0,%%esp\n\t" - "movl %1,%%ebp\n\t" - "jmp resume_userspace" - : : "r" (regs32), "r" (current_thread_info())); + memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); + force_iret(); + return regs->ax; } static inline void set_IF(struct kernel_vm86_regs *regs) { VEFLAGS |= X86_EFLAGS_VIF; - if (VEFLAGS & X86_EFLAGS_VIP) - return_to_32bit(regs, VM86_STI); } static inline void clear_IF(struct kernel_vm86_regs *regs) @@ -395,7 +404,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs) static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs) { - set_flags(VEFLAGS, flags, current->thread.v86mask); + set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask); set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & 
X86_EFLAGS_IF) set_IF(regs); @@ -405,7 +414,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs) { - set_flags(VFLAGS, flags, current->thread.v86mask); + set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask); set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & X86_EFLAGS_IF) set_IF(regs); @@ -420,7 +429,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) if (VEFLAGS & X86_EFLAGS_VIF) flags |= X86_EFLAGS_IF; flags |= X86_EFLAGS_IOPL; - return flags | (VEFLAGS & current->thread.v86mask); + return flags | (VEFLAGS & current->thread.vm86->veflags_mask); } static inline int is_revectored(int nr, struct revectored_struct *bitmap) @@ -518,12 +527,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i, { unsigned long __user *intr_ptr; unsigned long segoffs; + struct vm86 *vm86 = current->thread.vm86; if (regs->pt.cs == BIOSSEG) goto cannot_handle; - if (is_revectored(i, &KVM86->int_revectored)) + if (is_revectored(i, &vm86->int_revectored)) goto cannot_handle; - if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored)) + if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored)) goto cannot_handle; intr_ptr = (unsigned long __user *) (i << 2); if (get_user(segoffs, intr_ptr)) @@ -542,18 +552,16 @@ static void do_int(struct kernel_vm86_regs *regs, int i, return; cannot_handle: - return_to_32bit(regs, VM86_INTx + (i << 8)); + save_v86_state(regs, VM86_INTx + (i << 8)); } int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) { - if (VMPI.is_vm86pus) { + struct vm86 *vm86 = current->thread.vm86; + + if (vm86->vm86plus.is_vm86pus) { if ((trapno == 3) || (trapno == 1)) { - KVM86->regs32->ax = VM86_TRAP + (trapno << 8); - /* setting this flag forces the code in entry_32.S to - the path where we call save_v86_state() and change - the stack pointer to KVM86->regs32 */ - 
set_thread_flag(TIF_NOTIFY_RESUME); + save_v86_state(regs, VM86_TRAP + (trapno << 8)); return 0; } do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); @@ -574,16 +582,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) unsigned char __user *ssp; unsigned short ip, sp, orig_flags; int data32, pref_done; + struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus; #define CHECK_IF_IN_TRAP \ - if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \ + if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \ newflags |= X86_EFLAGS_TF -#define VM86_FAULT_RETURN do { \ - if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \ - return_to_32bit(regs, VM86_PICRETURN); \ - if (orig_flags & X86_EFLAGS_TF) \ - handle_vm86_trap(regs, 0, 1); \ - return; } while (0) orig_flags = *(unsigned short *)&regs->pt.flags; @@ -622,7 +625,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) SP(regs) -= 2; } IP(regs) = ip; - VM86_FAULT_RETURN; + goto vm86_fault_return; /* popf */ case 0x9d: @@ -642,16 +645,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) else set_vflags_short(newflags, regs); - VM86_FAULT_RETURN; + goto check_vip; } /* int xx */ case 0xcd: { int intno = popb(csp, ip, simulate_sigsegv); IP(regs) = ip; - if (VMPI.vm86dbg_active) { - if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3]) - return_to_32bit(regs, VM86_INTx + (intno << 8)); + if (vmpi->vm86dbg_active) { + if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) { + save_v86_state(regs, VM86_INTx + (intno << 8)); + return; + } } do_int(regs, intno, ssp, sp); return; @@ -682,14 +687,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) } else { set_vflags_short(newflags, regs); } - VM86_FAULT_RETURN; + goto check_vip; } /* cli */ case 0xfa: IP(regs) = ip; clear_IF(regs); - VM86_FAULT_RETURN; + goto vm86_fault_return; /* sti */ /* @@ -701,14 +706,29 @@ void
handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) case 0xfb: IP(regs) = ip; set_IF(regs); - VM86_FAULT_RETURN; + goto check_vip; default: - return_to_32bit(regs, VM86_UNKNOWN); + save_v86_state(regs, VM86_UNKNOWN); } return; +check_vip: + if (VEFLAGS & X86_EFLAGS_VIP) { + save_v86_state(regs, VM86_STI); + return; + } + +vm86_fault_return: + if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) { + save_v86_state(regs, VM86_PICRETURN); + return; + } + if (orig_flags & X86_EFLAGS_TF) + handle_vm86_trap(regs, 0, X86_TRAP_DB); + return; + simulate_sigsegv: /* FIXME: After a long discussion with Stas we finally * agreed, that this is wrong. Here we should @@ -720,7 +740,7 @@ simulate_sigsegv: * should be a mixture of the two, but how do we * get the information? [KD] */ - return_to_32bit(regs, VM86_UNKNOWN); + save_v86_state(regs, VM86_UNKNOWN); } /* ---------------- vm86 special IRQ passing stuff ----------------- */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 00bf300fd..74e4bf11f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE #include <asm/kexec.h> . 
= ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 67d215cb8..a1ff508bb 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o + i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ + hyperv.o + kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2392541a9..1505587d0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -650,6 +650,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, u16 sel; la = seg_base(ctxt, addr.seg) + addr.ea; + *linear = la; *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: @@ -693,7 +694,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) return emulate_gp(ctxt, 0); - *linear = la; return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) @@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) #define GET_SMSTATE(type, smbase, offset) \ ({ \ type __val; \ - int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ - sizeof(__val), NULL); \ + int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ + sizeof(__val)); \ if (r != X86EMUL_CONTINUE) \ return X86EMUL_UNHANDLEABLE; \ __val; \ @@ -2484,17 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed - * to read_std/write_std are not virtual. - * - * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. + * CR0/CR3/CR4/EFER. 
It's all a bit more complicated if the vCPU + * supports long mode. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (emulator_has_longmode(ctxt)) { + struct desc_struct cs_desc; + + /* Zero CR4.PCIDE before CR0.PG. */ + if (cr4 & X86_CR4_PCIDE) { + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + cr4 &= ~X86_CR4_PCIDE; + } + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.p = 1; + ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); + } + + /* For the 64-bit case, this will clear EFER.LMA. */ cr0 = ctxt->ops->get_cr(ctxt, 0); if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - cr4 = ctxt->ops->get_cr(ctxt, 4); + + /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ if (cr4 & X86_CR4_PAE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ efer = 0; ctxt->ops->set_msr(ctxt, MSR_EFER, efer); @@ -4455,7 +4474,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), + II(EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c new file mode 100644 index 000000000..a8160d2ae --- /dev/null +++ b/arch/x86/kvm/hyperv.c @@ -0,0 +1,377 @@ +/* + * KVM Microsoft Hyper-V emulation + * + * derived from arch/x86/kvm/x86.c + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> + * Andrey Smetanin <asmetanin@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "x86.h" +#include "lapic.h" +#include "hyperv.h" + +#include <linux/kvm_host.h> +#include <trace/events/kvm.h> + +#include "trace.h" + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + case HV_X64_MSR_REFERENCE_TSC: + case HV_X64_MSR_TIME_REF_COUNT: + case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + r = true; + break; + } + + return r; +} + +static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, + u32 index, u64 *pdata) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + return -EINVAL; + + *pdata = hv->hv_crash_param[index]; + return 0; +} + +static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + *pdata = hv->hv_crash_ctl; + return 0; +} + +static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (host) + hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; + + if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { + + vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", + hv->hv_crash_param[0], + hv->hv_crash_param[1], + hv->hv_crash_param[2], + hv->hv_crash_param[3], + hv->hv_crash_param[4]); + + /* Send notification about crash to user space */ + kvm_make_request(KVM_REQ_HV_CRASH, vcpu); + } + + return 0; +} + +static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, + u32 index, u64 
data) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + return -EINVAL; + + hv->hv_crash_param[index] = data; + return 0; +} + +static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, + bool host) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + hv->hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!hv->hv_guest_os_id) + hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!hv->hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + hv->hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (__copy_to_user((void __user *)addr, instructions, 4)) + return 1; + hv->hv_hypercall = data; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_REFERENCE_TSC: { + u64 gfn; + HV_REFERENCE_TSC_PAGE tsc_ref; + + memset(&tsc_ref, 0, sizeof(tsc_ref)); + hv->hv_tsc_page = data; + if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + break; + gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + if (kvm_write_guest( + kvm, + gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, + &tsc_ref, sizeof(tsc_ref))) + return 1; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: + return kvm_hv_msr_set_crash_data(vcpu, + msr - HV_X64_MSR_CRASH_P0, + data); + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + default: + vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + msr, data); + return 1; + } + return 0; +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + u64 gfn; + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + hv->hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + return 1; + break; + } + gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); + if (kvm_is_error_hva(addr)) + return 1; + if (__clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + hv->hv_vapic = data; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (kvm_lapic_enable_pv_eoi(vcpu, + gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + return 1; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + msr, data); + return 1; + } + + return 0; +} + +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = hv->hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = hv->hv_hypercall; + break; + case HV_X64_MSR_TIME_REF_COUNT: { + data = + div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + break; + } + case HV_X64_MSR_REFERENCE_TSC: + data = hv->hv_tsc_page; + break; + case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: + return kvm_hv_msr_get_crash_data(vcpu, + msr - HV_X64_MSR_CRASH_P0, + pdata); + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + default: + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { + data = r; + break; + } + } + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + case HV_X64_MSR_APIC_ASSIST_PAGE: + data = hv->hv_vapic; + break; + default: + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + +int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + if (kvm_hv_msr_partition_wide(msr)) { + int r; + + mutex_lock(&vcpu->kvm->lock); + r = kvm_hv_set_msr_pw(vcpu, msr, data, host); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return kvm_hv_set_msr(vcpu, msr, data); +} + +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + if (kvm_hv_msr_partition_wide(msr)) { + int r; + + mutex_lock(&vcpu->kvm->lock); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return kvm_hv_get_msr(vcpu, msr, pdata); +} + +bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + + /* + * hypercall 
generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + longmode = is_64_bit_mode(vcpu); + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h new file mode 100644 index 000000000..c7bce559f --- /dev/null +++ b/arch/x86/kvm/hyperv.h @@ -0,0 +1,32 @@ +/* + * KVM Microsoft Hyper-V emulation + * + * derived from arch/x86/kvm/x86.c + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> + * Andrey Smetanin <asmetanin@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef __ARCH_X86_KVM_HYPERV_H__ +#define __ARCH_X86_KVM_HYPERV_H__ + +int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +bool kvm_hv_hypercall_enabled(struct kvm *kvm); +int kvm_hv_hypercall(struct kvm_vcpu *vcpu); + +#endif diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index fef922ff2..7cc2360f1 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -651,15 +651,10 @@ fail_unlock: return NULL; } -void kvm_destroy_pic(struct kvm *kvm) +void kvm_destroy_pic(struct kvm_pic *vpic) { - struct kvm_pic *vpic = kvm->arch.vpic; - - if (vpic) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr); - kvm->arch.vpic = NULL; - kfree(vpic); - } + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ad68c7300..3d782a2c3 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -74,7 +74,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm_pic *vpic); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); @@ -85,11 +85,11 @@ static inline struct kvm_pic 
*pic_irqchip(struct kvm *kvm) static inline int irqchip_in_kernel(struct kvm *kvm) { - int ret; + struct kvm_pic *vpic = pic_irqchip(kvm); - ret = (pic_irqchip(kvm) != NULL); + /* Read vpic before kvm->irq_routing. */ smp_rmb(); - return ret; + return vpic != NULL; } void kvm_pic_reset(struct kvm_kpic_state *s); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2a5ca97c2..ae4483a3e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) struct kvm_lapic *apic = vcpu->arch.apic; __kvm_apic_update_irr(pir, apic->regs); + + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); @@ -1172,7 +1174,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); /* __delay is delay_tsc whenever the hardware has TSC, thus always. 
*/ @@ -1240,7 +1242,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); @@ -1900,8 +1902,9 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) + return; apic_set_tpr(vcpu->arch.apic, data & 0xff); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 719527482..764037991 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -91,7 +91,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { - return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; + return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 82362ad2f..ff606f507 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3268,6 +3268,25 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } +static bool +__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) +{ + int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; + + return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | + ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); +} + +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); 
+} + +static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) +{ + return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); +} + static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) { if (direct) @@ -3276,31 +3295,63 @@ static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) return vcpu_match_mmio_gva(vcpu, addr); } -static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) +/* return true if reserved bit is detected on spte. */ +static bool +walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { struct kvm_shadow_walk_iterator iterator; - u64 spte = 0ull; + u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; + int root, leaf; + bool reserved = false; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return spte; + goto exit; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) + + for (shadow_walk_init(&iterator, vcpu, addr), + leaf = root = iterator.level; + shadow_walk_okay(&iterator); + __shadow_walk_next(&iterator, spte)) { + spte = mmu_spte_get_lockless(iterator.sptep); + + sptes[leaf - 1] = spte; + leaf--; + if (!is_shadow_present_pte(spte)) break; + + reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, + iterator.level); + } + walk_shadow_page_lockless_end(vcpu); - return spte; + if (reserved) { + pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", + __func__, addr); + while (root > leaf) { + pr_err("------ spte 0x%llx level %d.\n", + sptes[root - 1], root); + root--; + } + } +exit: + *sptep = spte; + return reserved; } int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; + bool reserved; if (quickly_check_mmio_pf(vcpu, addr, direct)) return RET_MMIO_PF_EMULATE; - spte = walk_shadow_page_get_mmio_spte(vcpu, addr); + reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + if (unlikely(reserved)) + return RET_MMIO_PF_BUG; if 
(is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); @@ -3559,102 +3610,119 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void +__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, int level, bool nx, bool gbpages, + bool pse, bool amd) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; - context->bad_mt_xwr = 0; + rsvd_check->bad_mt_xwr = 0; - if (!context->nx) + if (!nx) exb_bit_rsvd = rsvd_bits(63, 63); - if (!guest_cpuid_has_gbpages(vcpu)) + if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. */ - if (guest_cpuid_is_amd(vcpu)) + if (amd) nonleaf_bit8_rsvd = rsvd_bits(8, 8); - switch (context->root_level) { + switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ - context->rsvd_bits_mask[0][1] = 0; - context->rsvd_bits_mask[0][0] = 0; - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[0][1] = 0; + rsvd_check->rsvd_bits_mask[0][0] = 0; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; - if (!is_pse(vcpu)) { - context->rsvd_bits_mask[1][1] = 0; + if (!pse) { + rsvd_check->rsvd_bits_mask[1][1] = 0; break; } if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | rsvd_bits(5, 8) | rsvd_bits(1, 2); /* 
PDPTE */ - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PTE */ - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_LEVEL: - context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | + nonleaf_bit8_rsvd | gbpages_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][3] = + rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; } } -static void 
reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), context->root_level, + context->nx, guest_cpuid_has_gbpages(vcpu), + is_pse(vcpu), guest_cpuid_is_amd(vcpu)); +} + +static void +__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, bool execonly) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); int pte; - context->rsvd_bits_mask[0][3] = + rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][1] = + rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); /* large page */ - context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = + rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); - context->rsvd_bits_mask[1][1] = + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; for (pte = 0; pte < 64; pte++) { int rwx_bits = pte & 7; @@ -3662,10 +3730,75 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, if (mt == 0x2 || mt == 0x3 || mt == 0x7 || rwx_bits == 0x2 || rwx_bits == 0x6 || (rwx_bits == 0x4 && !execonly)) - context->bad_mt_xwr |= (1ull << pte); + rsvd_check->bad_mt_xwr |= (1ull << pte); } } +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + 
__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), execonly); +} + +/* + * the page table on host is the shadow page table for the page + * table in guest or amd nested guest, its mmu features completely + * follow the features in guest. + */ +void +reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +{ + /* + * Passing "true" to the last argument is okay; it adds a check + * on bit 8 of the SPTEs which KVM doesn't use anyway. + */ + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, context->nx, + guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), + true); +} +EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); + +static inline bool boot_cpu_is_amd(void) +{ + WARN_ON_ONCE(!tdp_enabled); + return shadow_x_mask == 0; +} + +/* + * the direct page table on host, use as much mmu features as + * possible, however, kvm currently does not do execution-protection. + */ +static void +reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + if (boot_cpu_is_amd()) + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, false, + cpu_has_gbpages, true, true); + else + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + false); + +} + +/* + * as the comments in reset_shadow_zero_bits_mask() except it + * is the shadow page table for intel nested guest. 
+ */ +static void +reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, execonly); +} + static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { @@ -3844,6 +3977,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); + reset_tdp_shadow_zero_bits_mask(vcpu, context); } void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) @@ -3871,6 +4005,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu); context->base_role.smm = is_smm(vcpu); + reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -3894,6 +4029,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) update_permission_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); + reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -4815,28 +4951,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) return nr_mmu_pages; } -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) -{ - struct kvm_shadow_walk_iterator iterator; - u64 spte; - int nr_sptes = 0; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return nr_sptes; - - walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { - sptes[iterator.level-1] = spte; - nr_sptes++; - if (!is_shadow_present_pte(spte)) - break; - } - walk_shadow_page_lockless_end(vcpu); - - return nr_sptes; -} -EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); - void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 398d21c0f..e4202e41d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ 
-50,9 +50,11 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +void +reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); + /* * Return values of handle_mmio_page_fault_common: * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0f67d7e24..736e6ab87 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -128,14 +128,6 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) *access &= mask; } -static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; - - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | - ((mmu->bad_mt_xwr & (1ull << low6)) != 0); -} - static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT @@ -172,7 +164,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; if (!FNAME(is_present_gpte)(gpte)) @@ -353,8 +345,7 @@ retry_walk: if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, - walker->level))) { + if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 886aa25a7..39b91127e 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -133,8 +133,6 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_K7_PERFCTRn */ pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); if (pmc) { - if 
(!msr_info->host_initiated) - data = (s64)data; pmc->counter += data - pmc_read_counter(pmc); return 0; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2d32b67a1..d7f89387b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -202,6 +202,7 @@ module_param(npt, int, S_IRUGO); static int nested = true; module_param(nested, int, S_IRUGO); +static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -1080,12 +1081,12 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = svm_scale_tsc(vcpu, native_read_tsc()); + tsc = svm_scale_tsc(vcpu, rdtsc()); return target_tsc - tsc; } -static void init_vmcb(struct vcpu_svm *svm, bool init_event) +static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1106,6 +1107,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); set_exception_intercept(svm, MC_VECTOR); + set_exception_intercept(svm, AC_VECTOR); set_intercept(svm, INTERCEPT_INTR); set_intercept(svm, INTERCEPT_NMI); @@ -1156,8 +1158,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - if (!init_event) - svm_set_efer(&svm->vcpu, 0); + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; @@ -1167,7 +1168,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. * It also updates the guest-visible cr0 value. 
*/ - (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); + svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); kvm_mmu_reset_context(&svm->vcpu); save->cr4 = X86_CR4_PAE; @@ -1211,7 +1212,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; } - init_vmcb(svm, init_event); + init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1267,7 +1268,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm, false); + init_vmcb(svm); svm_init_osvw(&svm->vcpu); @@ -1795,6 +1796,12 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } +static int ac_interception(struct vcpu_svm *svm) +{ + kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0); + return 1; +} + static void svm_fpu_activate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1889,7 +1896,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. 
*/ clear_page(svm->vmcb); - init_vmcb(svm, false); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -2014,6 +2021,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); + reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } @@ -3080,7 +3088,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_TSC: { msr_info->data = svm->vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); + svm_scale_tsc(vcpu, rdtsc()); break; } @@ -3369,6 +3377,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, + [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index aa9e82295..343d3692d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE /* * This bitmap is used to indicate whether the vmclear * operation is enabled on all cpus. 
All disabled by @@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void) #else static inline void crash_enable_local_vmclear(int cpu) { } static inline void crash_disable_local_vmclear(int cpu) { } -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) { @@ -1567,7 +1567,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR); + (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void) { u64 host_tsc, tsc_offset; - rdtscll(host_tsc); + host_tsc = rdtsc(); tsc_offset = vmcs_read64(TSC_OFFSET); return host_tsc + tsc_offset; } @@ -2317,7 +2317,7 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { - return target_tsc - native_read_tsc(); + return target_tsc - rdtsc(); } static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) @@ -2443,10 +2443,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | - CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | - CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | + CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the * hardware. 
For example, L1 can specify an MSR bitmap - and we @@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); + pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); @@ -3423,12 +3423,12 @@ static void enter_lmode(struct kvm_vcpu *vcpu) vmx_segment_cache_clear(to_vmx(vcpu)); guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); - if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { + if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { pr_debug_ratelimited("%s: tss fixup for long mode. \n", __func__); vmcs_write32(GUEST_TR_AR_BYTES, - (guest_tr_ar & ~AR_TYPE_MASK) - | AR_TYPE_BUSY_64_TSS); + (guest_tr_ar & ~VMX_AR_TYPE_MASK) + | VMX_AR_TYPE_BUSY_64_TSS); } vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); } @@ -3719,7 +3719,7 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) return 0; else { int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); - return AR_DPL(ar); + return VMX_AR_DPL(ar); } } @@ -3847,11 +3847,11 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) if (cs.unusable) return false; - if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) return false; if (!cs.s) return false; - if (cs.type & AR_TYPE_WRITEABLE_MASK) { + if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) return false; } else { @@ -3901,7 +3901,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) return false; if (!var.present) return false; - if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { if (var.dpl < rpl) /* DPL < RPL */ return false; } @@ -4105,17 +4105,13 @@ static void seg_setup(int seg) static int alloc_apic_access_page(struct kvm *kvm) { struct page *page; - struct kvm_userspace_memory_region 
kvm_userspace_mem; int r = 0; mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page_done) goto out; - kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __x86_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); if (r) goto out; @@ -4140,17 +4136,12 @@ static int alloc_identity_pagetable(struct kvm *kvm) { /* Called with kvm->slots_lock held. */ - struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; BUG_ON(kvm->arch.ept_identity_pagetable_done); - kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = - kvm->arch.ept_identity_map_addr; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __x86_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm->arch.ept_identity_map_addr, PAGE_SIZE); return r; } @@ -4780,8 +4771,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; vmx_set_cr4(vcpu, 0); - if (!init_event) - vmx_set_efer(vcpu, 0); + vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); @@ -4949,14 +4939,9 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; - struct kvm_userspace_memory_region tss_mem = { - .slot = TSS_PRIVATE_MEMSLOT, - .guest_phys_addr = addr, - .memory_size = PAGE_SIZE * 3, - .flags = 0, - }; - ret = x86_set_memory_region(kvm, &tss_mem); + ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, + PAGE_SIZE * 3); if (ret) return ret; kvm->arch.tss_addr = addr; @@ -5118,6 +5103,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) return 
handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { + case AC_VECTOR: + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & @@ -5759,73 +5747,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } -static u64 ept_rsvd_mask(u64 spte, int level) -{ - int i; - u64 mask = 0; - - for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) - mask |= (1ULL << i); - - if (level == 4) - /* bits 7:3 reserved */ - mask |= 0xf8; - else if (spte & (1ULL << 7)) - /* - * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, - * level == 1 if the hypervisor is using the ignored bit 7. - */ - mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; - else if (level > 1) - /* bits 6:3 reserved */ - mask |= 0x78; - - return mask; -} - -static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, - int level) -{ - printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); - - /* 010b (write-only) */ - WARN_ON((spte & 0x7) == 0x2); - - /* 110b (write/execute) */ - WARN_ON((spte & 0x7) == 0x6); - - /* 100b (execute-only) and value not supported by logical processor */ - if (!cpu_has_vmx_ept_execute_only()) - WARN_ON((spte & 0x7) == 0x4); - - /* not 000b */ - if ((spte & 0x7)) { - u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); - - if (rsvd_bits != 0) { - printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", - __func__, rsvd_bits); - WARN_ON(1); - } - - /* bits 5:3 are _not_ reserved for large page or leaf page */ - if ((rsvd_bits & 0x38) == 0) { - u64 ept_mem_type = (spte & 0x38) >> 3; - - if (ept_mem_type == 2 || ept_mem_type == 3 || - ept_mem_type == 7) { - printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", - __func__, ept_mem_type); - WARN_ON(1); - } - } - } -} - static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { - u64 sptes[4]; - int nr_sptes, i, ret; + int ret; gpa_t gpa; gpa = 
vmcs_read64(GUEST_PHYSICAL_ADDRESS); @@ -5846,13 +5770,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) return 1; /* It is the real ept misconfig */ - printk(KERN_ERR "EPT: Misconfiguration.\n"); - printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); - - nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); - - for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) - ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); + WARN_ON(1); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; @@ -6248,6 +6166,11 @@ static int handle_mwait(struct kvm_vcpu *vcpu) return handle_nop(vcpu); } +static int handle_monitor_trap(struct kvm_vcpu *vcpu) +{ + return 1; +} + static int handle_monitor(struct kvm_vcpu *vcpu) { printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); @@ -6410,8 +6333,12 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) */ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, - u32 vmx_instruction_info, gva_t *ret) + u32 vmx_instruction_info, bool wr, gva_t *ret) { + gva_t off; + bool exn; + struct kvm_segment s; + /* * According to Vol. 
3B, "Information for VM Exits Due to Instruction * Execution", on an exit, vmx_instruction_info holds most of the @@ -6436,22 +6363,63 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); + off = exit_qualification; /* holds the displacement */ if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); + off += kvm_register_read(vcpu, base_reg); if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)<<scaling; - *ret += exit_qualification; /* holds the displacement */ + off += kvm_register_read(vcpu, index_reg)<<scaling; + vmx_get_segment(vcpu, &s, seg_reg); + *ret = s.base + off; if (addr_size == 1) /* 32 bit */ *ret &= 0xffffffff; - /* - * TODO: throw #GP (and return 1) in various cases that the VM* - * instructions require it - e.g., offset beyond segment limit, - * unusable or unreadable/unwritable segment, non-canonical 64-bit - * address, and so on. Currently these are not checked. - */ + /* Checks for #GP/#SS exceptions. */ + exn = false; + if (is_protmode(vcpu)) { + /* Protected mode: apply checks for segment validity in the + * following order: + * - segment type check (#GP(0) may be thrown) + * - usability check (#GP(0)/#SS(0)) + * - limit check (#GP(0)/#SS(0)) + */ + if (wr) + /* #GP(0) if the destination operand is located in a + * read-only data segment or any code segment. + */ + exn = ((s.type & 0xa) == 0 || (s.type & 8)); + else + /* #GP(0) if the source operand is located in an + * execute-only code segment + */ + exn = ((s.type & 0xa) == 8); + } + if (exn) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + if (is_long_mode(vcpu)) { + /* Long mode: #GP(0)/#SS(0) if the memory address is in a + * non-canonical form. This is an only check for long mode. 
+ */ + exn = is_noncanonical_address(*ret); + } else if (is_protmode(vcpu)) { + /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. + */ + exn = (s.unusable != 0); + /* Protected mode: #GP(0)/#SS(0) if the memory + * operand is outside the segment limit. + */ + exn = exn || (off + sizeof(u64) > s.limit); + } + if (exn) { + kvm_queue_exception_e(vcpu, + seg_reg == VCPU_SREG_SS ? + SS_VECTOR : GP_VECTOR, + 0); + return 1; + } + return 0; } @@ -6473,7 +6441,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, int maxphyaddr = cpuid_maxphyaddr(vcpu); if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, @@ -7001,7 +6969,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) field_value); } else { if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) + vmx_instruction_info, true, &gva)) return 1; /* _system ok, as nested_vmx_check_permission verified cpl=0 */ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, @@ -7038,7 +7006,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) (((vmx_instruction_info) >> 3) & 0xf)); else { if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) + vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &field_value, (is_64_bit_mode(vcpu) ? 
8 : 4), &e)) { @@ -7130,7 +7098,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &vmcs_gva)) + vmx_instruction_info, true, &vmcs_gva)) return 1; /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, @@ -7186,7 +7154,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, &gva)) + vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, sizeof(operand), &e)) { @@ -7284,6 +7252,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, [EXIT_REASON_INVVPID] = handle_invvpid, @@ -7544,6 +7513,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return true; case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); + case EXIT_REASON_MONITOR_TRAP_FLAG: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: @@ -10435,7 +10406,7 @@ static int __init vmx_init(void) if (r) return r; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); #endif @@ -10445,7 +10416,7 @@ static int __init vmx_init(void) static void __exit vmx_exit(void) { -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); 
synchronize_rcu(); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 373328b71..43609af03 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -29,6 +29,7 @@ #include "cpuid.h" #include "assigned-dev.h" #include "pmu.h" +#include "hyperv.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -148,6 +149,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "nmi_window", VCPU_STAT(nmi_window_exits) }, { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, + { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, @@ -221,11 +223,9 @@ static void shared_msr_update(unsigned slot, u32 msr) void kvm_define_shared_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_NR_SHARED_MSRS); + shared_msrs_global.msrs[slot] = msr; if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot] = msr; - /* we need ensured the shared_msr_global have been updated */ - smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); @@ -526,7 +526,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if (is_present_gpte(pdpte[i]) && - (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { + (pdpte[i] & + vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; goto out; } @@ -621,7 +622,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); - if ((cr0 ^ old_cr0) & X86_CR0_CD) + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); return 0; @@ -949,6 +952,8 @@ static u32 emulated_msrs[] = { MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, 
HV_X64_MSR_HYPERCALL, HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -1217,11 +1222,6 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, __func__, base_khz, scaled_khz, shift, *pmultiplier); } -static inline u64 get_kernel_ns(void) -{ - return ktime_get_boot_ns(); -} - #ifdef CONFIG_X86_64 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif @@ -1444,20 +1444,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static cycle_t read_tsc(void) { - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. 
- */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); - - last = pvclock_gtod_data.clock.cycle_last; + cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 last = pvclock_gtod_data.clock.cycle_last; if (likely(ret >= last)) return ret; @@ -1646,7 +1634,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 1; } if (!use_master_clock) { - host_tsc = native_read_tsc(); + host_tsc = rdtsc(); kernel_ns = get_kernel_ns(); } @@ -1722,8 +1710,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->pvclock_set_guest_stopped_request = false; } - pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO; - /* If the host uses TSC clocksource, then it is stable */ if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; @@ -1869,123 +1855,6 @@ out: return r; } -static bool kvm_hv_hypercall_enabled(struct kvm *kvm) -{ - return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; -} - -static bool kvm_hv_msr_partition_wide(u32 msr) -{ - bool r = false; - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - case HV_X64_MSR_HYPERCALL: - case HV_X64_MSR_REFERENCE_TSC: - case HV_X64_MSR_TIME_REF_COUNT: - r = true; - break; - } - - return r; -} - -static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - kvm->arch.hv_guest_os_id = data; - /* setting guest os id to zero disables hypercall page */ - if (!kvm->arch.hv_guest_os_id) - kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; - break; - case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; - - /* if guest os id is not set hypercall should remain disabled */ - if (!kvm->arch.hv_guest_os_id) - break; - if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { - kvm->arch.hv_hypercall = data; - break; - } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops->patch_hypercall(vcpu, instructions); - ((unsigned char 
*)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) - return 1; - kvm->arch.hv_hypercall = data; - mark_page_dirty(kvm, gfn); - break; - } - case HV_X64_MSR_REFERENCE_TSC: { - u64 gfn; - HV_REFERENCE_TSC_PAGE tsc_ref; - memset(&tsc_ref, 0, sizeof(tsc_ref)); - kvm->arch.hv_tsc_page = data; - if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - break; - gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, - &tsc_ref, sizeof(tsc_ref))) - return 1; - mark_page_dirty(kvm, gfn); - break; - } - default: - vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - return 0; -} - -static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - switch (msr) { - case HV_X64_MSR_APIC_ASSIST_PAGE: { - u64 gfn; - unsigned long addr; - - if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { - vcpu->arch.hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0)) - return 1; - break; - } - gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; - addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); - if (kvm_is_error_hva(addr)) - return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) - return 1; - vcpu->arch.hv_vapic = data; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) - return 1; - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); - default: - vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - - return 0; -} - static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; @@ -2138,8 +2007,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 
&vcpu->requests); ka->boot_vcpu_runs_old_kvmclock = tmp; - - ka->kvmclock_offset = -get_kernel_ns(); } vcpu->arch.time = data; @@ -2224,15 +2091,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = set_msr_hyperv_pw(vcpu, msr, data); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return set_msr_hyperv(vcpu, msr, data); - break; + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_set_msr_common(vcpu, msr, data, + msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. @@ -2315,68 +2177,6 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - data = kvm->arch.hv_guest_os_id; - break; - case HV_X64_MSR_HYPERCALL: - data = kvm->arch.hv_hypercall; - break; - case HV_X64_MSR_TIME_REF_COUNT: { - data = - div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); - break; - } - case HV_X64_MSR_REFERENCE_TSC: - data = kvm->arch.hv_tsc_page; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - - *pdata = data; - return 0; -} - -static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - - switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); - 
case HV_X64_MSR_APIC_ASSIST_PAGE: - data = vcpu->arch.hv_vapic; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - *pdata = data; - return 0; -} - int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { switch (msr_info->index) { @@ -2495,14 +2295,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr_info->index)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data); + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_get_msr_common(vcpu, + msr_info->index, &msr_info->data); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current @@ -2653,6 +2449,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: + case KVM_CAP_SET_BOOT_CPU_ID: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2812,7 +2609,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 
0 : - native_read_tsc() - vcpu->arch.last_host_tsc; + rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { @@ -2840,7 +2637,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = native_read_tsc(); + vcpu->arch.last_host_tsc = rdtsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, @@ -3819,30 +3616,25 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) { mutex_lock(&kvm->slots_lock); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_eclr); + kvm_destroy_pic(vpic); mutex_unlock(&kvm->slots_lock); - kfree(vpic); goto create_irqchip_unlock; } } else goto create_irqchip_unlock; - smp_wmb(); - kvm->arch.vpic = vpic; - smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(kvm); + kvm_destroy_pic(vpic); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); + goto create_irqchip_unlock; } + /* Write kvm->irq_routing before kvm->arch.vpic. 
*/ + smp_wmb(); + kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -3969,6 +3761,15 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_reinject(kvm, &control); break; } + case KVM_SET_BOOT_CPU_ID: + r = 0; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) != 0) + r = -EBUSY; + else + kvm->arch.bsp_vcpu_id = arg; + mutex_unlock(&kvm->lock); + break; case KVM_XEN_HVM_CONFIG: { r = -EFAULT; if (copy_from_user(&kvm->arch.xen_hvm_config, argp, @@ -4260,6 +4061,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } +static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); + + return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; +} + int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -4995,6 +4805,7 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, + .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, @@ -5884,66 +5695,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); -int kvm_hv_hypercall(struct kvm_vcpu *vcpu) -{ - u64 param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; - bool fast, longmode; - - /* - * hypercall generates UD from non zero cpl and real mode - * per HYPER-V spec - */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - longmode = is_64_bit_mode(vcpu); - - if (!longmode) { - param = 
((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); - } -#ifdef CONFIG_X86_64 - else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); - } -#endif - - code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; - - trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - - switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu); - break; - default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - - ret = res | (((u64)rep_done & 0xfff) << 32); - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); - } - - return 1; -} - /* * kvm_pv_kick_cpu_op: Kick a vcpu. 
* @@ -6203,6 +5954,7 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg)); } +#ifdef CONFIG_X86_64 static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) { struct kvm_segment seg; @@ -6218,6 +5970,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) put_smstate(u32, buf, offset + 4, seg.limit); put_smstate(u64, buf, offset + 8, seg.base); } +#endif static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) { @@ -6520,6 +6273,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); + if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -6629,7 +6388,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) hw_breakpoint_restore(); vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, - native_read_tsc()); + rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -7443,7 +7202,7 @@ int kvm_arch_hardware_enable(void) if (ret != 0) return ret; - local_tsc = native_read_tsc(); + local_tsc = rdtsc(); stable = !check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { @@ -7547,6 +7306,17 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); + +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; +} + bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != 
NULL); @@ -7721,34 +7491,66 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } -int __x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; + unsigned long hva; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *slot, old; /* Called with kvm->slots_lock held. */ - BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM); + if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) + return -EINVAL; + + slot = id_to_memslot(slots, id); + if (size) { + if (WARN_ON(slot->npages)) + return -EEXIST; + /* + * MAP_SHARED to prevent internal slot pages from being moved + * by fork()/COW. + */ + hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0); + if (IS_ERR((void *)hva)) + return PTR_ERR((void *)hva); + } else { + if (!slot->npages) + return 0; + + hva = 0; + } + + old = *slot; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m = *mem; + struct kvm_userspace_memory_region m; - m.slot |= i << 16; + m.slot = id | (i << 16); + m.flags = 0; + m.guest_phys_addr = gpa; + m.userspace_addr = hva; + m.memory_size = size; r = __kvm_set_memory_region(kvm, &m); if (r < 0) return r; } + if (!size) { + r = vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); + WARN_ON(r < 0); + } + return 0; } EXPORT_SYMBOL_GPL(__x86_set_memory_region); -int x86_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) +int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int r; mutex_lock(&kvm->slots_lock); - r = __x86_set_memory_region(kvm, mem); + r = __x86_set_memory_region(kvm, id, gpa, size); mutex_unlock(&kvm->slots_lock); return r; @@ -7763,16 +7565,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) * unless the the memory map has changed due to process exit * or fd copying. 
*/ - struct kvm_userspace_memory_region mem; - memset(&mem, 0, sizeof(mem)); - mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - x86_set_memory_region(kvm, &mem); - - mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - x86_set_memory_region(kvm, &mem); - - mem.slot = TSS_PRIVATE_MEMSLOT; - x86_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); + x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); + x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); } kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); @@ -7875,27 +7670,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { - /* - * Only private memory slots need to be mapped here since - * KVM_SET_MEMORY_REGION ioctl is no longer supported. - */ - if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) { - unsigned long userspace_addr; - - /* - * MAP_SHARED to prevent internal slot pages from being moved - * by fork()/COW. 
- */ - userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, 0); - - if (IS_ERR((void *)userspace_addr)) - return PTR_ERR((void *)userspace_addr); - - memslot->userspace_addr = userspace_addr; - } - return 0; } @@ -7957,17 +7731,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, { int nr_mmu_pages = 0; - if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) { - int ret; - - ret = vm_munmap(old->userspace_addr, - old->npages * PAGE_SIZE); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - if (!kvm->arch.n_requested_mmu_pages) nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 0ca2f3e48..2f822cd88 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,11 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, return kvm_register_write(vcpu, reg, val); } +static inline u64 get_kernel_ns(void) +{ + return ktime_get_boot_ns(); +} + static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) { return !(kvm->arch.disabled_quirks & quirk); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f2dc08c00..a0d09f6c6 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -835,16 +835,46 @@ static struct irq_chip lguest_irq_controller = { .irq_unmask = enable_lguest_irq, }; +/* + * Interrupt descriptors are allocated as-needed, but low-numbered ones are + * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it + * tells us the irq is already used: other errors (ie. ENOMEM) we take + * seriously. + */ +static int lguest_setup_irq(unsigned int irq) +{ + struct irq_desc *desc; + int err; + + /* Returns -ve error or vector number. 
*/ + err = irq_alloc_desc_at(irq, 0); + if (err < 0 && err != -EEXIST) + return err; + + /* + * Tell the Linux infrastructure that the interrupt is + * controlled by our level-based lguest interrupt controller. + */ + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, + handle_level_irq, "level"); + + /* Some systems map "vectors" to interrupts weirdly. Not us! */ + desc = irq_to_desc(irq); + __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); + return 0; +} + static int lguest_enable_irq(struct pci_dev *dev) { + int err; u8 line = 0; /* We literally use the PCI interrupt line as the irq number. */ pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - irq_set_chip_and_handler_name(line, &lguest_irq_controller, - handle_level_irq, "level"); - dev->irq = line; - return 0; + err = lguest_setup_irq(line); + if (!err) + dev->irq = line; + return err; } /* We don't do hotplug PCI, so this shouldn't be called. */ @@ -855,17 +885,13 @@ static void lguest_disable_irq(struct pci_dev *dev) /* * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls), and then tells the - * Linux infrastructure that each interrupt is controlled by our level-based - * lguest interrupt controller. + * interrupt (except 128, which is used for system calls). */ static void __init lguest_init_IRQ(void) { unsigned int i; for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR)); @@ -879,26 +905,6 @@ static void __init lguest_init_IRQ(void) } /* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. 
ENOMEM) we take - * seriously. - */ -int lguest_setup_irq(unsigned int irq) -{ - int err; - - /* Returns -ve error or vector number. */ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - return 0; -} - -/* * Time. * * It would be far better for everyone if the Guest had its own clock, but @@ -985,23 +991,11 @@ static int lguest_clockevent_set_next_event(unsigned long delta, return 0; } -static void lguest_clockevent_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int lguest_clockevent_shutdown(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - break; - case CLOCK_EVT_MODE_ONESHOT: - /* This is what we expect. */ - break; - case CLOCK_EVT_MODE_PERIODIC: - BUG(); - case CLOCK_EVT_MODE_RESUME: - break; - } + /* A 0 argument shuts the clock down. */ + hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); + return 0; } /* This describes our primitive timer chip. */ @@ -1009,7 +1003,7 @@ static struct clock_event_device lguest_clockevent = { .name = "lguest", .features = CLOCK_EVT_FEAT_ONESHOT, .set_next_event = lguest_clockevent_set_next_event, - .set_mode = lguest_clockevent_set_mode, + .set_state_shutdown = lguest_clockevent_shutdown, .rating = INT_MAX, .mult = 1, .shift = 0, @@ -1021,7 +1015,7 @@ static struct clock_event_device lguest_clockevent = { * This is the Guest timer interrupt handler (hardware interrupt 0). We just * call the clockevent infrastructure and it does whatever needs doing. 
*/ -static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) +static void lguest_time_irq(struct irq_desc *desc) { unsigned long flags; @@ -1040,7 +1034,8 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ - lguest_setup_irq(0); + if (lguest_setup_irq(0) != 0) + panic("Could not set up timer irq"); irq_set_handler(0, lguest_time_irq); clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 39d6a3db0..e912b2f6d 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -20,6 +20,7 @@ #include <asm/processor.h> #include <asm/delay.h> #include <asm/timer.h> +#include <asm/mwait.h> #ifdef CONFIG_SMP # include <asm/smp.h> @@ -49,16 +50,14 @@ static void delay_loop(unsigned long loops) /* TSC based delay: */ static void delay_tsc(unsigned long __loops) { - u32 bclock, now, loops = __loops; + u64 bclock, now, loops = __loops; int cpu; preempt_disable(); cpu = smp_processor_id(); - rdtsc_barrier(); - rdtscl(bclock); + bclock = rdtsc_ordered(); for (;;) { - rdtsc_barrier(); - rdtscl(now); + now = rdtsc_ordered(); if ((now - bclock) >= loops) break; @@ -79,14 +78,51 @@ static void delay_tsc(unsigned long __loops) if (unlikely(cpu != smp_processor_id())) { loops -= (now - bclock); cpu = smp_processor_id(); - rdtsc_barrier(); - rdtscl(bclock); + bclock = rdtsc_ordered(); } } preempt_enable(); } /* + * On some AMD platforms, MWAITX has a configurable 32-bit timer, that + * counts with TSC frequency. The input value is the loop of the + * counter, it will exit when the timer expires. 
+ */ +static void delay_mwaitx(unsigned long __loops) +{ + u64 start, end, delay, loops = __loops; + + start = rdtsc_ordered(); + + for (;;) { + delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + + /* + * Use cpu_tss as a cacheline-aligned, seldomly + * accessed per-cpu variable as the monitor target. + */ + __monitorx(this_cpu_ptr(&cpu_tss), 0, 0); + + /* + * AMD, like Intel, supports the EAX hint and EAX=0xf + * means, do not enter any deep C-state and we use it + * here in delay() to minimize wakeup latency. + */ + __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); + + end = rdtsc_ordered(); + + if (loops <= end - start) + break; + + loops -= end - start; + + start = end; + } +} + +/* * Since we calibrate only once at boot, this * function should be set once at boot and not changed */ @@ -94,13 +130,19 @@ static void (*delay_fn)(unsigned long) = delay_loop; void use_tsc_delay(void) { - delay_fn = delay_tsc; + if (delay_fn == delay_loop) + delay_fn = delay_tsc; +} + +void use_mwaitx_delay(void) +{ + delay_fn = delay_mwaitx; } int read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { - rdtscll(*timer_val); + *timer_val = rdtsc(); return 0; } return -1; diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 8300db71c..8db26591d 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -20,6 +20,7 @@ #include <linux/stddef.h> #include <asm/uaccess.h> +#include <asm/vm86.h> #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 462f3e959..3b544e4c1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -21,6 +21,7 @@ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... 
*/ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ +#include <asm/vm86.h> /* struct vm86 */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -302,14 +303,16 @@ static inline void check_v8086_mode(struct pt_regs *regs, unsigned long address, struct task_struct *tsk) { +#ifdef CONFIG_VM86 unsigned long bit; - if (!v8086_mode(regs)) + if (!v8086_mode(regs) || !tsk->thread.vm86) return; bit = (address - 0xA0000) >> PAGE_SHIFT; if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; + tsk->thread.vm86->screen_bitmap |= 1 << bit; +#endif } static bool low_pfn(unsigned long pfn) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2f1810c3a..266ee8aea 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -30,8 +30,11 @@ /* * Tables translating between page_cache_type_t and pte encoding. * - * Minimal supported modes are defined statically, they are modified - * during bootup if more supported cache modes are available. + * The default values are defined statically as minimal supported mode; + * WC and WT fall back to UC-. pat_init() updates these values to support + * more cache modes, WC and WT, when it is safe to do so. See pat_init() + * for the details. Note, __early_ioremap() used during early boot-time + * takes pgprot_t (pte encoding) and does not use these tables. * * Index into __cachemode2pte_tbl[] is the cachemode. 
* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 68aec4254..7562f4291 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,11 +823,11 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata = NODE_DATA(nid); struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM); + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f9977a7a9..df48430c2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL); + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index e1840f3db..9ce5da27b 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -12,20 +12,6 @@ extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_X_MAX]; -static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; -static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; -static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; - -/* - * This page used as early shadow. 
We don't use empty_zero_page - * at early stages, stack instrumentation could write some garbage - * to this page. - * Latter we reuse it as zero shadow for large ranges of memory - * that allowed to access, but not instrumented by kasan - * (vmalloc/vmemmap ...). - */ -static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; - static int __init map_range(struct range *range) { unsigned long start; @@ -62,106 +48,6 @@ static void __init kasan_map_early_shadow(pgd_t *pgd) } } -static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr, - unsigned long end) -{ - pte_t *pte = pte_offset_kernel(pmd, addr); - - while (addr + PAGE_SIZE <= end) { - WARN_ON(!pte_none(*pte)); - set_pte(pte, __pte(__pa_nodebug(kasan_zero_page) - | __PAGE_KERNEL_RO)); - addr += PAGE_SIZE; - pte = pte_offset_kernel(pmd, addr); - } - return 0; -} - -static int __init zero_pmd_populate(pud_t *pud, unsigned long addr, - unsigned long end) -{ - int ret = 0; - pmd_t *pmd = pmd_offset(pud, addr); - - while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) { - WARN_ON(!pmd_none(*pmd)); - set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte) - | _KERNPG_TABLE)); - addr += PMD_SIZE; - pmd = pmd_offset(pud, addr); - } - if (addr < end) { - if (pmd_none(*pmd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); - if (!p) - return -ENOMEM; - set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE)); - } - ret = zero_pte_populate(pmd, addr, end); - } - return ret; -} - - -static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr, - unsigned long end) -{ - int ret = 0; - pud_t *pud = pud_offset(pgd, addr); - - while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) { - WARN_ON(!pud_none(*pud)); - set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd) - | _KERNPG_TABLE)); - addr += PUD_SIZE; - pud = pud_offset(pgd, addr); - } - - if (addr < end) { - if (pud_none(*pud)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); - if (!p) - return -ENOMEM; - set_pud(pud, 
__pud(__pa_nodebug(p) | _KERNPG_TABLE)); - } - ret = zero_pmd_populate(pud, addr, end); - } - return ret; -} - -static int __init zero_pgd_populate(unsigned long addr, unsigned long end) -{ - int ret = 0; - pgd_t *pgd = pgd_offset_k(addr); - - while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) { - WARN_ON(!pgd_none(*pgd)); - set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud) - | _KERNPG_TABLE)); - addr += PGDIR_SIZE; - pgd = pgd_offset_k(addr); - } - - if (addr < end) { - if (pgd_none(*pgd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); - if (!p) - return -ENOMEM; - set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE)); - } - ret = zero_pud_populate(pgd, addr, end); - } - return ret; -} - - -static void __init populate_zero_shadow(const void *start, const void *end) -{ - if (zero_pgd_populate((unsigned long)start, (unsigned long)end)) - panic("kasan: unable to map zero shadow!"); -} - - #ifdef CONFIG_KASAN_INLINE static int kasan_die_handler(struct notifier_block *self, unsigned long val, @@ -213,7 +99,7 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); - populate_zero_shadow((void *)KASAN_SHADOW_START, + kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, kasan_mem_to_shadow((void *)PAGE_OFFSET)); for (i = 0; i < E820_X_MAX; i++) { @@ -223,14 +109,15 @@ void __init kasan_init(void) if (map_range(&pfn_mapped[i])) panic("kasan: unable to allocate shadow!"); } - populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), - kasan_mem_to_shadow((void *)__START_KERNEL_map)); + kasan_populate_zero_shadow( + kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + kasan_mem_to_shadow((void *)__START_KERNEL_map)); vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), (unsigned long)kasan_mem_to_shadow(_end), NUMA_NO_NODE); - populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); 
memset(kasan_zero_page, 0, PAGE_SIZE); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index db1b0bc50..71fc79a58 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) */ static unsigned long mpx_mmap(unsigned long len) { - unsigned long ret; - unsigned long addr, pgoff; struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; - struct vm_area_struct *vma; + unsigned long addr, populate; /* Only bounds table can be allocated here */ if (len != mpx_bt_size_bytes(mm)) return -EINVAL; down_write(&mm->mmap_sem); - - /* Too many mappings? */ - if (mm->map_count > sysctl_max_map_count) { - ret = -ENOMEM; - goto out; - } - - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. - */ - addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); - if (addr & ~PAGE_MASK) { - ret = addr; - goto out; - } - - vm_flags = VM_READ | VM_WRITE | VM_MPX | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - - /* Set pgoff according to addr for anon_vma */ - pgoff = addr >> PAGE_SHIFT; - - ret = mmap_region(NULL, addr, len, vm_flags, pgoff); - if (IS_ERR_VALUE(ret)) - goto out; - - vma = find_vma(mm, ret); - if (!vma) { - ret = -ENOMEM; - goto out; - } - - if (vm_flags & VM_LOCKED) { - up_write(&mm->mmap_sem); - mm_populate(ret, len); - return ret; - } - -out: + addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); up_write(&mm->mmap_sem); - return ret; + if (populate) + mm_populate(addr, populate); + + return addr; } enum reg_type { @@ -622,6 +585,29 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, } /* + * We only want to do a 4-byte get_user() on 32-bit. Otherwise, + * we might run off the end of the bounds table if we are on + * a 64-bit kernel and try to get 8 bytes. 
+ */ +int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, + long __user *bd_entry_ptr) +{ + u32 bd_entry_32; + int ret; + + if (is_64bit_mm(mm)) + return get_user(*bd_entry_ret, bd_entry_ptr); + + /* + * Note that get_user() uses the type of the *pointer* to + * establish the size of the get, not the destination. + */ + ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr); + *bd_entry_ret = bd_entry_32; + return ret; +} + +/* * Get the base of bounds tables pointed by specific bounds * directory entry. */ @@ -641,7 +627,7 @@ static int get_bt_addr(struct mm_struct *mm, int need_write = 0; pagefault_disable(); - ret = get_user(bd_entry, bd_entry_ptr); + ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr); pagefault_enable(); if (!ret) break; @@ -736,11 +722,23 @@ static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm, */ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) { - unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits); - if (is_64bit_mm(mm)) - return virt_space / MPX_BD_NR_ENTRIES_64; - else - return virt_space / MPX_BD_NR_ENTRIES_32; + unsigned long long virt_space; + unsigned long long GB = (1ULL << 30); + + /* + * This covers 32-bit emulation as well as 32-bit kernels + * running on 64-bit hardware. + */ + if (!is_64bit_mm(mm)) + return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; + + /* + * 'x86_virt_bits' returns what the hardware is capable + * of, and returns the full >32-bit address space when + * running 32-bit kernels on 64-bit hardware. 
+ */ + virt_space = (1ULL << boot_cpu_data.x86_virt_bits); + return virt_space / MPX_BD_NR_ENTRIES_64; } /* diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 4053bb58b..c3b3f653e 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) bi->start = max(bi->start, low); bi->end = min(bi->end, high); - /* and there's no empty block */ - if (bi->start >= bi->end) + /* and there's no empty or non-existent block */ + if (bi->start >= bi->end || + !memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) numa_remove_memblk_from(i--, mi); } diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 8ff686aa7..5f169d5d7 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -8,6 +8,7 @@ #include <linux/kthread.h> #include <linux/random.h> #include <linux/kernel.h> +#include <linux/init.h> #include <linux/mm.h> #include <linux/vmalloc.h> @@ -256,5 +257,4 @@ static int start_pageattr_test(void) return 0; } - -module_init(start_pageattr_test); +device_initcall(start_pageattr_test); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 727158cb3..2c44c0792 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -4,7 +4,6 @@ */ #include <linux/highmem.h> #include <linux/bootmem.h> -#include <linux/module.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/interrupt.h> diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 66338a60a..c2aea63be 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -192,10 +192,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n", + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1, - hotpluggable ? " hotplug" : ""); + hotpluggable ? 
" hotplug" : "", + ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : ""); /* Mark hotplug range in memblock. */ if (hotpluggable && memblock_mark_hotplug(start, ma->length)) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 90b924acd..8ddb5d0d6 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_end = end; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start); if (is_uv_system()) { unsigned int cpu; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index be2e7a2b1..70efcd094 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog) * goto out; * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; - * prog = array->prog[index]; + * prog = array->ptrs[index]; * if (prog == NULL) * goto out; * goto *(prog->bpf_func + prologue_size); @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ - /* prog = array->prog[index]; */ + /* prog = array->ptrs[index]; */ EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ - offsetof(struct bpf_array, prog)); + offsetof(struct bpf_array, ptrs)); EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) @@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } + +static void emit_load_skb_data_hlen(u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* r9d = skb->len - skb->data_len (headlen) + * r10 = skb->data + */ + /* mov %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len)); + + /* sub %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len)); + + /* mov %r10, off32(%rdi) */ + EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct 
sk_buff, data)); + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, emit_prologue(&prog); - if (seen_ld_abs) { - /* r9d : skb->len - skb->data_len (headlen) - * r10 : skb->data - */ - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov %r9d, off8(%rdi) */ - EMIT4(0x44, 0x8b, 0x4f, - offsetof(struct sk_buff, len)); - else - /* mov %r9d, off32(%rdi) */ - EMIT3_off32(0x44, 0x8b, 0x8f, - offsetof(struct sk_buff, len)); - - if (is_imm8(offsetof(struct sk_buff, data_len))) - /* sub %r9d, off8(%rdi) */ - EMIT4(0x44, 0x2b, 0x4f, - offsetof(struct sk_buff, data_len)); - else - EMIT3_off32(0x44, 0x2b, 0x8f, - offsetof(struct sk_buff, data_len)); - - if (is_imm8(offsetof(struct sk_buff, data))) - /* mov %r10, off8(%rdi) */ - EMIT4(0x4c, 0x8b, 0x57, - offsetof(struct sk_buff, data)); - else - /* mov %r10, off32(%rdi) */ - EMIT3_off32(0x4c, 0x8b, 0x97, - offsetof(struct sk_buff, data)); - } + if (seen_ld_abs) + emit_load_skb_data_hlen(&prog); for (i = 0; i < insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; @@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 b1 = 0, b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; + bool reload_skb_data; int ilen; u8 *func; @@ -818,12 +811,18 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - EMIT2(0x41, 0x52); /* push %r10 */ - EMIT2(0x41, 0x51); /* push %r9 */ - /* need to adjust jmp offset, since - * pop %r9, pop %r10 take 4 bytes after call insn - */ - jmp_offset += 4; + reload_skb_data = bpf_helper_changes_skb_data(func); + if (reload_skb_data) { + EMIT1(0x57); /* push %rdi */ + jmp_offset += 22; /* pop, mov, sub, mov */ + } else { + EMIT2(0x41, 0x52); /* push %r10 */ + EMIT2(0x41, 0x51); /* push %r9 */ + /* need to adjust jmp offset, since + * pop 
%r9, pop %r10 take 4 bytes after call insn + */ + jmp_offset += 4; + } } if (!imm32 || !is_simm32(jmp_offset)) { pr_err("unsupported bpf func %d addr %p image %p\n", @@ -832,8 +831,13 @@ xadd: if (is_imm8(insn->off)) } EMIT1_off32(0xE8, jmp_offset); if (seen_ld_abs) { - EMIT2(0x41, 0x59); /* pop %r9 */ - EMIT2(0x41, 0x5A); /* pop %r10 */ + if (reload_skb_data) { + EMIT1(0x5F); /* pop %rdi */ + emit_load_skb_data_hlen(&prog); + } else { + EMIT2(0x41, 0x59); /* pop %r9 */ + EMIT2(0x41, 0x5A); /* pop %r10 */ + } } break; @@ -1099,7 +1103,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) } if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, proglen, 0, image); + bpf_jit_dump(prog->len, proglen, pass + 1, image); if (image) { bpf_flush_icache(header, image + proglen); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 8fd6f44ae..dc78a4a9a 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -673,24 +673,22 @@ int pcibios_add_device(struct pci_dev *dev) return 0; } -int pcibios_enable_device(struct pci_dev *dev, int mask) +int pcibios_alloc_irq(struct pci_dev *dev) { - int err; - - if ((err = pci_enable_resources(dev, mask)) < 0) - return err; - - if (!pci_dev_msi_enabled(dev)) - return pcibios_enable_irq(dev); - return 0; + return pcibios_enable_irq(dev); } -void pcibios_disable_device (struct pci_dev *dev) +void pcibios_free_irq(struct pci_dev *dev) { - if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) + if (pcibios_disable_irq) pcibios_disable_irq(dev); } +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + return pci_enable_resources(dev, mask); +} + int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 9a2b7101a..e58565556 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -62,19 +62,6 @@ static void pci_fixup_umc_ide(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide); -static void 
pci_fixup_ncr53c810(struct pci_dev *d) -{ - /* - * NCR 53C810 returns class code 0 (at least on some systems). - * Fix class to be PCI_CLASS_STORAGE_SCSI - */ - if (!d->class) { - dev_warn(&d->dev, "Fixing NCR 53C810 class code\n"); - d->class = PCI_CLASS_STORAGE_SCSI << 8; - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810); - static void pci_fixup_latency(struct pci_dev *d) { /* diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 7553921c1..0d24e7c10 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -213,13 +213,14 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) { struct irq_alloc_info info; int polarity; + int ret; - if (dev->irq_managed && dev->irq > 0) + if (pci_has_managed_irq(dev)) return 0; switch (intel_mid_identify_cpu()) { case INTEL_MID_CPU_CHIP_TANGIER: - polarity = 0; /* active high */ + polarity = IOAPIC_POL_HIGH; /* Special treatment for IRQ0 */ if (dev->irq == 0) { @@ -234,7 +235,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) } break; default: - polarity = 1; /* active low */ + polarity = IOAPIC_POL_LOW; break; } @@ -244,8 +245,9 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ - if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0) - return -EBUSY; + ret = mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info); + if (ret < 0) + return ret; dev->irq_managed = 1; @@ -254,14 +256,17 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && - dev->irq > 0) { + if (pci_has_managed_irq(dev)) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; + /* + * Don't reset dev->irq here, otherwise + * intel_mid_pci_irq_enable() will fail on next call. 
+ */ } } -struct pci_ops intel_mid_pci_ops = { +static struct pci_ops intel_mid_pci_ops = { .read = pci_read, .write = pci_write, }; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 9bd115484..32e70343e 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1202,7 +1202,7 @@ static int pirq_enable_irq(struct pci_dev *dev) struct pci_dev *temp_dev; int irq; - if (dev->irq_managed && dev->irq > 0) + if (pci_has_managed_irq(dev)) return 0; irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, @@ -1230,8 +1230,7 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { - dev->irq_managed = 1; - dev->irq = irq; + pci_set_managed_irq(dev, irq); dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); return 0; @@ -1257,24 +1256,10 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } -bool mp_should_keep_irq(struct device *dev) -{ - if (dev->power.is_prepared) - return true; -#ifdef CONFIG_PM - if (dev->power.runtime_status == RPM_SUSPENDING) - return true; -#endif - - return false; -} - static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && - dev->irq_managed && dev->irq) { + if (io_apic_assign_pci_irqs && pci_has_managed_irq(dev)) { mp_unmap_irq(dev->irq); - dev->irq = 0; - dev->irq_managed = 0; + pci_reset_managed_irq(dev); } } diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index d22f4b5bb..ff31ab464 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -179,7 +179,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (ret) goto error; i = 0; - list_for_each_entry(msidesc, &dev->msi_list, list) { + for_each_pci_msi_entry(msidesc, dev) { irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? 
@@ -230,7 +230,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; - list_for_each_entry(msidesc, &dev->msi_list, list) { + for_each_pci_msi_entry(msidesc, dev) { __pci_read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); @@ -274,7 +274,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret = 0; struct msi_desc *msidesc; - list_for_each_entry(msidesc, &dev->msi_list, list) { + for_each_pci_msi_entry(msidesc, dev) { struct physdev_map_pirq map_irq; domid_t domid; @@ -386,7 +386,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) { struct msi_desc *msidesc; - msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); + msidesc = first_pci_msi_entry(dev); if (msidesc->msi_attrib.is_msix) xen_pci_frontend_disable_msix(dev); else diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index f1a6c8e86..184842ef3 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -5,6 +5,7 @@ obj-y += efi/ obj-y += geode/ obj-y += goldfish/ obj-y += iris/ +obj-y += intel/ obj-y += intel-mid/ obj-y += intel-quark/ obj-y += olpc/ diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile index 0a3a40cbc..40983f5b0 100644 --- a/arch/x86/platform/atom/Makefile +++ b/arch/x86/platform/atom/Makefile @@ -1 +1,2 @@ -obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o +obj-$(CONFIG_PMC_ATOM) += pmc_atom.o +obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/platform/atom/pmc_atom.c index d66a4fe6c..964ff4fc6 100644 --- a/arch/x86/kernel/pmc_atom.c +++ b/arch/x86/platform/atom/pmc_atom.c @@ -15,7 +15,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/device.h> @@ -25,80 +24,149 @@ #include 
<asm/pmc_atom.h> +struct pmc_bit_map { + const char *name; + u32 bit_mask; +}; + +struct pmc_reg_map { + const struct pmc_bit_map *d3_sts_0; + const struct pmc_bit_map *d3_sts_1; + const struct pmc_bit_map *func_dis; + const struct pmc_bit_map *func_dis_2; + const struct pmc_bit_map *pss; +}; + struct pmc_dev { u32 base_addr; void __iomem *regmap; + const struct pmc_reg_map *map; #ifdef CONFIG_DEBUG_FS struct dentry *dbgfs_dir; #endif /* CONFIG_DEBUG_FS */ + bool init; }; static struct pmc_dev pmc_device; static u32 acpi_base_addr; -struct pmc_bit_map { - const char *name; - u32 bit_mask; +static const struct pmc_bit_map d3_sts_0_map[] = { + {"LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, + {"LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, + {"LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, + {"LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1}, + {"LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2}, + {"LPSS1_F5_SPI", BIT_LPSS1_F5_SPI}, + {"LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX}, + {"LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX}, + {"SCC_EMMC", BIT_SCC_EMMC}, + {"SCC_SDIO", BIT_SCC_SDIO}, + {"SCC_SDCARD", BIT_SCC_SDCARD}, + {"SCC_MIPI", BIT_SCC_MIPI}, + {"HDA", BIT_HDA}, + {"LPE", BIT_LPE}, + {"OTG", BIT_OTG}, + {"USH", BIT_USH}, + {"GBE", BIT_GBE}, + {"SATA", BIT_SATA}, + {"USB_EHCI", BIT_USB_EHCI}, + {"SEC", BIT_SEC}, + {"PCIE_PORT0", BIT_PCIE_PORT0}, + {"PCIE_PORT1", BIT_PCIE_PORT1}, + {"PCIE_PORT2", BIT_PCIE_PORT2}, + {"PCIE_PORT3", BIT_PCIE_PORT3}, + {"LPSS2_F0_DMA", BIT_LPSS2_F0_DMA}, + {"LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1}, + {"LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2}, + {"LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3}, + {"LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4}, + {"LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5}, + {"LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6}, + {"LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7}, + {}, +}; + +static struct pmc_bit_map byt_d3_sts_1_map[] = { + {"SMB", BIT_SMB}, + {"OTG_SS_PHY", BIT_OTG_SS_PHY}, + {"USH_SS_PHY", BIT_USH_SS_PHY}, + {"DFX", BIT_DFX}, + {}, }; -static const struct pmc_bit_map dev_map[] = { - {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, 
- {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, - {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, - {"3 - LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1}, - {"4 - LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2}, - {"5 - LPSS1_F5_SPI", BIT_LPSS1_F5_SPI}, - {"6 - LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX}, - {"7 - LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX}, - {"8 - SCC_EMMC", BIT_SCC_EMMC}, - {"9 - SCC_SDIO", BIT_SCC_SDIO}, - {"10 - SCC_SDCARD", BIT_SCC_SDCARD}, - {"11 - SCC_MIPI", BIT_SCC_MIPI}, - {"12 - HDA", BIT_HDA}, - {"13 - LPE", BIT_LPE}, - {"14 - OTG", BIT_OTG}, - {"15 - USH", BIT_USH}, - {"16 - GBE", BIT_GBE}, - {"17 - SATA", BIT_SATA}, - {"18 - USB_EHCI", BIT_USB_EHCI}, - {"19 - SEC", BIT_SEC}, - {"20 - PCIE_PORT0", BIT_PCIE_PORT0}, - {"21 - PCIE_PORT1", BIT_PCIE_PORT1}, - {"22 - PCIE_PORT2", BIT_PCIE_PORT2}, - {"23 - PCIE_PORT3", BIT_PCIE_PORT3}, - {"24 - LPSS2_F0_DMA", BIT_LPSS2_F0_DMA}, - {"25 - LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1}, - {"26 - LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2}, - {"27 - LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3}, - {"28 - LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4}, - {"29 - LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5}, - {"30 - LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6}, - {"31 - LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7}, - {"32 - SMB", BIT_SMB}, - {"33 - OTG_SS_PHY", BIT_OTG_SS_PHY}, - {"34 - USH_SS_PHY", BIT_USH_SS_PHY}, - {"35 - DFX", BIT_DFX}, +static struct pmc_bit_map cht_d3_sts_1_map[] = { + {"SMB", BIT_SMB}, + {"GMM", BIT_STS_GMM}, + {"ISH", BIT_STS_ISH}, + {}, }; -static const struct pmc_bit_map pss_map[] = { - {"0 - GBE", PMC_PSS_BIT_GBE}, - {"1 - SATA", PMC_PSS_BIT_SATA}, - {"2 - HDA", PMC_PSS_BIT_HDA}, - {"3 - SEC", PMC_PSS_BIT_SEC}, - {"4 - PCIE", PMC_PSS_BIT_PCIE}, - {"5 - LPSS", PMC_PSS_BIT_LPSS}, - {"6 - LPE", PMC_PSS_BIT_LPE}, - {"7 - DFX", PMC_PSS_BIT_DFX}, - {"8 - USH_CTRL", PMC_PSS_BIT_USH_CTRL}, - {"9 - USH_SUS", PMC_PSS_BIT_USH_SUS}, - {"10 - USH_VCCS", PMC_PSS_BIT_USH_VCCS}, - {"11 - USH_VCCA", PMC_PSS_BIT_USH_VCCA}, - {"12 - OTG_CTRL", PMC_PSS_BIT_OTG_CTRL}, - {"13 - OTG_VCCS", 
PMC_PSS_BIT_OTG_VCCS}, - {"14 - OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK}, - {"15 - OTG_VCCA", PMC_PSS_BIT_OTG_VCCA}, - {"16 - USB", PMC_PSS_BIT_USB}, - {"17 - USB_SUS", PMC_PSS_BIT_USB_SUS}, +static struct pmc_bit_map cht_func_dis_2_map[] = { + {"SMB", BIT_SMB}, + {"GMM", BIT_FD_GMM}, + {"ISH", BIT_FD_ISH}, + {}, +}; + +static const struct pmc_bit_map byt_pss_map[] = { + {"GBE", PMC_PSS_BIT_GBE}, + {"SATA", PMC_PSS_BIT_SATA}, + {"HDA", PMC_PSS_BIT_HDA}, + {"SEC", PMC_PSS_BIT_SEC}, + {"PCIE", PMC_PSS_BIT_PCIE}, + {"LPSS", PMC_PSS_BIT_LPSS}, + {"LPE", PMC_PSS_BIT_LPE}, + {"DFX", PMC_PSS_BIT_DFX}, + {"USH_CTRL", PMC_PSS_BIT_USH_CTRL}, + {"USH_SUS", PMC_PSS_BIT_USH_SUS}, + {"USH_VCCS", PMC_PSS_BIT_USH_VCCS}, + {"USH_VCCA", PMC_PSS_BIT_USH_VCCA}, + {"OTG_CTRL", PMC_PSS_BIT_OTG_CTRL}, + {"OTG_VCCS", PMC_PSS_BIT_OTG_VCCS}, + {"OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK}, + {"OTG_VCCA", PMC_PSS_BIT_OTG_VCCA}, + {"USB", PMC_PSS_BIT_USB}, + {"USB_SUS", PMC_PSS_BIT_USB_SUS}, + {}, +}; + +static const struct pmc_bit_map cht_pss_map[] = { + {"SATA", PMC_PSS_BIT_SATA}, + {"HDA", PMC_PSS_BIT_HDA}, + {"SEC", PMC_PSS_BIT_SEC}, + {"PCIE", PMC_PSS_BIT_PCIE}, + {"LPSS", PMC_PSS_BIT_LPSS}, + {"LPE", PMC_PSS_BIT_LPE}, + {"UFS", PMC_PSS_BIT_CHT_UFS}, + {"UXD", PMC_PSS_BIT_CHT_UXD}, + {"UXD_FD", PMC_PSS_BIT_CHT_UXD_FD}, + {"UX_ENG", PMC_PSS_BIT_CHT_UX_ENG}, + {"USB_SUS", PMC_PSS_BIT_CHT_USB_SUS}, + {"GMM", PMC_PSS_BIT_CHT_GMM}, + {"ISH", PMC_PSS_BIT_CHT_ISH}, + {"DFX_MASTER", PMC_PSS_BIT_CHT_DFX_MASTER}, + {"DFX_CLUSTER1", PMC_PSS_BIT_CHT_DFX_CLUSTER1}, + {"DFX_CLUSTER2", PMC_PSS_BIT_CHT_DFX_CLUSTER2}, + {"DFX_CLUSTER3", PMC_PSS_BIT_CHT_DFX_CLUSTER3}, + {"DFX_CLUSTER4", PMC_PSS_BIT_CHT_DFX_CLUSTER4}, + {"DFX_CLUSTER5", PMC_PSS_BIT_CHT_DFX_CLUSTER5}, + {}, +}; + +static const struct pmc_reg_map byt_reg_map = { + .d3_sts_0 = d3_sts_0_map, + .d3_sts_1 = byt_d3_sts_1_map, + .func_dis = d3_sts_0_map, + .func_dis_2 = byt_d3_sts_1_map, + .pss = byt_pss_map, +}; + +static const struct pmc_reg_map 
cht_reg_map = { + .d3_sts_0 = d3_sts_0_map, + .d3_sts_1 = cht_d3_sts_1_map, + .func_dis = d3_sts_0_map, + .func_dis_2 = cht_func_dis_2_map, + .pss = cht_pss_map, }; static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset) @@ -111,6 +179,30 @@ static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val) writel(val, pmc->regmap + reg_offset); } +int pmc_atom_read(int offset, u32 *value) +{ + struct pmc_dev *pmc = &pmc_device; + + if (!pmc->init) + return -ENODEV; + + *value = pmc_reg_read(pmc, offset); + return 0; +} +EXPORT_SYMBOL_GPL(pmc_atom_read); + +int pmc_atom_write(int offset, u32 value) +{ + struct pmc_dev *pmc = &pmc_device; + + if (!pmc->init) + return -ENODEV; + + pmc_reg_write(pmc, offset, value); + return 0; +} +EXPORT_SYMBOL_GPL(pmc_atom_write); + static void pmc_power_off(void) { u16 pm1_cnt_port; @@ -142,37 +234,39 @@ static void pmc_hw_reg_setup(struct pmc_dev *pmc) } #ifdef CONFIG_DEBUG_FS +static void pmc_dev_state_print(struct seq_file *s, int reg_index, + u32 sts, const struct pmc_bit_map *sts_map, + u32 fd, const struct pmc_bit_map *fd_map) +{ + int offset = PMC_REG_BIT_WIDTH * reg_index; + int index; + + for (index = 0; sts_map[index].name; index++) { + seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n", + offset + index, sts_map[index].name, + fd_map[index].bit_mask & fd ? "Disabled" : "Enabled ", + sts_map[index].bit_mask & sts ? 
"D3" : "D0"); + } +} + static int pmc_dev_state_show(struct seq_file *s, void *unused) { struct pmc_dev *pmc = s->private; - u32 func_dis, func_dis_2, func_dis_index; - u32 d3_sts_0, d3_sts_1, d3_sts_index; - int dev_num, dev_index, reg_index; + const struct pmc_reg_map *m = pmc->map; + u32 func_dis, func_dis_2; + u32 d3_sts_0, d3_sts_1; func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS); func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2); d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0); d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1); - dev_num = ARRAY_SIZE(dev_map); - - for (dev_index = 0; dev_index < dev_num; dev_index++) { - reg_index = dev_index / PMC_REG_BIT_WIDTH; - if (reg_index) { - func_dis_index = func_dis_2; - d3_sts_index = d3_sts_1; - } else { - func_dis_index = func_dis; - d3_sts_index = d3_sts_0; - } - - seq_printf(s, "Dev: %-32s\tState: %s [%s]\n", - dev_map[dev_index].name, - dev_map[dev_index].bit_mask & func_dis_index ? - "Disabled" : "Enabled ", - dev_map[dev_index].bit_mask & d3_sts_index ? - "D3" : "D0"); - } + /* Low part */ + pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis); + + /* High part */ + pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2); + return 0; } @@ -191,13 +285,14 @@ static const struct file_operations pmc_dev_state_ops = { static int pmc_pss_state_show(struct seq_file *s, void *unused) { struct pmc_dev *pmc = s->private; + const struct pmc_bit_map *map = pmc->map->pss; u32 pss = pmc_reg_read(pmc, PMC_PSS); - int pss_index; + int index; - for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) { - seq_printf(s, "Island: %-32s\tState: %s\n", - pss_map[pss_index].name, - pss_map[pss_index].bit_mask & pss ? "Off" : "On"); + for (index = 0; map[index].name; index++) { + seq_printf(s, "Island: %-2d - %-32s\tState: %s\n", + index, map[index].name, + map[index].bit_mask & pss ? 
"Off" : "On"); } return 0; } @@ -250,7 +345,7 @@ static void pmc_dbgfs_unregister(struct pmc_dev *pmc) debugfs_remove_recursive(pmc->dbgfs_dir); } -static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) +static int pmc_dbgfs_register(struct pmc_dev *pmc) { struct dentry *dir, *f; @@ -262,24 +357,18 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO, dir, pmc, &pmc_dev_state_ops); - if (!f) { - dev_err(&pdev->dev, "dev_state register failed\n"); + if (!f) goto err; - } f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO, dir, pmc, &pmc_pss_state_ops); - if (!f) { - dev_err(&pdev->dev, "pss_state register failed\n"); + if (!f) goto err; - } f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO, dir, pmc, &pmc_sleep_tmr_ops); - if (!f) { - dev_err(&pdev->dev, "sleep_state register failed\n"); + if (!f) goto err; - } return 0; err: @@ -287,15 +376,16 @@ err: return -ENODEV; } #else -static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) +static int pmc_dbgfs_register(struct pmc_dev *pmc) { return 0; } #endif /* CONFIG_DEBUG_FS */ -static int pmc_setup_dev(struct pci_dev *pdev) +static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent) { struct pmc_dev *pmc = &pmc_device; + const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data; int ret; /* Obtain ACPI base address */ @@ -315,32 +405,30 @@ static int pmc_setup_dev(struct pci_dev *pdev) return -ENOMEM; } + pmc->map = map; + /* PMC hardware registers setup */ pmc_hw_reg_setup(pmc); - ret = pmc_dbgfs_register(pmc, pdev); - if (ret) { - iounmap(pmc->regmap); - } + ret = pmc_dbgfs_register(pmc); + if (ret) + dev_warn(&pdev->dev, "debugfs register failed\n"); + pmc->init = true; return ret; } /* * Data for PCI driver interface * - * This data only exists for exporting the supported - * PCI ids via MODULE_DEVICE_TABLE. 
We do not actually - * register a pci_driver, because lpc_ich will register - * a driver on the same PCI id. + * used by pci_match_id() call below. */ static const struct pci_device_id pmc_pci_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map }, { 0, }, }; -MODULE_DEVICE_TABLE(pci, pmc_pci_ids); - static int __init pmc_atom_init(void) { struct pci_dev *pdev = NULL; @@ -357,15 +445,16 @@ static int __init pmc_atom_init(void) for_each_pci_dev(pdev) { ent = pci_match_id(pmc_pci_ids, pdev); if (ent) - return pmc_setup_dev(pdev); + return pmc_setup_dev(pdev, ent); } /* Device not found. */ return -ENODEV; } -module_init(pmc_atom_init); -/* no module_exit, this driver shouldn't be unloaded */ +device_initcall(pmc_atom_init); +/* MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>"); MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface"); MODULE_LICENSE("GPL v2"); +*/ diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c6835bfad..6a28ded74 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) static void __init save_runtime_map(void) { -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; void *tmp, *p, *q = NULL; int count = 0; @@ -813,7 +813,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift) static void __init kexec_enter_virtual_mode(void) { -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; void *p; diff --git a/arch/x86/platform/intel/Makefile b/arch/x86/platform/intel/Makefile new file mode 100644 index 000000000..b878032fb --- /dev/null +++ b/arch/x86/platform/intel/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o diff --git a/arch/x86/kernel/iosf_mbi.c 
b/arch/x86/platform/intel/iosf_mbi.c index 82f8d02f0..edf2c54bf 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -30,7 +30,9 @@ #define PCI_DEVICE_ID_BAYTRAIL 0x0F00 #define PCI_DEVICE_ID_BRASWELL 0x2280 #define PCI_DEVICE_ID_QUARK_X1000 0x0958 +#define PCI_DEVICE_ID_TANGIER 0x1170 +static struct pci_dev *mbi_pdev; static DEFINE_SPINLOCK(iosf_mbi_lock); static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) @@ -38,8 +40,6 @@ static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE; } -static struct pci_dev *mbi_pdev; /* one mbi device */ - static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr) { int result; @@ -104,7 +104,7 @@ int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) unsigned long flags; int ret; - /*Access to the GFX unit is handled by GPU code */ + /* Access to the GFX unit is handled by GPU code */ if (port == BT_MBI_UNIT_GFX) { WARN_ON(1); return -EPERM; @@ -127,7 +127,7 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) unsigned long flags; int ret; - /*Access to the GFX unit is handled by GPU code */ + /* Access to the GFX unit is handled by GPU code */ if (port == BT_MBI_UNIT_GFX) { WARN_ON(1); return -EPERM; @@ -151,7 +151,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) unsigned long flags; int ret; - /*Access to the GFX unit is handled by GPU code */ + /* Access to the GFX unit is handled by GPU code */ if (port == BT_MBI_UNIT_GFX) { WARN_ON(1); return -EPERM; @@ -240,17 +240,17 @@ static void iosf_sideband_debug_init(void) /* mdr */ d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr); - if (IS_ERR_OR_NULL(d)) + if (!d) goto cleanup; /* mcrx */ - debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); - if (IS_ERR_OR_NULL(d)) + d = debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); + if (!d) goto cleanup; /* mcr - initiates mailbox tranaction */ - debugfs_create_file("mcr", 
0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); - if (IS_ERR_OR_NULL(d)) + d = debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); + if (!d) goto cleanup; return; @@ -292,6 +292,7 @@ static const struct pci_device_id iosf_mbi_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_TANGIER) }, { 0, }, }; MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); @@ -314,10 +315,8 @@ static void __exit iosf_mbi_exit(void) iosf_debugfs_remove(); pci_unregister_driver(&iosf_mbi_pci_driver); - if (mbi_pdev) { - pci_dev_put(mbi_pdev); - mbi_pdev = NULL; - } + pci_dev_put(mbi_pdev); + mbi_pdev = NULL; } module_init(iosf_mbi_init); diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 8570abe68..e1c24631a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -89,7 +89,7 @@ static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL, - irq_data->node); + irq_data_get_node(irq_data)); if (!chip_data) return -ENOMEM; diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 020c101c2..5c9f63fa6 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void) touch_nmi_watchdog(); } -#if defined(CONFIG_KEXEC) +#if defined(CONFIG_KEXEC_CORE) static atomic_t uv_nmi_kexec_failed; static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { @@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) uv_nmi_sync_exit(0); } -#else /* !CONFIG_KEXEC */ +#else /* !CONFIG_KEXEC_CORE */ static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { if (master) pr_err("UV: NMI kdump: KEXEC not supported in this 
kernel\n"); } -#endif /* !CONFIG_KEXEC */ +#endif /* !CONFIG_KEXEC_CORE */ #ifdef CONFIG_KGDB #ifdef CONFIG_KGDB_KDB diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index a244237f3..2b158a9fa 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -32,8 +32,7 @@ static cycle_t uv_read_rtc(struct clocksource *cs); static int uv_rtc_next_event(unsigned long, struct clock_event_device *); -static void uv_rtc_timer_setup(enum clock_event_mode, - struct clock_event_device *); +static int uv_rtc_shutdown(struct clock_event_device *evt); static struct clocksource clocksource_uv = { .name = RTC_NAME, @@ -44,14 +43,14 @@ static struct clocksource clocksource_uv = { }; static struct clock_event_device clock_event_device_uv = { - .name = RTC_NAME, - .features = CLOCK_EVT_FEAT_ONESHOT, - .shift = 20, - .rating = 400, - .irq = -1, - .set_next_event = uv_rtc_next_event, - .set_mode = uv_rtc_timer_setup, - .event_handler = NULL, + .name = RTC_NAME, + .features = CLOCK_EVT_FEAT_ONESHOT, + .shift = 20, + .rating = 400, + .irq = -1, + .set_next_event = uv_rtc_next_event, + .set_state_shutdown = uv_rtc_shutdown, + .event_handler = NULL, }; static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); @@ -321,24 +320,14 @@ static int uv_rtc_next_event(unsigned long delta, } /* - * Setup the RTC timer in oneshot mode + * Shutdown the RTC timer */ -static void uv_rtc_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) +static int uv_rtc_shutdown(struct clock_event_device *evt) { int ced_cpu = cpumask_first(evt->cpumask); - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here yet */ - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu, 1); - break; - } + uv_rtc_unset_timer(ced_cpu, 1); + return 0; } static void uv_rtc_interrupt(void) diff --git a/arch/x86/ras/Kconfig 
b/arch/x86/ras/Kconfig new file mode 100644 index 000000000..10fea5fc8 --- /dev/null +++ b/arch/x86/ras/Kconfig @@ -0,0 +1,11 @@ +config AMD_MCE_INJ + tristate "Simple MCE injection interface for AMD processors" + depends on RAS && EDAC_DECODE_MCE && DEBUG_FS + default n + help + This is a simple debugfs interface to inject MCEs and test different + aspects of the MCE handling code. + + WARNING: Do not even assume this interface is staying stable! + + diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile new file mode 100644 index 000000000..dd2c98b84 --- /dev/null +++ b/arch/x86/ras/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_AMD_MCE_INJ) += mce_amd_inj.o + diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c new file mode 100644 index 000000000..17e35b5bf --- /dev/null +++ b/arch/x86/ras/mce_amd_inj.c @@ -0,0 +1,375 @@ +/* + * A simple MCE injection facility for testing different aspects of the RAS + * code. This driver should be built as module so that it can be loaded + * on production kernels for testing purposes. + * + * This file may be distributed under the terms of the GNU General Public + * License version 2. + * + * Copyright (c) 2010-15: Borislav Petkov <bp@alien8.de> + * Advanced Micro Devices Inc. 
+ */ + +#include <linux/kobject.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <asm/mce.h> + +#include "../kernel/cpu/mcheck/mce-internal.h" + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +static u8 n_banks; + +#define MAX_FLAG_OPT_SIZE 3 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); + +/* + * Caller needs to be make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu. + */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? 
(l |= BIT(18)) : (l &= ~BIT(18)); + + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); + + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + size_t ret; + + if (cnt > MAX_FLAG_OPT_SIZE) + cnt = MAX_FLAG_OPT_SIZE; + + ret = cnt; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += ret; + + return ret; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject? 
+ */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); + return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (inj_type == SW_INJ) { + mce_inject_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; + + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS, + (u32)mcg_status, (u32)(mcg_status >> 32)); + + wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); + + wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + + wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), + (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + + toggle_hw_mce_inject(cpu, false); + + smp_call_function_single(cpu, trigger_mce, NULL, 0); + +err: + put_online_cpus(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. 
+ */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= n_banks) { + pr_err("Non-existent MCE bank: %llu\n", val); + return -EINVAL; + } + + m->bank = val; + do_inject(); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. 
Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + struct dentry *d; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static int __init init_mce_inject(void) +{ + int i; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + n_banks = cap & MCG_BANKCNT_MASK; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + if (!dfs_inj) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { + dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, + dfs_fls[i].perm, + dfs_inj, + &i_mce, + dfs_fls[i].fops); + + if (!dfs_fls[i].d) + goto err_dfs_add; + } + + return 0; + +err_dfs_add: + while (--i >= 0) + debugfs_remove(dfs_fls[i].d); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; + + return -ENOMEM; +} + +static void __exit exit_mce_inject(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) + debugfs_remove(dfs_fls[i].d); + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; +} +module_init(init_mce_inject); +module_exit(exit_mce_inject); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Borislav Petkov <bp@alien8.de>"); +MODULE_AUTHOR("AMD Inc."); 
+MODULE_DESCRIPTION("MCE injection facility for RAS testing"); diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index b9531d343..755481f14 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -45,17 +45,4 @@ #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - * - * (Could use an alternative three way for this if there was one.) - */ -static inline void rdtsc_barrier(void) -{ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); -} - #endif diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 9701a4fd7..836a1eb5d 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -12,7 +12,10 @@ #include <skas.h> #include <sysdep/tls.h> -extern int modify_ldt(int func, void *ptr, unsigned long bytecount); +static inline int modify_ldt (int func, void *ptr, unsigned long bytecount) +{ + return syscall(__NR_modify_ldt, func, ptr, bytecount); +} static long write_ldt_entry(struct mm_id *mm_idp, int func, struct user_desc *desc, void **addr, int done) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 484145368..c7b15f3e2 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -7,6 +7,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help @@ -23,14 +24,18 @@ config XEN_PVHVM def_bool y depends on XEN && PCI && X86_LOCAL_APIC -config XEN_MAX_DOMAIN_MEMORY - int - default 500 if X86_64 - default 64 if X86_32 - depends on XEN - help - This only affects the sizing of some bss arrays, the unused - portions of which are freed. 
+config XEN_512GB + bool "Limit Xen pv-domain memory to 512GB" + depends on XEN && X86_64 + default y + help + Limit paravirtualized user domains to 512GB of RAM. + + The Xen tools and crash dump analysis tools might not support + pv-domains with more than 512 GB of RAM. This option controls the + default setting of the kernel to use only up to 512 GB or more. + It is always possible to change the default via specifying the + boot parameter "xen_512gb_limit". config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 4b6e29ac0..e47e52787 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o + p2m.o apic.o pmu.o obj-$(CONFIG_EVENT_TRACING) += trace.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 70e060ad8..acda713ab 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -7,6 +7,7 @@ #include <xen/xen.h> #include <xen/interface/physdev.h> #include "xen-ops.h" +#include "pmu.h" #include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) @@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { + if (reg == APIC_LVTPC) { + (void)pmu_apic_update(reg); + return; + } + /* Warn to see if there's any stray references */ WARN(1,"register: %x, value: %x\n", reg, val); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 777ad2f03..993b7a713 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -88,6 +88,7 @@ #include "mmu.h" #include "smp.h" #include "multicalls.h" +#include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -1014,8 +1015,7 @@ static void xen_write_cr0(unsigned long cr0) static void xen_write_cr4(unsigned long cr4) { - cr4 &= ~X86_CR4_PGE; - cr4 &= ~X86_CR4_PSE; + cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | 
X86_CR4_PCE); native_write_cr4(cr4); } @@ -1034,6 +1034,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) { u64 val; + if (pmu_msr_read(msr, &val, err)) + return val; + val = native_read_msr_safe(msr, err); switch (msr) { case MSR_IA32_APICBASE: @@ -1078,9 +1081,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) /* Fast syscall setup is all done in hypercalls, so these are all ignored. Stub them out here to stop Xen console noise. */ + break; default: - ret = native_write_msr_safe(msr, low, high); + if (!pmu_msr_write(msr, low, high, &ret)) + ret = native_write_msr_safe(msr, low, high); } return ret; @@ -1219,10 +1224,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_msr = xen_read_msr_safe, .write_msr = xen_write_msr_safe, - .read_tsc = native_read_tsc, - .read_pmc = native_read_pmc, - - .read_tscp = native_read_tscp, + .read_pmc = xen_read_pmc, .iret = xen_iret, #ifdef CONFIG_X86_64 @@ -1271,6 +1273,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = { static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); @@ -1614,7 +1620,9 @@ asmlinkage __visible void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, + xen_start_info->nr_pages); + xen_reserve_special_pages(); /* * Modify the cache mode translation tables to match Xen's PAT diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dd151b204..9c479fe40 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ 
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +static phys_addr_t xen_pt_base, xen_pt_size __initdata; /* * Just beyond the highest usermode address. STACK_TOP_MAX has a @@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr, xen_mc_flush(); } +/* + * Make a page range writeable and free it. + */ +static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) +{ + void *vaddr = __va(paddr); + void *vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) + make_lowmem_page_readwrite(vaddr); + + memblock_free(paddr, size); +} + +static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) +{ + unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; + + if (unpin) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); + ClearPagePinned(virt_to_page(__va(pa))); + xen_free_ro_pages(pa, PAGE_SIZE); +} + +/* + * Since it is well isolated we can (and since it is perhaps large we should) + * also free the page tables mapping the initial P->M table. 
+ */ +static void __init xen_cleanmfnmap(unsigned long vaddr) +{ + unsigned long va = vaddr & PMD_MASK; + unsigned long pa; + pgd_t *pgd = pgd_offset_k(va); + pud_t *pud_page = pud_offset(pgd, 0); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned int i; + bool unpin; + + unpin = (vaddr == 2 * PGDIR_SIZE); + set_pgd(pgd, __pgd(0)); + do { + pud = pud_page + pud_index(va); + if (pud_none(*pud)) { + va += PUD_SIZE; + } else if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + va += PUD_SIZE; + } else { + pmd = pmd_offset(pud, va); + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + } else if (!pmd_none(*pmd)) { + pte = pte_offset_kernel(pmd, va); + set_pmd(pmd, __pmd(0)); + for (i = 0; i < PTRS_PER_PTE; ++i) { + if (pte_none(pte[i])) + break; + pa = pte_pfn(pte[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + xen_cleanmfnmap_free_pgtbl(pte, unpin); + } + va += PMD_SIZE; + if (pmd_index(va)) + continue; + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd, unpin); + } + + } while (pud_index(va) || pmd_index(va)); + xen_cleanmfnmap_free_pgtbl(pud_page, unpin); +} + static void __init xen_pagetable_p2m_free(void) { unsigned long size; @@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void) /* using __ka address and sticking INVALID_P2M_ENTRY! */ memset((void *)xen_start_info->mfn_list, 0xff, size); - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + /* + * We could be in __ka space. 
+ * We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or + * xen_start_info->shared_info they are in going to crash. Fortunatly + * we have already revectored in xen_setup_kernel_pagetable and in + * xen_setup_shared_info. + */ size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); + if (addr >= __START_KERNEL_map) { + xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + memblock_free(__pa(addr), size); + } else { + xen_cleanmfnmap(addr); + } +} + +static void __init xen_pagetable_cleanhighmap(void) +{ + unsigned long size; + unsigned long addr; /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void) #ifdef CONFIG_X86_64 xen_pagetable_p2m_free(); + + xen_pagetable_cleanhighmap(); #endif /* And revector! Bye bye old array */ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; @@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { + unsigned long pfn; + + if (xen_feature(XENFEAT_writable_page_tables) || + xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->mfn_list >= __START_KERNEL_map) + return pte; + + /* + * Pages belonging to the initial p2m list mapped outside the default + * address range must be mapped read-only. This region contains the + * page tables for mapping the p2m list, too, and page tables MUST be + * mapped read-only. 
+ */ + pfn = pte_pfn(pte); + if (pfn >= xen_start_info->first_p2m_pfn && + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) + pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW); + return pte; } #endif /* CONFIG_X86_64 */ @@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } -static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) @@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * mappings. Considering that on Xen after the kernel mappings we * have the mappings of some pages that don't exist in pfn space, we * set max_pfn_mapped to the last real pfn mapped. */ - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + if (xen_start_info->mfn_list < __START_KERNEL_map) + max_pfn_mapped = xen_start_info->first_p2m_pfn; + else + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); pt_end = pt_base + xen_start_info->nr_pt_frames; @@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); + /* Copy the initial P->M table mappings if necessary. 
*/ + i = pgd_index(xen_start_info->mfn_list); + if (i && i < pgd_index(__START_KERNEL_map)) + init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); @@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) check_pt_base(&pt_base, &pt_end, addr[i]); /* Our (by three pages) smaller Xen pagetable that we are using */ - memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + xen_pt_base = PFN_PHYS(pt_base); + xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; + memblock_reserve(xen_pt_base, xen_pt_size); + /* Revector the xen_start_info */ xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } + +/* + * Read a value from a physical address. + */ +static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) +{ + unsigned long *vaddr; + unsigned long val; + + vaddr = early_memremap_ro(addr, sizeof(val)); + val = *vaddr; + early_memunmap(vaddr, sizeof(val)); + return val; +} + +/* + * Translate a virtual address to a physical one without relying on mapped + * page tables. 
+ */ +static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) +{ + phys_addr_t pa; + pgd_t pgd; + pud_t pud; + pmd_t pmd; + pte_t pte; + + pa = read_cr3(); + pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * + sizeof(pgd))); + if (!pgd_present(pgd)) + return 0; + + pa = pgd_val(pgd) & PTE_PFN_MASK; + pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * + sizeof(pud))); + if (!pud_present(pud)) + return 0; + pa = pud_pfn(pud) << PAGE_SHIFT; + if (pud_large(pud)) + return pa + (vaddr & ~PUD_MASK); + + pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * + sizeof(pmd))); + if (!pmd_present(pmd)) + return 0; + pa = pmd_pfn(pmd) << PAGE_SHIFT; + if (pmd_large(pmd)) + return pa + (vaddr & ~PMD_MASK); + + pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * + sizeof(pte))); + if (!pte_present(pte)) + return 0; + pa = pte_pfn(pte) << PAGE_SHIFT; + + return pa | (vaddr & ~PAGE_MASK); +} + +/* + * Find a new area for the hypervisor supplied p2m list and relocate the p2m to + * this area. + */ +void __init xen_relocate_p2m(void) +{ + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; + int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + unsigned long *new_p2m; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + n_frames = n_pte + n_pt + n_pmd + n_pud; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); + BUG(); + } + + /* + * Setup the page tables for addressing the new p2m list. 
+ * We have asked the hypervisor to map the p2m list at the user address + * PUD_SIZE. It may have done so, or it may have used a kernel space + * address depending on the Xen version. + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. + */ + pud_phys = new_area; + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; + } + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; + } + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } + + /* Now copy the old p2m info to the new area. */ + memcpy(new_p2m, xen_p2m_addr, size); + xen_p2m_addr = new_p2m; + + /* Release the old p2m list and set new list info. 
*/ + p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); + BUG_ON(!p2m_pfn); + p2m_pfn_end = p2m_pfn + PFN_DOWN(size); + + if (xen_start_info->mfn_list < __START_KERNEL_map) { + pfn = xen_start_info->first_p2m_pfn; + pfn_end = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + set_pgd(pgd + 1, __pgd(0)); + } else { + pfn = p2m_pfn; + pfn_end = p2m_pfn_end; + } + + memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + while (pfn < pfn_end) { + if (pfn == p2m_pfn) { + pfn = p2m_pfn_end; + continue; + } + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + pfn++; + } + + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; + xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); + xen_start_info->nr_p2m_frames = n_frames; +} + #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); @@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } +/* + * For 32 bit domains xen_start_info->pt_base is the pgd address which might be + * not the first page table in the page table pool. + * Iterate through the initial page tables to find the real page table base. 
+ */ +static phys_addr_t xen_find_pt_base(pmd_t *pmd) +{ + phys_addr_t pt_base, paddr; + unsigned pmdidx; + + pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); + + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) + if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { + paddr = m2p(pmd[pmdidx].pmd); + pt_base = min(pt_base, paddr); + } + + return pt_base; +} + void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + + xen_pt_base = xen_find_pt_base(kernel_pmd); + xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; + initial_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + - xen_start_info->nr_pt_frames * PAGE_SIZE + - 512*1024); + max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); @@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + memblock_reserve(xen_pt_base, xen_pt_size); } #endif /* CONFIG_X86_64 */ +void __init xen_reserve_special_pages(void) +{ + phys_addr_t paddr; + + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + if (xen_start_info->store_mfn) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } + if (!xen_initial_domain()) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } +} + +void __init xen_pt_check_e820(void) +{ + if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { + xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); + BUG(); + } +} + 
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) @@ -2465,9 +2812,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, return 0; } -static int do_remap_mfn(struct vm_area_struct *vma, +static int do_remap_gfn(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) @@ -2483,14 +2830,14 @@ static int do_remap_mfn(struct vm_area_struct *vma, if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_XEN_PVH /* We need to update the local page tables and the xen HAP */ - return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr, + return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, prot, domid, pages); #else return -EINVAL; #endif } - rmd.mfn = mfn; + rmd.mfn = gfn; rmd.prot = prot; /* We use the err_ptr to indicate if there we are doing a contigious * mapping or a discontigious mapping. */ @@ -2518,8 +2865,8 @@ static int do_remap_mfn(struct vm_area_struct *vma, batch_left, &done, domid); /* - * @err_ptr may be the same buffer as @mfn, so - * only clear it after each chunk of @mfn is + * @err_ptr may be the same buffer as @gfn, so + * only clear it after each chunk of @gfn is * used. */ if (err_ptr) { @@ -2549,19 +2896,19 @@ out: return err < 0 ? 
err : mapped; } -int xen_remap_domain_mfn_range(struct vm_area_struct *vma, +int xen_remap_domain_gfn_range(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t mfn, int nr, + xen_pfn_t gfn, int nr, pgprot_t prot, unsigned domid, struct page **pages) { - return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages); + return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages); } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range); -int xen_remap_domain_mfn_array(struct vm_area_struct *vma, +int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) { @@ -2570,13 +2917,13 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma, * cause of "wrong memory was mapped in". */ BUG_ON(err_ptr == NULL); - return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages); + return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages); } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); /* Returns: 0 success */ -int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, +int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages) { if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) @@ -2588,4 +2935,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, return -EINVAL; #endif } -EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8b7f18e20..660b3cfef 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -79,10 +79,14 @@ #include <xen/balloon.h> #include <xen/grant_table.h> -#include "p2m.h" #include "multicalls.h" #include "xen-ops.h" +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + 
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) unsigned long *xen_p2m_addr __read_mostly; @@ -108,6 +112,15 @@ static unsigned long *p2m_identity; static pte_t *p2m_missing_pte; static pte_t *p2m_identity_pte; +/* + * Hint at last populated PFN. + * + * Used to set HYPERVISOR_shared_info->arch.max_pfn so the toolstack + * can avoid scanning the whole P2M (which may be sized to account for + * hotplugged memory). + */ +static unsigned long xen_p2m_last_pfn; + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -199,7 +212,8 @@ void __ref xen_build_mfn_list_list(void) unsigned int level, topidx, mididx; unsigned long *mid_mfn_p; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) return; /* Pre-initialize p2m_top_mfn to be completely missing */ @@ -260,9 +274,16 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; + if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL; + else + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); + HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; + HYPERVISOR_shared_info->arch.p2m_generation = 0; + HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr; + HYPERVISOR_shared_info->arch.p2m_cr3 = + xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -394,6 +415,8 @@ void __init xen_vmalloc_p2m_tree(void) static struct vm_struct vm; unsigned long p2m_limit; + xen_p2m_last_pfn = xen_max_p2m_pfn; + p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 
1024 / PAGE_SIZE; vm.flags = VM_ALLOC; vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit), @@ -478,8 +501,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) ptechk = lookup_address(vaddr, &level); if (ptechk == pte_pg) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pmd(pmdp, __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; pte_newpg[i] = NULL; } @@ -505,7 +532,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) */ static bool alloc_p2m(unsigned long pfn) { - unsigned topidx, mididx; + unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; pte_t *ptep, *pte_pg; unsigned int level; @@ -513,9 +540,6 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -527,7 +551,8 @@ static bool alloc_p2m(unsigned long pfn) return false; } - if (p2m_top_mfn) { + if (p2m_top_mfn && pfn < MAX_P2M_PFN) { + topidx = p2m_top_index(pfn); top_mfn_p = &p2m_top_mfn[topidx]; mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); @@ -577,10 +602,14 @@ static bool alloc_p2m(unsigned long pfn) spin_lock_irqsave(&p2m_update_lock, flags); if (pte_pfn(*ptep) == p2m_pfn) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pte(ptep, pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); + wmb(); /* Tools are synchronizing via p2m_generation. 
*/ + HYPERVISOR_shared_info->arch.p2m_generation++; if (mid_mfn) - mid_mfn[mididx] = virt_to_mfn(p2m); + mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m); p2m = NULL; } @@ -590,6 +619,12 @@ static bool alloc_p2m(unsigned long pfn) free_p2m_page(p2m); } + /* Expanded the p2m? */ + if (pfn > xen_p2m_last_pfn) { + xen_p2m_last_pfn = pfn; + HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; + } + return true; } @@ -630,6 +665,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + /* + * The interface requires atomic updates on p2m elements. + * xen_safe_write_ulong() is using __put_user which does an atomic + * store via asm(). + */ if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) return true; diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h deleted file mode 100644 index ad8aee24a..000000000 --- a/arch/x86/xen/p2m.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _XEN_P2M_H -#define _XEN_P2M_H - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -#define MAX_REMAP_RANGES 10 - -extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); - -#endif /* _XEN_P2M_H */ diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index a8261716d..9586ff328 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -68,7 +68,7 @@ static int check_platform_magic(void) return 0; } -bool xen_has_pv_devices() +bool xen_has_pv_devices(void) { if (!xen_domain()) return false; diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c new file mode 100644 index 000000000..724a08740 --- /dev/null +++ b/arch/x86/xen/pmu.c @@ -0,0 +1,570 @@ +#include <linux/types.h> +#include <linux/interrupt.h> + +#include <asm/xen/hypercall.h> +#include 
<xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> + +#include "xen-ops.h" +#include "pmu.h" + +/* x86_pmu.handle_irq definition */ +#include "../kernel/cpu/perf_event.h" + +#define XENPMU_IRQ_PROCESSING 1 +struct xenpmu { + /* Shared page between hypervisor and domain */ + struct xen_pmu_data *xenpmu_data; + + uint8_t flags; +}; +static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared); +#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data) +#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags) + +/* Macro for computing address of a PMU MSR bank */ +#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \ + (uintptr_t)ctxt->field)) + +/* AMD PMU */ +#define F15H_NUM_COUNTERS 6 +#define F10H_NUM_COUNTERS 4 + +static __read_mostly uint32_t amd_counters_base; +static __read_mostly uint32_t amd_ctrls_base; +static __read_mostly int amd_msr_step; +static __read_mostly int k7_counters_mirrored; +static __read_mostly int amd_num_counters; + +/* Intel PMU */ +#define MSR_TYPE_COUNTER 0 +#define MSR_TYPE_CTRL 1 +#define MSR_TYPE_GLOBAL 2 +#define MSR_TYPE_ARCH_COUNTER 3 +#define MSR_TYPE_ARCH_CTRL 4 + +/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */ +#define PMU_GENERAL_NR_SHIFT 8 +#define PMU_GENERAL_NR_BITS 8 +#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \ + << PMU_GENERAL_NR_SHIFT) + +/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */ +#define PMU_FIXED_NR_SHIFT 0 +#define PMU_FIXED_NR_BITS 5 +#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \ + << PMU_FIXED_NR_SHIFT) + +/* Alias registers (0x4c1) for full-width writes to PMCs */ +#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0)) + +#define INTEL_PMC_TYPE_SHIFT 30 + +static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters; + + +static void xen_pmu_arch_init(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + + switch 
(boot_cpu_data.x86) { + case 0x15: + amd_num_counters = F15H_NUM_COUNTERS; + amd_counters_base = MSR_F15H_PERF_CTR; + amd_ctrls_base = MSR_F15H_PERF_CTL; + amd_msr_step = 2; + k7_counters_mirrored = 1; + break; + case 0x10: + case 0x12: + case 0x14: + case 0x16: + default: + amd_num_counters = F10H_NUM_COUNTERS; + amd_counters_base = MSR_K7_PERFCTR0; + amd_ctrls_base = MSR_K7_EVNTSEL0; + amd_msr_step = 1; + k7_counters_mirrored = 0; + break; + } + } else { + uint32_t eax, ebx, ecx, edx; + + cpuid(0xa, &eax, &ebx, &ecx, &edx); + + intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >> + PMU_GENERAL_NR_SHIFT; + intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >> + PMU_FIXED_NR_SHIFT; + } +} + +static inline uint32_t get_fam15h_addr(u32 addr) +{ + switch (addr) { + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0); + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0); + default: + break; + } + + return addr; +} + +static inline bool is_amd_pmu_msr(unsigned int msr) +{ + if ((msr >= MSR_F15H_PERF_CTL && + msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || + (msr >= MSR_K7_EVNTSEL0 && + msr < MSR_K7_PERFCTR0 + amd_num_counters)) + return true; + + return false; +} + +static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) +{ + u32 msr_index_pmc; + + switch (msr_index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_IA32_DS_AREA: + case MSR_IA32_PEBS_ENABLE: + *type = MSR_TYPE_CTRL; + return true; + + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *type = MSR_TYPE_GLOBAL; + return true; + + default: + + if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) && + (msr_index < MSR_CORE_PERF_FIXED_CTR0 + + intel_num_fixed_counters)) { + *index = msr_index - MSR_CORE_PERF_FIXED_CTR0; + *type = MSR_TYPE_COUNTER; + 
return true; + } + + if ((msr_index >= MSR_P6_EVNTSEL0) && + (msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) { + *index = msr_index - MSR_P6_EVNTSEL0; + *type = MSR_TYPE_ARCH_CTRL; + return true; + } + + msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK; + if ((msr_index_pmc >= MSR_IA32_PERFCTR0) && + (msr_index_pmc < MSR_IA32_PERFCTR0 + + intel_num_arch_counters)) { + *type = MSR_TYPE_ARCH_COUNTER; + *index = msr_index_pmc - MSR_IA32_PERFCTR0; + return true; + } + return false; + } +} + +static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, + int index, bool is_read) +{ + uint64_t *reg = NULL; + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fix_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + ctxt = &xenpmu_data->pmu.c.intel; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + reg = &ctxt->global_ovf_ctrl; + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + reg = &ctxt->global_status; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + reg = &ctxt->global_ctrl; + break; + case MSR_CORE_PERF_FIXED_CTR_CTRL: + reg = &ctxt->fixed_ctrl; + break; + default: + switch (type) { + case MSR_TYPE_COUNTER: + fix_counters = field_offset(ctxt, fixed_counters); + reg = &fix_counters[index]; + break; + case MSR_TYPE_ARCH_COUNTER: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].counter; + break; + case MSR_TYPE_ARCH_CTRL: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].control; + break; + default: + return false; + } + } + + if (reg) { + if (is_read) + *val = *reg; + else { + *reg = *val; + + if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL) + ctxt->global_status &= (~(*val)); + } + return true; + } + + return false; +} + +static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) +{ + uint64_t 
*reg = NULL; + int i, off = 0; + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs, *ctrl_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + if (k7_counters_mirrored && + ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3))) + msr = get_fam15h_addr(msr); + + ctxt = &xenpmu_data->pmu.c.amd; + for (i = 0; i < amd_num_counters; i++) { + if (msr == amd_ctrls_base + off) { + ctrl_regs = field_offset(ctxt, ctrls); + reg = &ctrl_regs[i]; + break; + } else if (msr == amd_counters_base + off) { + counter_regs = field_offset(ctxt, counters); + reg = &counter_regs[i]; + break; + } + off += amd_msr_step; + } + + if (reg) { + if (is_read) + *val = *reg; + else + *reg = *val; + + return true; + } + return false; +} + +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, val, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } + + return false; +} + +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) +{ + uint64_t val = ((uint64_t)high << 32) | low; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, &val, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } + + return false; +} + +static unsigned long long xen_amd_read_pmc(int counter) +{ + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs; + 
struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + msr = amd_counters_base + (counter * amd_msr_step); + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.amd; + counter_regs = field_offset(ctxt, counters); + return counter_regs[counter]; +} + +static unsigned long long xen_intel_read_pmc(int counter) +{ + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fixed_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) + msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff); + else + msr = MSR_IA32_PERFCTR0 + counter; + + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.intel; + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) { + fixed_counters = field_offset(ctxt, fixed_counters); + return fixed_counters[counter & 0xffff]; + } + + arch_cntr_pair = field_offset(ctxt, arch_counters); + return arch_cntr_pair[counter].counter; +} + +unsigned long long xen_read_pmc(int counter) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return xen_amd_read_pmc(counter); + else + return xen_intel_read_pmc(counter); +} + +int pmu_apic_update(uint32_t val) +{ + int ret; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return -EINVAL; + } + + xenpmu_data->pmu.l.lapic_lvtpc = val; + + if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING) + return 0; + + ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL); + + return ret; +} + +/* perf callbacks */ +static int xen_is_in_guest(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + 
pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) + return 0; + + return 1; +} + +static int xen_is_user_mode(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) + return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); + else + return !!(xenpmu_data->pmu.r.regs.cpl & 3); +} + +static unsigned long xen_get_guest_ip(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + return xenpmu_data->pmu.r.regs.ip; +} + +static struct perf_guest_info_callbacks xen_guest_cbs = { + .is_in_guest = xen_is_in_guest, + .is_user_mode = xen_is_user_mode, + .get_guest_ip = xen_get_guest_ip, +}; + +/* Convert registers from Xen's format to Linux' */ +static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, + struct pt_regs *regs, uint64_t pmu_flags) +{ + regs->ip = xen_regs->ip; + regs->cs = xen_regs->cs; + regs->sp = xen_regs->sp; + + if (pmu_flags & PMU_SAMPLE_PV) { + if (pmu_flags & PMU_SAMPLE_USER) + regs->cs |= 3; + else + regs->cs &= ~3; + } else { + if (xen_regs->cpl) + regs->cs |= 3; + else + regs->cs &= ~3; + } +} + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) +{ + int err, ret = IRQ_NONE; + struct pt_regs regs; + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return ret; + } + + this_cpu_ptr(&xenpmu_shared)->flags = + xenpmu_flags | XENPMU_IRQ_PROCESSING; + xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs, + xenpmu_data->pmu.pmu_flags); + if (x86_pmu.handle_irq(&regs)) + ret = IRQ_HANDLED; + + /* Write out cached context to HW */ + err = 
HYPERVISOR_xenpmu_op(XENPMU_flush, NULL); + this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags; + if (err) { + pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err); + return IRQ_NONE; + } + + return ret; +} + +bool is_xen_pmu(int cpu) +{ + return (get_xenpmu_data() != NULL); +} + +void xen_pmu_init(int cpu) +{ + int err; + struct xen_pmu_params xp; + unsigned long pfn; + struct xen_pmu_data *xenpmu_data; + + BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE); + + if (xen_hvm_domain()) + return; + + xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL); + if (!xenpmu_data) { + pr_err("VPMU init: No memory\n"); + return; + } + pfn = virt_to_pfn(xenpmu_data); + + xp.val = pfn_to_mfn(pfn); + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp); + if (err) + goto fail; + + per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data; + per_cpu(xenpmu_shared, cpu).flags = 0; + + if (cpu == 0) { + perf_register_guest_info_callbacks(&xen_guest_cbs); + xen_pmu_arch_init(); + } + + return; + +fail: + pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n", + cpu, err); + free_pages((unsigned long)xenpmu_data, 0); +} + +void xen_pmu_finish(int cpu) +{ + struct xen_pmu_params xp; + + if (xen_hvm_domain()) + return; + + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + + (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp); + + free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0); + per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL; +} diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h new file mode 100644 index 000000000..af5f0ad94 --- /dev/null +++ b/arch/x86/xen/pmu.h @@ -0,0 +1,15 @@ +#ifndef __XEN_PMU_H +#define __XEN_PMU_H + +#include <xen/interface/xenpmu.h> + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +void xen_pmu_init(int cpu); +void xen_pmu_finish(int cpu); +bool is_xen_pmu(int cpu); +bool 
pmu_msr_read(unsigned int msr, uint64_t *val, int *err); +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); +int pmu_apic_update(uint32_t reg); +unsigned long long xen_read_pmc(int counter); + +#endif /* __XEN_PMU_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 55f388ef4..1c30e4ab1 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,17 +27,23 @@ #include <xen/interface/memory.h> #include <xen/interface/physdev.h> #include <xen/features.h> +#include <xen/hvc-console.h> #include "xen-ops.h" #include "vdso.h" -#include "p2m.h" #include "mmu.h" +#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) + /* Amount of extra memory space we add to the e820 ranges */ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* E820 map used during setting up memory. */ +static struct e820entry xen_e820_map[E820MAX] __initdata; +static u32 xen_e820_map_entries __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. 
* The physical page behind this address is remapped as needed to different @@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; */ #define EXTRA_MEM_RATIO (10) -static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) +static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); + +static void __init xen_parse_512gb(void) +{ + bool val = false; + char *arg; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit"); + if (!arg) + return; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); + if (!arg) + val = true; + else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + return; + + xen_512gb_limit = val; +} + +static void __init xen_add_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { /* Add new region. */ - if (xen_extra_mem[i].size == 0) { - xen_extra_mem[i].start = start; - xen_extra_mem[i].size = size; + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; break; } /* Append to existing region. 
*/ - if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { - xen_extra_mem[i].size += size; + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; break; } } if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_reserve(start, size); + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } -static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) +static void __init xen_del_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; - phys_addr_t start_r, size_r; + unsigned long start_r, size_r; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - start_r = xen_extra_mem[i].start; - size_r = xen_extra_mem[i].size; + start_r = xen_extra_mem[i].start_pfn; + size_r = xen_extra_mem[i].n_pfns; /* Start of region. */ - if (start_r == start) { - BUG_ON(size > size_r); - xen_extra_mem[i].start += size; - xen_extra_mem[i].size -= size; + if (start_r == start_pfn) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].start_pfn += n_pfns; + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* End of region. */ - if (start_r + size_r == start + size) { - BUG_ON(size > size_r); - xen_extra_mem[i].size -= size; + if (start_r + size_r == start_pfn + n_pfns) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* Mid of region. */ - if (start > start_r && start < start_r + size_r) { - BUG_ON(start + size > start_r + size_r); - xen_extra_mem[i].size = start - start_r; + if (start_pfn > start_r && start_pfn < start_r + size_r) { + BUG_ON(start_pfn + n_pfns > start_r + size_r); + xen_extra_mem[i].n_pfns = start_pfn - start_r; /* Calling memblock_reserve() again is okay. 
*/ - xen_add_extra_mem(start + size, start_r + size_r - - (start + size)); + xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r - + (start_pfn + n_pfns)); break; } } - memblock_free(start, size); + memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { int i; - phys_addr_t addr = PFN_PHYS(pfn); for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (addr >= xen_extra_mem[i].start && - addr < xen_extra_mem[i].start + xen_extra_mem[i].size) + if (pfn >= xen_extra_mem[i].start_pfn && + pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns) return INVALID_P2M_ENTRY; } @@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void) int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (!xen_extra_mem[i].size) + if (!xen_extra_mem[i].n_pfns) continue; - pfn_s = PFN_DOWN(xen_extra_mem[i].start); - pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); + pfn_s = xen_extra_mem[i].start_pfn; + pfn_e = pfn_s + xen_extra_mem[i].n_pfns; for (pfn = pfn_s; pfn < pfn_e; pfn++) set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void) * This function updates min_pfn with the pfn found and returns * the size of that range or zero if not found. */ -static unsigned long __init xen_find_pfn_range( - const struct e820entry *list, size_t map_size, - unsigned long *min_pfn) +static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) { - const struct e820entry *entry; + const struct e820entry *entry = xen_e820_map; unsigned int i; unsigned long done = 0; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; @@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. 
*/ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long *released) + unsigned long end_pfn, unsigned long nr_pages) { unsigned long pfn, end; int ret; @@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { - (*released)++; + xen_released_pages++; if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) break; } else @@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. */ static unsigned long __init xen_set_identity_and_remap_chunk( - const struct e820entry *list, size_t map_size, unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, - unsigned long *released, unsigned long *remapped) + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; @@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (cur_pfn + size > nr_pages) size = nr_pages - cur_pfn; - remap_range_size = xen_find_pfn_range(list, map_size, - &remap_pfn); + remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warning("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages, released); + cur_pfn + left, nr_pages); break; } /* Adjust size to fit in current e820 RAM region */ @@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk( /* Update variables to reflect new mappings. 
*/ i += size; remap_pfn += size; - *remapped += size; } /* @@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static void __init xen_set_identity_and_remap( - const struct e820entry *list, size_t map_size, unsigned long nr_pages, - unsigned long *released, unsigned long *remapped) +static void __init xen_set_identity_and_remap(unsigned long nr_pages) { phys_addr_t start = 0; unsigned long last_pfn = nr_pages; - const struct e820entry *entry; - unsigned long num_released = 0; - unsigned long num_remapped = 0; + const struct e820entry *entry = xen_e820_map; int i; /* @@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap( * example) the DMI tables in a reserved region that begins on * a non-page boundary. */ - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == map_size - 1) { + if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap( if (start_pfn < end_pfn) last_pfn = xen_set_identity_and_remap_chunk( - list, map_size, start_pfn, - end_pfn, nr_pages, last_pfn, - &num_released, &num_remapped); + start_pfn, end_pfn, nr_pages, + last_pfn); start = end; } } - *released = num_released; - *remapped = num_remapped; - - pr_info("Released %ld page(s)\n", num_released); + pr_info("Released %ld page(s)\n", xen_released_pages); } /* @@ -494,7 +513,7 @@ void __init xen_remap_memory(void) } else if (pfn_s + len == xen_remap_buf.target_pfn) { len += xen_remap_buf.size; } else { - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } @@ -504,18 +523,35 @@ void __init xen_remap_memory(void) } if (pfn_s != ~0UL && len) - 
xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); } +static unsigned long __init xen_get_pages_limit(void) +{ + unsigned long limit; + +#ifdef CONFIG_X86_32 + limit = GB(64) / PAGE_SIZE; +#else + limit = MAXMEM / PAGE_SIZE; + if (!xen_initial_domain() && xen_512gb_limit) + limit = GB(512) / PAGE_SIZE; +#endif + return limit; +} + static unsigned long __init xen_get_max_pages(void) { - unsigned long max_pages = MAX_DOMAIN_PAGES; + unsigned long max_pages, limit; domid_t domid = DOMID_SELF; - int ret; + long ret; + + limit = xen_get_pages_limit(); + max_pages = limit; /* * For the initial domain we use the maximum reservation as @@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void) max_pages = ret; } - return min(max_pages, MAX_DOMAIN_PAGES); + return min(max_pages, limit); } static void __init xen_align_and_add_e820_region(phys_addr_t start, @@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start, e820_add_region(start, end - start, type); } -static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) +static void __init xen_ignore_unusable(void) { - struct e820entry *entry; + struct e820entry *entry = xen_e820_map; unsigned int i; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { if (entry->type == E820_UNUSABLE) entry->type = E820_RAM; } } +static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) +{ + unsigned long extra = 0; + unsigned long start_pfn, end_pfn; + const struct e820entry *entry = xen_e820_map; + int i; + + end_pfn = 0; + for (i = 0; i < xen_e820_map_entries; i++, entry++) { + start_pfn = PFN_DOWN(entry->addr); + /* Adjacent regions on non-page boundaries handling! 
*/ + end_pfn = min(end_pfn, start_pfn); + + if (start_pfn >= max_pfn) + return extra + max_pfn - end_pfn; + + /* Add any holes in map to result. */ + extra += start_pfn - end_pfn; + + end_pfn = PFN_UP(entry->addr + entry->size); + end_pfn = min(end_pfn, max_pfn); + + if (entry->type != E820_RAM) + extra += end_pfn - start_pfn; + } + + return extra; +} + +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +{ + struct e820entry *entry; + unsigned mapcnt; + phys_addr_t end; + + if (!size) + return false; + + end = start + size; + entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) { + if (entry->type == E820_RAM && entry->addr <= start && + (entry->addr + entry->size) >= end) + return false; + + entry++; + } + + return true; +} + +/* + * Find a free area in physical memory not yet reserved and compliant with + * E820 map. + * Used to relocate pre-allocated areas like initrd or p2m list which are in + * conflict with the to be used E820 map. + * In case no area is found, return 0. Otherwise return the physical address + * of the area which is already reserved for convenience. + */ +phys_addr_t __init xen_find_free_area(phys_addr_t size) +{ + unsigned mapcnt; + phys_addr_t addr, start; + struct e820entry *entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) { + if (entry->type != E820_RAM || entry->size < size) + continue; + start = entry->addr; + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + if (!memblock_is_reserved(addr)) + continue; + start = addr + PAGE_SIZE; + if (start + size > entry->addr + entry->size) + break; + } + if (addr >= start + size) { + memblock_reserve(start, size); + return start; + } + } + + return 0; +} + +/* + * Like memcpy, but with physical addresses for dest and src. 
+ */ +static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src, + phys_addr_t n) +{ + phys_addr_t dest_off, src_off, dest_len, src_len, len; + void *from, *to; + + while (n) { + dest_off = dest & ~PAGE_MASK; + src_off = src & ~PAGE_MASK; + dest_len = n; + if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off) + dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off; + src_len = n; + if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off) + src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off; + len = min(dest_len, src_len); + to = early_memremap(dest - dest_off, dest_len + dest_off); + from = early_memremap(src - src_off, src_len + src_off); + memcpy(to, from, len); + early_memunmap(to, dest_len + dest_off); + early_memunmap(from, src_len + src_off); + n -= len; + dest += len; + src += len; + } +} + +/* + * Reserve Xen mfn_list. + */ +static void __init xen_reserve_xen_mfnlist(void) +{ + phys_addr_t start, size; + + if (xen_start_info->mfn_list >= __START_KERNEL_map) { + start = __pa(xen_start_info->mfn_list); + size = PFN_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + } else { + start = PFN_PHYS(xen_start_info->first_p2m_pfn); + size = PFN_PHYS(xen_start_info->nr_p2m_frames); + } + + if (!xen_is_e820_reserved(start, size)) { + memblock_reserve(start, size); + return; + } + +#ifdef CONFIG_X86_32 + /* + * Relocating the p2m on 32 bit system to an arbitrary virtual address + * is not supported, so just give up. + */ + xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n"); + BUG(); +#else + xen_relocate_p2m(); +#endif +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. 
**/ char * __init xen_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - - unsigned long max_pfn = xen_start_info->nr_pages; - phys_addr_t mem_end; + unsigned long max_pfn, pfn_s, n_pfns; + phys_addr_t mem_end, addr, size, chunk_size; + u32 type; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; - unsigned long remapped_pages; int i; int op; - max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + xen_parse_512gb(); + max_pfn = xen_get_pages_limit(); + max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? XENMEM_machine_memory_map : @@ -590,15 +775,16 @@ char * __init xen_memory_setup(void) if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; - map[0].addr = 0ULL; - map[0].size = mem_end; + xen_e820_map[0].addr = 0ULL; + xen_e820_map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ - map[0].size += 8ULL << 20; - map[0].type = E820_RAM; + xen_e820_map[0].size += 8ULL << 20; + xen_e820_map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); BUG_ON(memmap.nr_entries == 0); + xen_e820_map_entries = memmap.nr_entries; /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE @@ -609,24 +795,19 @@ char * __init xen_memory_setup(void) * a patch in the future. */ if (xen_initial_domain()) - xen_ignore_unusable(map, memmap.nr_entries); + xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered. */ - sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); + sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), + &xen_e820_map_entries); max_pages = xen_get_max_pages(); - if (max_pages > max_pfn) - extra_pages += max_pages - max_pfn; - /* - * Set identity map on non-RAM pages and prepare remapping the - * underlying RAM. 
- */ - xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, - &xen_released_pages, &remapped_pages); + /* How many extra pages do we need due to remapping? */ + max_pages += xen_count_remap_pages(max_pfn); - extra_pages += xen_released_pages; - extra_pages += remapped_pages; + if (max_pages > max_pfn) + extra_pages += max_pages - max_pfn; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO @@ -635,46 +816,54 @@ char * __init xen_memory_setup(void) * is limited to the max size of lowmem, so that it doesn't * get completely filled. * + * Make sure we have no memory above max_pages, as this area + * isn't handled by the p2m management. + * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. */ - extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - extra_pages); + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages, max_pages - max_pfn); i = 0; - while (i < memmap.nr_entries) { - phys_addr_t addr = map[i].addr; - phys_addr_t size = map[i].size; - u32 type = map[i].type; + addr = xen_e820_map[0].addr; + size = xen_e820_map[0].size; + while (i < xen_e820_map_entries) { + chunk_size = size; + type = xen_e820_map[i].type; if (type == E820_RAM) { if (addr < mem_end) { - size = min(size, mem_end - addr); + chunk_size = min(size, mem_end - addr); } else if (extra_pages) { - size = min(size, PFN_PHYS(extra_pages)); - extra_pages -= PFN_DOWN(size); - xen_add_extra_mem(addr, size); - xen_max_p2m_pfn = PFN_DOWN(addr + size); + chunk_size = min(size, PFN_PHYS(extra_pages)); + pfn_s = PFN_UP(addr); + n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s; + extra_pages -= n_pfns; + xen_add_extra_mem(pfn_s, n_pfns); + xen_max_p2m_pfn = pfn_s + n_pfns; } else type = E820_UNUSABLE; } - xen_align_and_add_e820_region(addr, size, type); + xen_align_and_add_e820_region(addr, chunk_size, type); - map[i].addr += size; - 
map[i].size -= size; - if (map[i].size == 0) + addr += chunk_size; + size -= chunk_size; + if (size == 0) { i++; + if (i < xen_e820_map_entries) { + addr = xen_e820_map[i].addr; + size = xen_e820_map[i].size; + } + } } /* * Set the rest as identity mapped, in case PCI BARs are * located here. - * - * PFNs above MAX_P2M_PFN are considered identity mapped as - * well. */ - set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + set_phys_range_identity(addr / PAGE_SIZE, ~0ul); /* * In domU, the ISA region is normal, usable memory, but we @@ -684,34 +873,53 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* - * Reserve Xen bits: - * - mfn_list - * - xen_start_info - * See comment above "struct start_info" in <xen/interface/xen.h> - * We tried to make the the memblock_reserve more selective so - * that it would be clear what region is reserved. Sadly we ran - * in the problem wherein on a 64-bit hypervisor with a 32-bit - * initial domain, the pt_base has the cr3 value which is not - * neccessarily where the pagetable starts! As Jan put it: " - * Actually, the adjustment turns out to be correct: The page - * tables for a 32-on-64 dom0 get allocated in the order "first L1", - * "first L2", "first L3", so the offset to the page table base is - * indeed 2. When reading xen/include/public/xen.h's comment - * very strictly, this is not a violation (since there nothing is said - * that the first thing in the page table space is pointed to by - * pt_base; I admit that this seems to be implied though, namely - * do I think that it is implied that the page table space is the - * range [pt_base, pt_base + nt_pt_frames), whereas that - * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), - * which - without a priori knowledge - the kernel would have - * difficulty to figure out)." 
- so lets just fall back to the - * easy way and reserve the whole region. + * Check whether the kernel itself conflicts with the target E820 map. + * Failing now is better than running into weird problems later due + * to relocating (and even reusing) pages with kernel text or data. */ - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + if (xen_is_e820_reserved(__pa_symbol(_text), + __pa_symbol(__bss_stop) - __pa_symbol(_text))) { + xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); + BUG(); + } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* + * Check for a conflict of the hypervisor supplied page tables with + * the target E820 map. + */ + xen_pt_check_e820(); + + xen_reserve_xen_mfnlist(); + + /* Check for a conflict of the initrd with the target E820 map. */ + if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image, + boot_params.hdr.ramdisk_size)) { + phys_addr_t new_area, start, size; + + new_area = xen_find_free_area(boot_params.hdr.ramdisk_size); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n"); + BUG(); + } + + start = boot_params.hdr.ramdisk_image; + size = boot_params.hdr.ramdisk_size; + xen_phys_memcpy(new_area, start, size); + pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", + start, start + size, new_area, new_area + size); + memblock_free(start, size); + boot_params.hdr.ramdisk_image = new_area; + boot_params.ext_ramdisk_image = new_area >> 32; + } + + /* + * Set identity map on non-RAM pages and prepare remapping the + * underlying RAM. 
+ */ + xen_set_identity_and_remap(max_pfn); return "Xen"; } @@ -721,26 +929,30 @@ char * __init xen_memory_setup(void) */ char * __init xen_auto_xlated_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - struct xen_memory_map memmap; int i; int rc; memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); if (rc < 0) panic("No memory map (%d)\n", rc); - sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); + xen_e820_map_entries = memmap.nr_entries; + + sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), + &xen_e820_map_entries); - for (i = 0; i < memmap.nr_entries; i++) - e820_add_region(map[i].addr, map[i].size, map[i].type); + for (i = 0; i < xen_e820_map_entries; i++) + e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, + xen_e820_map[i].type); - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + /* Remove p2m info, it is not needed. 
*/ + xen_start_info->mfn_list = 0; + xen_start_info->first_p2m_pfn = 0; + xen_start_info->nr_p2m_frames = 0; return "Xen"; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 864843844..3f4ebf026 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -26,6 +26,7 @@ #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> @@ -38,6 +39,7 @@ #include "xen-ops.h" #include "mmu.h" #include "smp.h" +#include "pmu.h" cpumask_var_t xen_cpu_initialized_map; @@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu) kfree(per_cpu(xen_irq_work, cpu).name); per_cpu(xen_irq_work, cpu).name = NULL; } + + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); + per_cpu(xen_pmu_irq, cpu).irq = -1; + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; + } }; static int xen_smp_intr_init(unsigned int cpu) { int rc; - char *resched_name, *callfunc_name, *debug_name; + char *resched_name, *callfunc_name, *debug_name, *pmu_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_irq_work, cpu).irq = rc; per_cpu(xen_irq_work, cpu).name = callfunc_name; + if (is_xen_pmu(cpu)) { + pmu_name = 
kasprintf(GFP_KERNEL, "pmu%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, + xen_pmu_irq_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + pmu_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_pmu_irq, cpu).irq = rc; + per_cpu(xen_pmu_irq, cpu).name = pmu_name; + } + return 0; fail: @@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); + xen_pmu_init(0); + if (xen_smp_intr_init(0)) BUG(); @@ -429,7 +453,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) } #endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); - ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) BUG(); @@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; + xen_pmu_init(cpu); + rc = xen_smp_intr_init(cpu); if (rc) return rc; @@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu) xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 53b4c0811..feddabdab 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -11,6 +11,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "pmu.h" static void xen_pv_pre_suspend(void) { @@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled) void xen_arch_pre_suspend(void) { - if (xen_pv_domain()) - xen_pv_pre_suspend(); + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + + if (xen_pv_domain()) + xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - if (xen_pv_domain()) - xen_pv_post_suspend(cancelled); - else - xen_hvm_post_suspend(cancelled); + int cpu; + + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } static 
void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 55da33b1d..f1ba6a092 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -274,30 +274,18 @@ static s64 get_abs_timeout(unsigned long delta) return xen_clocksource_read() + delta; } -static void xen_timerop_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int xen_timerop_shutdown(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - /* unsupported */ - WARN_ON(1); - break; - - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - break; - - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - HYPERVISOR_set_timer_op(0); /* cancel timeout */ - break; - } + /* cancel timeout */ + HYPERVISOR_set_timer_op(0); + + return 0; } static int xen_timerop_set_next_event(unsigned long delta, struct clock_event_device *evt) { - WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + WARN_ON(!clockevent_state_oneshot(evt)); if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) BUG(); @@ -310,46 +298,39 @@ static int xen_timerop_set_next_event(unsigned long delta, } static const struct clock_event_device xen_timerop_clockevent = { - .name = "xen", - .features = CLOCK_EVT_FEAT_ONESHOT, + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, - .max_delta_ns = 0xffffffff, - .min_delta_ns = TIMER_SLOP, + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, - .mult = 1, - .shift = 0, - .rating = 500, + .mult = 1, + .shift = 0, + .rating = 500, - .set_mode = xen_timerop_set_mode, - .set_next_event = xen_timerop_set_next_event, + .set_state_shutdown = xen_timerop_shutdown, + .set_next_event = xen_timerop_set_next_event, }; +static int xen_vcpuop_shutdown(struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + + return 0; +} -static void 
xen_vcpuop_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int xen_vcpuop_set_oneshot(struct clock_event_device *evt) { int cpu = smp_processor_id(); - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - WARN_ON(1); /* unsupported */ - break; - - case CLOCK_EVT_MODE_ONESHOT: - if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) - BUG(); - break; + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || - HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) - BUG(); - break; - case CLOCK_EVT_MODE_RESUME: - break; - } + return 0; } static int xen_vcpuop_set_next_event(unsigned long delta, @@ -359,7 +340,7 @@ static int xen_vcpuop_set_next_event(unsigned long delta, struct vcpu_set_singleshot_timer single; int ret; - WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + WARN_ON(!clockevent_state_oneshot(evt)); single.timeout_abs_ns = get_abs_timeout(delta); single.flags = VCPU_SSHOTTMR_future; @@ -382,7 +363,8 @@ static const struct clock_event_device xen_vcpuop_clockevent = { .shift = 0, .rating = 500, - .set_mode = xen_vcpuop_set_mode, + .set_state_shutdown = xen_vcpuop_shutdown, + .set_state_oneshot = xen_vcpuop_set_oneshot, .set_next_event = xen_vcpuop_set_next_event, }; diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 8afdfccf6..b65f59a35 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -104,6 +104,8 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) #else ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) + /* Map the p2m table to a 512GB-aligned user address. 
*/ + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2292721b1..1399423f3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); +void __init xen_reserve_special_pages(void); +void __init xen_pt_check_e820(void); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); +#ifdef CONFIG_X86_64 +void __init xen_relocate_p2m(void); +#endif +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); +phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); char * xen_auto_xlated_memory_setup(void); void __init xen_arch_setup(void); |