From bdcfd44fb5b5fb8fd660e7f93f1095c507481024 Mon Sep 17 00:00:00 2001 From: AndrĂ© Fabian Silva Delgado Date: Sat, 16 Apr 2016 15:30:54 -0300 Subject: Linux-libre 4.5.1-gnu --- arch/x86/Kconfig | 45 +++++++---- arch/x86/entry/common.c | 23 +++--- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/microcode.h | 26 +++++++ arch/x86/include/asm/perf_event.h | 1 + arch/x86/include/asm/xen/hypervisor.h | 2 + arch/x86/kernel/apic/vector.c | 88 +++++++++++++++++----- arch/x86/kernel/cpu/microcode/intel.c | 38 +++++----- arch/x86/kernel/cpu/perf_event.c | 13 ++++ arch/x86/kernel/cpu/perf_event.h | 3 + arch/x86/kernel/cpu/perf_event_intel.c | 27 ++++++- arch/x86/kernel/cpu/perf_event_intel_ds.c | 24 +++++- .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 8 +- arch/x86/kernel/cpu/perf_event_knc.c | 4 +- arch/x86/kernel/ioport.c | 29 ++++++- arch/x86/kernel/process_64.c | 12 +++ arch/x86/kvm/i8254.c | 12 +-- arch/x86/kvm/vmx.c | 16 +++- arch/x86/kvm/x86.c | 1 + arch/x86/mm/tlb.c | 12 ++- arch/x86/pci/fixup.c | 7 ++ arch/x86/xen/enlighten.c | 2 +- 23 files changed, 315 insertions(+), 81 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c46662f64..b971ea4be 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -911,10 +911,26 @@ config SCHED_SMT depends on SMP ---help--- SMT scheduler support improves the CPU scheduler's decision making - when dealing with Intel Pentium 4 chips with HyperThreading at a + when dealing with Intel P4/Core 2 chips with HyperThreading at a cost of slightly increased overhead in some places. If unsure say N here. +config SMT_NICE + bool "SMT (Hyperthreading) aware nice priority and policy support" + depends on SCHED_BFS && SCHED_SMT + default y + ---help--- + Enabling Hyperthreading on Intel CPUs decreases the effectiveness + of the use of 'nice' levels and different scheduling policies + (e.g. realtime) due to sharing of CPU power between hyperthreads. + SMT nice support makes each logical CPU aware of what is running on + its hyperthread siblings, maintaining appropriate distribution of + CPU according to nice levels and scheduling policies at the expense + of slightly increased overhead. + + If unsure say Y here. + + config SCHED_MC def_bool y prompt "Multi-core scheduler support" @@ -1160,22 +1176,23 @@ config MICROCODE bool "CPU microcode loading support" default y depends on CPU_SUP_AMD || CPU_SUP_INTEL - depends on BLK_DEV_INITRD select FW_LOADER ---help--- - If you say Y here, you will be able to update the microcode on - certain Intel and AMD processors. The Intel support is for the - IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, - Xeon etc. The AMD support is for families 0x10 and later. You will - obviously need the actual microcode binary data itself which is not - shipped with the Linux kernel. + Intel and AMD processors. The Intel support is for the IA32 family, + e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The + AMD support is for families 0x10 and later. You will obviously need + the actual microcode binary data itself which is not shipped with + the Linux kernel. - This option selects the general module only, you need to select - at least one vendor specific module as well. + The preferred method to load microcode from a detached initrd is described + in Documentation/x86/early-microcode.txt. For that you need to enable + CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the + initrd for microcode blobs. - To compile this driver as a module, choose M here: the module - will be called microcode. + In addition, you can build-in the microcode into the kernel. For that you + need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode + to the CONFIG_EXTRA_FIRMWARE config option. config MICROCODE_INTEL bool "Intel microcode loading support" @@ -1995,7 +2012,7 @@ config HOTPLUG_CPU config BOOTPARAM_HOTPLUG_CPU0 bool "Set default setting of cpu0_hotpluggable" default n - depends on HOTPLUG_CPU + depends on HOTPLUG_CPU && !SCHED_BFS ---help--- Set whether default state of cpu0_hotpluggable is on or off. @@ -2024,7 +2041,7 @@ config BOOTPARAM_HOTPLUG_CPU0 config DEBUG_HOTPLUG_CPU0 def_bool n prompt "Debug CPU0 hotplug" - depends on HOTPLUG_CPU + depends on HOTPLUG_CPU && !SCHED_BFS ---help--- Enabling this option offlines CPU0 (if CPU0 can be offlined) as soon as possible and boots up userspace with CPU0 offlined. User diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 03663740c..1a4477ced 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -268,6 +268,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Called with IRQs disabled. */ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { + struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -275,12 +276,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) lockdep_sys_exit(); - cached_flags = - READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(ti->flags); if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. The fixup is exercised by the ptrace_syscall_32 + * selftest. + */ + ti->status &= ~TS_COMPAT; +#endif + user_enter(); } @@ -332,14 +343,6 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) syscall_slow_exit_work(regs, cached_flags); -#ifdef CONFIG_COMPAT - /* - * Compat syscalls set TS_COMPAT. Make sure we clear it before - * returning to user mode. - */ - ti->status &= ~TS_COMPAT; -#endif - local_irq_disable(); prepare_exit_to_usermode(regs); } diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c80f6b6f3..e8c4fba52 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -644,8 +644,8 @@ static inline void entering_irq(void) static inline void entering_ack_irq(void) { - ack_APIC_irq(); entering_irq(); + ack_APIC_irq(); } static inline void ipi_entering_ack_irq(void) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1815b7362..84b3d194a 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -141,6 +141,7 @@ struct irq_alloc_info { struct irq_cfg { unsigned int dest_apicid; u8 vector; + u8 old_vector; }; extern struct irq_cfg *irq_cfg(unsigned int irq); diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 1e1b07a5a..9d3a96c4d 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -3,6 +3,7 @@ #include #include +#include #define native_rdmsr(msr, val1, val2) \ do { \ @@ -143,4 +144,29 @@ static inline void reload_early_microcode(void) { } static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif + +static inline unsigned long get_initrd_start(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + return initrd_start; +#else + return 0; +#endif +} + +static inline unsigned long get_initrd_start_addr(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + + return (unsigned long)__pa_nodebug(*initrd_start_p); +#else + return get_initrd_start(); +#endif +#else /* CONFIG_BLK_DEV_INITRD */ + return 0; +#endif +} + #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7bcb861a0..5a2ed3ed2 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -165,6 +165,7 @@ struct x86_pmu_capability { #define GLOBAL_STATUS_ASIF BIT_ULL(60) #define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) +#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) /* * IBS cpuid feature detection diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 8b2d4bea9..39171b364 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -62,4 +62,6 @@ void xen_arch_register_cpu(int num); void xen_arch_unregister_cpu(int num); #endif +extern void xen_set_iopl_mask(unsigned mask); + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3b670df4b..ad59d70bc 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -213,6 +213,7 @@ update: */ cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); d->move_in_progress = !cpumask_empty(d->old_domain); + d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0; d->cfg.vector = vector; cpumask_copy(d->domain, vector_cpumask); success: @@ -655,46 +656,97 @@ void irq_complete_move(struct irq_cfg *cfg) } /* - * Called with @desc->lock held and interrupts disabled. + * Called from fixup_irqs() with @desc->lock held and interrupts disabled. */ void irq_force_complete_move(struct irq_desc *desc) { struct irq_data *irqdata = irq_desc_get_irq_data(desc); struct apic_chip_data *data = apic_chip_data(irqdata); struct irq_cfg *cfg = data ? &data->cfg : NULL; + unsigned int cpu; if (!cfg) return; - __irq_complete_move(cfg, cfg->vector); - /* * This is tricky. If the cleanup of @data->old_domain has not been * done yet, then the following setaffinity call will fail with * -EBUSY. This can leave the interrupt in a stale state. * - * The cleanup cannot make progress because we hold @desc->lock. So in - * case @data->old_domain is not yet cleaned up, we need to drop the - * lock and acquire it again. @desc cannot go away, because the - * hotplug code holds the sparse irq lock. + * All CPUs are stuck in stop machine with interrupts disabled so + * calling __irq_complete_move() would be completely pointless. */ raw_spin_lock(&vector_lock); - /* Clean out all offline cpus (including ourself) first. */ + /* + * Clean out all offline cpus (including the outgoing one) from the + * old_domain mask. + */ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); - while (!cpumask_empty(data->old_domain)) { + + /* + * If move_in_progress is cleared and the old_domain mask is empty, + * then there is nothing to cleanup. fixup_irqs() will take care of + * the stale vectors on the outgoing cpu. + */ + if (!data->move_in_progress && cpumask_empty(data->old_domain)) { raw_spin_unlock(&vector_lock); - raw_spin_unlock(&desc->lock); - cpu_relax(); - raw_spin_lock(&desc->lock); + return; + } + + /* + * 1) The interrupt is in move_in_progress state. That means that we + * have not seen an interrupt since the io_apic was reprogrammed to + * the new vector. + * + * 2) The interrupt has fired on the new vector, but the cleanup IPIs + * have not been processed yet. + */ + if (data->move_in_progress) { /* - * Reevaluate apic_chip_data. It might have been cleared after - * we dropped @desc->lock. + * In theory there is a race: + * + * set_ioapic(new_vector) <-- Interrupt is raised before update + * is effective, i.e. it's raised on + * the old vector. + * + * So if the target cpu cannot handle that interrupt before + * the old vector is cleaned up, we get a spurious interrupt + * and in the worst case the ioapic irq line becomes stale. + * + * But in case of cpu hotplug this should be a non issue + * because if the affinity update happens right before all + * cpus rendevouz in stop machine, there is no way that the + * interrupt can be blocked on the target cpu because all cpus + * loops first with interrupts enabled in stop machine, so the + * old vector is not yet cleaned up when the interrupt fires. + * + * So the only way to run into this issue is if the delivery + * of the interrupt on the apic/system bus would be delayed + * beyond the point where the target cpu disables interrupts + * in stop machine. I doubt that it can happen, but at least + * there is a theroretical chance. Virtualization might be + * able to expose this, but AFAICT the IOAPIC emulation is not + * as stupid as the real hardware. + * + * Anyway, there is nothing we can do about that at this point + * w/o refactoring the whole fixup_irq() business completely. + * We print at least the irq number and the old vector number, + * so we have the necessary information when a problem in that + * area arises. */ - data = apic_chip_data(irqdata); - if (!data) - return; - raw_spin_lock(&vector_lock); + pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", + irqdata->irq, cfg->old_vector); } + /* + * If old_domain is not empty, then other cpus still have the irq + * descriptor set in their vector array. Clean it up. + */ + for_each_cpu(cpu, data->old_domain) + per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED; + + /* Cleanup the left overs of the (half finished) move */ + cpumask_clear(data->old_domain); + data->move_in_progress = 0; raw_spin_unlock(&vector_lock); } #endif diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index de8a164a6..f9c7f3302 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -551,10 +551,14 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, cd.data = NULL; cd.size = 0; - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) { + /* try built-in microcode if no initrd */ + if (!size) { if (!load_builtin_intel_microcode(&cd)) return UCODE_ERROR; + } else { + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; } return get_matching_model_microcode(0, start, cd.data, cd.size, @@ -690,7 +694,7 @@ int __init save_microcode_in_initrd_intel(void) if (count == 0) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -728,16 +732,20 @@ void __init load_ucode_intel_bsp(void) struct boot_params *p; p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; size = p->hdr.ramdisk_size; - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? p->hdr.ramdisk_image : 0); + + _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + start, size); #else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; size = boot_params.hdr.ramdisk_size; + start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); #endif @@ -748,20 +756,14 @@ void load_ucode_intel_ap(void) struct mc_saved_data *mc_saved_data_p; struct ucode_cpu_info uci; unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; enum ucode_state ret; #ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); #else - mc_saved_data_p = &mc_saved_data; mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; + mc_saved_data_p = &mc_saved_data; #endif /* @@ -773,7 +775,7 @@ void load_ucode_intel_ap(void) collect_cpu_info_early(&uci); ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); + get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1b443db2d..6532f5b40 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -596,6 +596,19 @@ void x86_pmu_disable_all(void) } } +/* + * There may be PMI landing after enabled=0. The PMI hitting could be before or + * after disable_all. + * + * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. + * It will not be re-enabled in the NMI handler again, because enabled=0. After + * handling the NMI, disable_all will be called, which will not change the + * state either. If PMI hits after disable_all, the PMU is already disabled + * before entering NMI handler. The NMI handler will not change the state + * either. + * + * So either situation is harmless. + */ static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7bb61e32f..98be6d6d3 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -586,6 +586,7 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1; int pebs_record_size; + int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); @@ -904,6 +905,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_pebs_data_source_nhm(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index fed2ab1f1..760aec1e8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1502,7 +1502,15 @@ static __initconst const u64 knl_hw_cache_extra_regs }; /* - * Use from PMIs where the LBRs are already disabled. + * Used from PMIs where the LBRs are already disabled. + * + * This function could be called consecutively. It is required to remain in + * disabled state if called consecutively. + * + * During consecutive calls, the same disable value will be written to related + * registers, so the PMU state remains unchanged. hw.state in + * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive + * calls. */ static void __intel_pmu_disable_all(void) { @@ -1884,6 +1892,16 @@ again: if (__test_and_clear_bit(62, (unsigned long *)&status)) { handled++; x86_pmu.drain_pebs(regs); + /* + * There are cases where, even though, the PEBS ovfl bit is set + * in GLOBAL_OVF_STATUS, the PEBS events may also have their + * overflow bits set for their counters. We must clear them + * here because they have been processed as exact samples in + * the drain_pebs() routine. They must not be processed again + * in the for_each_bit_set() loop for regular samples below. + */ + status &= ~cpuc->pebs_enabled; + status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; } /* @@ -1929,7 +1947,10 @@ again: goto again; done: - __intel_pmu_enable_all(0, true); + /* Only restore PMU state when it's active. See x86_pmu_disable(). */ + if (cpuc->enabled) + __intel_pmu_enable_all(0, true); + /* * Only unmask the NMI after the overflow counters * have been reset. This avoids spurious NMIs on @@ -3396,6 +3417,7 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); pr_cont("Nehalem events, "); @@ -3459,6 +3481,7 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + intel_pmu_pebs_data_source_nhm(); pr_cont("Westmere events, "); break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 10602f0a4..955140140 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -51,7 +51,8 @@ union intel_x86_pebs_dse { #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) -static const u64 pebs_data_source[] = { +/* Version for Sandy Bridge and later */ +static u64 pebs_data_source[] = { P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ @@ -70,6 +71,14 @@ static const u64 pebs_data_source[] = { OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ }; +/* Patch up minor differences in the bits */ +void __init intel_pmu_pebs_data_source_nhm(void) +{ + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; @@ -269,7 +278,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); + buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -286,7 +295,7 @@ static int alloc_pebs_buffer(int cpu) per_cpu(insn_buffer, cpu) = ibuffer; } - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; ds->pebs_buffer_base = (u64)(unsigned long)buffer; ds->pebs_index = ds->pebs_buffer_base; @@ -1319,6 +1328,7 @@ void __init intel_ds_init(void) x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; int format = x86_pmu.intel_cap.pebs_format; @@ -1327,6 +1337,14 @@ void __init intel_ds_init(void) case 0: printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); + /* + * Using >PAGE_SIZE buffers makes the WRMSR to + * PERF_GLOBAL_CTRL in intel_pmu_enable_all() + * mysteriously hang on Core2. + * + * As a workaround, we don't do this. + */ + x86_pmu.pebs_buffer_size = PAGE_SIZE; x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 33acb884c..4547b2cca 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -2875,11 +2875,13 @@ static struct intel_uncore_type bdx_uncore_sbox = { .format_group = &hswep_uncore_sbox_format_group, }; +#define BDX_MSR_UNCORE_SBOX 3 + static struct intel_uncore_type *bdx_msr_uncores[] = { &bdx_uncore_ubox, &bdx_uncore_cbox, - &bdx_uncore_sbox, &hswep_uncore_pcu, + &bdx_uncore_sbox, NULL, }; @@ -2888,6 +2890,10 @@ void bdx_uncore_cpu_init(void) if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; + + /* BDX-DE doesn't have SBOX */ + if (boot_cpu_data.x86_model == 86) + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; } static struct intel_uncore_type bdx_uncore_ha = { diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 5b0c232d1..b931095e8 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -263,7 +263,9 @@ again: goto again; done: - knc_pmu_enable_all(0); + /* Only restore PMU state when it's active. See x86_pmu_disable(). */ + if (cpuc->enabled) + knc_pmu_enable_all(0); return handled; } diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 37dae792d..435466fbd 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -28,8 +28,18 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; +#ifdef CONFIG_SCHED_BFS_AUTOISO + if (turn_on) { + struct sched_param param = { .sched_priority = 0 }; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + /* Start X as SCHED_ISO */ + sched_setscheduler_nocheck(current, SCHED_ISO, ¶m); + } +#else if (turn_on && !capable(CAP_SYS_RAWIO)) return -EPERM; +#endif /* * If it's the first ioperm() call in this thread's lifetime, set the @@ -96,18 +106,31 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) SYSCALL_DEFINE1(iopl, unsigned int, level) { struct pt_regs *regs = current_pt_regs(); - unsigned int old = (regs->flags >> 12) & 3; struct thread_struct *t = ¤t->thread; + /* + * Careful: the IOPL bits in regs->flags are undefined under Xen PV + * and changing them has no effect. + */ + unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT; + if (level > 3) return -EINVAL; /* Trying to gain more privileges? */ if (level > old) { +#ifdef CONFIG_SCHED_BFS_AUTOISO + struct sched_param param = { .sched_priority = 0 }; +#endif if (!capable(CAP_SYS_RAWIO)) return -EPERM; +#ifdef CONFIG_SCHED_BFS_AUTOISO + /* Start X as SCHED_ISO */ + sched_setscheduler_nocheck(current, SCHED_ISO, ¶m); +#endif } - regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); - t->iopl = level << 12; + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | + (level << X86_EFLAGS_IOPL_BIT); + t->iopl = level << X86_EFLAGS_IOPL_BIT; set_iopl_mask(t->iopl); return 0; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b9d99e0f8..9f7518760 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -48,6 +48,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -411,6 +412,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); +#ifdef CONFIG_XEN + /* + * On Xen PV, IOPL bits in pt_regs->flags have no effect, and + * current_pt_regs()->flags may not match the current task's + * intended IOPL. We need to switch it manually. + */ + if (unlikely(static_cpu_has(X86_FEATURE_XENPV) && + prev->iopl != next->iopl)) + xen_set_iopl_mask(next->iopl); +#endif + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index b0ea42b78..ab5318727 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -245,7 +245,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) * PIC is being reset. Handle it gracefully here */ atomic_inc(&ps->pending); - else if (value > 0) + else if (value > 0 && ps->reinject) /* in this case, we had multiple outstanding pit interrupts * that we needed to inject. Reinject */ @@ -288,7 +288,9 @@ static void pit_do_work(struct kthread_work *work) * last one has been acked. */ spin_lock(&ps->inject_lock); - if (ps->irq_ack) { + if (!ps->reinject) + inject = 1; + else if (ps->irq_ack) { ps->irq_ack = 0; inject = 1; } @@ -317,10 +319,10 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); struct kvm_pit *pt = ps->kvm->arch.vpit; - if (ps->reinject || !atomic_read(&ps->pending)) { + if (ps->reinject) atomic_inc(&ps->pending); - queue_kthread_work(&pt->worker, &pt->expired); - } + + queue_kthread_work(&pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9bd8f44ba..539062e24 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2702,8 +2702,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + /* + * Old versions of KVM use the single-context version without + * checking for support, so declare that it is supported even + * though it is treated as global context. The alternative is + * not failing the single-context invvpid, and it is worse. + */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -7398,6 +7405,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } @@ -7456,6 +7464,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } @@ -7472,12 +7481,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_SINGLE_CONTEXT: + /* + * Old versions of KVM use the single-context version so we + * have to support it; just treat it the same as all-context. + */ case VMX_VPID_EXTENT_ALL_CONTEXT: __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; default: - /* Trap single context invalidation invvpid calls */ + /* Trap individual address invalidation invvpid calls */ BUG_ON(1); break; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eaf6ee8c2..d47d231e0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2752,6 +2752,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 8f4cc3dfa..5fb6adaaa 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -106,8 +106,6 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; - if (!f->flush_end) - f->flush_end = f->flush_start + PAGE_SIZE; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { @@ -135,12 +133,20 @@ void native_flush_tlb_others(const struct cpumask *cpumask, unsigned long end) { struct flush_tlb_info info; + + if (end == 0) + end = start + PAGE_SIZE; info.flush_mm = mm; info.flush_start = start; info.flush_end = end; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); - trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start); + if (end == TLB_FLUSH_ALL) + trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); + else + trace_tlb_flush(TLB_REMOTE_SEND_IPI, + (end - start) >> PAGE_SHIFT); + if (is_uv_system()) { unsigned int cpu; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index e58565556..0ae7e9fa3 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -540,3 +540,10 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); + +static void pci_bdwep_bar(struct pci_dev *dev) +{ + dev->non_compliant_bars = 1; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d09e4c9d7..e3679db17 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -961,7 +961,7 @@ static void xen_load_sp0(struct tss_struct *tss, tss->x86_tss.sp0 = thread->sp0; } -static void xen_set_iopl_mask(unsigned mask) +void xen_set_iopl_mask(unsigned mask) { struct physdev_set_iopl set_iopl; -- cgit v1.2.3-54-g00ecf