summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig45
-rw-r--r--arch/x86/entry/common.c23
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h1
-rw-r--r--arch/x86/include/asm/microcode.h26
-rw-r--r--arch/x86/include/asm/perf_event.h1
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h2
-rw-r--r--arch/x86/kernel/apic/vector.c88
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c38
-rw-r--r--arch/x86/kernel/cpu/perf_event.c13
-rw-r--r--arch/x86/kernel/cpu/perf_event.h3
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c27
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c24
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c8
-rw-r--r--arch/x86/kernel/cpu/perf_event_knc.c4
-rw-r--r--arch/x86/kernel/ioport.c29
-rw-r--r--arch/x86/kernel/process_64.c12
-rw-r--r--arch/x86/kvm/i8254.c12
-rw-r--r--arch/x86/kvm/vmx.c16
-rw-r--r--arch/x86/kvm/x86.c1
-rw-r--r--arch/x86/mm/tlb.c12
-rw-r--r--arch/x86/pci/fixup.c7
-rw-r--r--arch/x86/xen/enlighten.c2
23 files changed, 315 insertions, 81 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c46662f64..b971ea4be 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -911,10 +911,26 @@ config SCHED_SMT
depends on SMP
---help---
SMT scheduler support improves the CPU scheduler's decision making
- when dealing with Intel Pentium 4 chips with HyperThreading at a
+ when dealing with Intel P4/Core 2 chips with HyperThreading at a
cost of slightly increased overhead in some places. If unsure say
N here.
+config SMT_NICE
+ bool "SMT (Hyperthreading) aware nice priority and policy support"
+ depends on SCHED_BFS && SCHED_SMT
+ default y
+ ---help---
+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness
+ of the use of 'nice' levels and different scheduling policies
+ (e.g. realtime) due to sharing of CPU power between hyperthreads.
+ SMT nice support makes each logical CPU aware of what is running on
+ its hyperthread siblings, maintaining appropriate distribution of
+ CPU according to nice levels and scheduling policies at the expense
+ of slightly increased overhead.
+
+ If unsure say Y here.
+
+
config SCHED_MC
def_bool y
prompt "Multi-core scheduler support"
@@ -1160,22 +1176,23 @@ config MICROCODE
bool "CPU microcode loading support"
default y
depends on CPU_SUP_AMD || CPU_SUP_INTEL
- depends on BLK_DEV_INITRD
select FW_LOADER
---help---
-
If you say Y here, you will be able to update the microcode on
- certain Intel and AMD processors. The Intel support is for the
- IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4,
- Xeon etc. The AMD support is for families 0x10 and later. You will
- obviously need the actual microcode binary data itself which is not
- shipped with the Linux kernel.
+ Intel and AMD processors. The Intel support is for the IA32 family,
+ e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The
+ AMD support is for families 0x10 and later. You will obviously need
+ the actual microcode binary data itself which is not shipped with
+ the Linux kernel.
- This option selects the general module only, you need to select
- at least one vendor specific module as well.
+ The preferred method to load microcode from a detached initrd is described
+ in Documentation/x86/early-microcode.txt. For that you need to enable
+ CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
+ initrd for microcode blobs.
- To compile this driver as a module, choose M here: the module
- will be called microcode.
+ In addition, you can build-in the microcode into the kernel. For that you
+ need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode
+ to the CONFIG_EXTRA_FIRMWARE config option.
config MICROCODE_INTEL
bool "Intel microcode loading support"
@@ -1995,7 +2012,7 @@ config HOTPLUG_CPU
config BOOTPARAM_HOTPLUG_CPU0
bool "Set default setting of cpu0_hotpluggable"
default n
- depends on HOTPLUG_CPU
+ depends on HOTPLUG_CPU && !SCHED_BFS
---help---
Set whether default state of cpu0_hotpluggable is on or off.
@@ -2024,7 +2041,7 @@ config BOOTPARAM_HOTPLUG_CPU0
config DEBUG_HOTPLUG_CPU0
def_bool n
prompt "Debug CPU0 hotplug"
- depends on HOTPLUG_CPU
+ depends on HOTPLUG_CPU && !SCHED_BFS
---help---
Enabling this option offlines CPU0 (if CPU0 can be offlined) as
soon as possible and boots up userspace with CPU0 offlined. User
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 03663740c..1a4477ced 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -268,6 +268,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
/* Called with IRQs disabled. */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
+ struct thread_info *ti = pt_regs_to_thread_info(regs);
u32 cached_flags;
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -275,12 +276,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
lockdep_sys_exit();
- cached_flags =
- READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+ cached_flags = READ_ONCE(ti->flags);
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
exit_to_usermode_loop(regs, cached_flags);
+#ifdef CONFIG_COMPAT
+ /*
+ * Compat syscalls set TS_COMPAT. Make sure we clear it before
+ * returning to user mode. We need to clear it *after* signal
+ * handling, because syscall restart has a fixup for compat
+ * syscalls. The fixup is exercised by the ptrace_syscall_32
+ * selftest.
+ */
+ ti->status &= ~TS_COMPAT;
+#endif
+
user_enter();
}
@@ -332,14 +343,6 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
syscall_slow_exit_work(regs, cached_flags);
-#ifdef CONFIG_COMPAT
- /*
- * Compat syscalls set TS_COMPAT. Make sure we clear it before
- * returning to user mode.
- */
- ti->status &= ~TS_COMPAT;
-#endif
-
local_irq_disable();
prepare_exit_to_usermode(regs);
}
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index c80f6b6f3..e8c4fba52 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -644,8 +644,8 @@ static inline void entering_irq(void)
static inline void entering_ack_irq(void)
{
- ack_APIC_irq();
entering_irq();
+ ack_APIC_irq();
}
static inline void ipi_entering_ack_irq(void)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1815b7362..84b3d194a 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -141,6 +141,7 @@ struct irq_alloc_info {
struct irq_cfg {
unsigned int dest_apicid;
u8 vector;
+ u8 old_vector;
};
extern struct irq_cfg *irq_cfg(unsigned int irq);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 1e1b07a5a..9d3a96c4d 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -3,6 +3,7 @@
#include <asm/cpu.h>
#include <linux/earlycpio.h>
+#include <linux/initrd.h>
#define native_rdmsr(msr, val1, val2) \
do { \
@@ -143,4 +144,29 @@ static inline void reload_early_microcode(void) { }
static inline bool
get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; }
#endif
+
+static inline unsigned long get_initrd_start(void)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ return initrd_start;
+#else
+ return 0;
+#endif
+}
+
+static inline unsigned long get_initrd_start_addr(void)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+#ifdef CONFIG_X86_32
+ unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
+
+ return (unsigned long)__pa_nodebug(*initrd_start_p);
+#else
+ return get_initrd_start();
+#endif
+#else /* CONFIG_BLK_DEV_INITRD */
+ return 0;
+#endif
+}
+
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 7bcb861a0..5a2ed3ed2 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -165,6 +165,7 @@ struct x86_pmu_capability {
#define GLOBAL_STATUS_ASIF BIT_ULL(60)
#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59)
#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58)
+#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55)
/*
* IBS cpuid feature detection
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 8b2d4bea9..39171b364 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -62,4 +62,6 @@ void xen_arch_register_cpu(int num);
void xen_arch_unregister_cpu(int num);
#endif
+extern void xen_set_iopl_mask(unsigned mask);
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3b670df4b..ad59d70bc 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -213,6 +213,7 @@ update:
*/
cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
d->move_in_progress = !cpumask_empty(d->old_domain);
+ d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0;
d->cfg.vector = vector;
cpumask_copy(d->domain, vector_cpumask);
success:
@@ -655,46 +656,97 @@ void irq_complete_move(struct irq_cfg *cfg)
}
/*
- * Called with @desc->lock held and interrupts disabled.
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
*/
void irq_force_complete_move(struct irq_desc *desc)
{
struct irq_data *irqdata = irq_desc_get_irq_data(desc);
struct apic_chip_data *data = apic_chip_data(irqdata);
struct irq_cfg *cfg = data ? &data->cfg : NULL;
+ unsigned int cpu;
if (!cfg)
return;
- __irq_complete_move(cfg, cfg->vector);
-
/*
* This is tricky. If the cleanup of @data->old_domain has not been
* done yet, then the following setaffinity call will fail with
* -EBUSY. This can leave the interrupt in a stale state.
*
- * The cleanup cannot make progress because we hold @desc->lock. So in
- * case @data->old_domain is not yet cleaned up, we need to drop the
- * lock and acquire it again. @desc cannot go away, because the
- * hotplug code holds the sparse irq lock.
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
*/
raw_spin_lock(&vector_lock);
- /* Clean out all offline cpus (including ourself) first. */
+ /*
+ * Clean out all offline cpus (including the outgoing one) from the
+ * old_domain mask.
+ */
cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
- while (!cpumask_empty(data->old_domain)) {
+
+ /*
+ * If move_in_progress is cleared and the old_domain mask is empty,
+ * then there is nothing to cleanup. fixup_irqs() will take care of
+ * the stale vectors on the outgoing cpu.
+ */
+ if (!data->move_in_progress && cpumask_empty(data->old_domain)) {
raw_spin_unlock(&vector_lock);
- raw_spin_unlock(&desc->lock);
- cpu_relax();
- raw_spin_lock(&desc->lock);
+ return;
+ }
+
+ /*
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (data->move_in_progress) {
/*
- * Reevaluate apic_chip_data. It might have been cleared after
- * we dropped @desc->lock.
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non issue
+ * because if the affinity update happens right before all
+ * cpus rendevouz in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loops first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theroretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
*/
- data = apic_chip_data(irqdata);
- if (!data)
- return;
- raw_spin_lock(&vector_lock);
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqdata->irq, cfg->old_vector);
}
+ /*
+ * If old_domain is not empty, then other cpus still have the irq
+ * descriptor set in their vector array. Clean it up.
+ */
+ for_each_cpu(cpu, data->old_domain)
+ per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;
+
+ /* Cleanup the left overs of the (half finished) move */
+ cpumask_clear(data->old_domain);
+ data->move_in_progress = 0;
raw_spin_unlock(&vector_lock);
}
#endif
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index de8a164a6..f9c7f3302 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -551,10 +551,14 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
cd.data = NULL;
cd.size = 0;
- cd = find_cpio_data(p, (void *)start, size, &offset);
- if (!cd.data) {
+ /* try built-in microcode if no initrd */
+ if (!size) {
if (!load_builtin_intel_microcode(&cd))
return UCODE_ERROR;
+ } else {
+ cd = find_cpio_data(p, (void *)start, size, &offset);
+ if (!cd.data)
+ return UCODE_ERROR;
}
return get_matching_model_microcode(0, start, cd.data, cd.size,
@@ -690,7 +694,7 @@ int __init save_microcode_in_initrd_intel(void)
if (count == 0)
return ret;
- copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count);
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
@@ -728,16 +732,20 @@ void __init load_ucode_intel_bsp(void)
struct boot_params *p;
p = (struct boot_params *)__pa_nodebug(&boot_params);
- start = p->hdr.ramdisk_image;
size = p->hdr.ramdisk_size;
- _load_ucode_intel_bsp(
- (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
- start, size);
+ /*
+ * Set start only if we have an initrd image. We cannot use initrd_start
+ * because it is not set that early yet.
+ */
+ start = (size ? p->hdr.ramdisk_image : 0);
+
+ _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+ start, size);
#else
- start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
size = boot_params.hdr.ramdisk_size;
+ start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0);
_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
#endif
@@ -748,20 +756,14 @@ void load_ucode_intel_ap(void)
struct mc_saved_data *mc_saved_data_p;
struct ucode_cpu_info uci;
unsigned long *mc_saved_in_initrd_p;
- unsigned long initrd_start_addr;
enum ucode_state ret;
#ifdef CONFIG_X86_32
- unsigned long *initrd_start_p;
- mc_saved_in_initrd_p =
- (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
+ mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
- initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
#else
- mc_saved_data_p = &mc_saved_data;
mc_saved_in_initrd_p = mc_saved_in_initrd;
- initrd_start_addr = initrd_start;
+ mc_saved_data_p = &mc_saved_data;
#endif
/*
@@ -773,7 +775,7 @@ void load_ucode_intel_ap(void)
collect_cpu_info_early(&uci);
ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
- initrd_start_addr, &uci);
+ get_initrd_start_addr(), &uci);
if (ret != UCODE_OK)
return;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1b443db2d..6532f5b40 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -596,6 +596,19 @@ void x86_pmu_disable_all(void)
}
}
+/*
+ * There may be PMI landing after enabled=0. The PMI hitting could be before or
+ * after disable_all.
+ *
+ * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
+ * It will not be re-enabled in the NMI handler again, because enabled=0. After
+ * handling the NMI, disable_all will be called, which will not change the
+ * state either. If PMI hits after disable_all, the PMU is already disabled
+ * before entering NMI handler. The NMI handler will not change the state
+ * either.
+ *
+ * So either situation is harmless.
+ */
static void x86_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7bb61e32f..98be6d6d3 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -586,6 +586,7 @@ struct x86_pmu {
pebs_broken :1,
pebs_prec_dist :1;
int pebs_record_size;
+ int pebs_buffer_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
@@ -904,6 +905,8 @@ void intel_pmu_lbr_init_skl(void);
void intel_pmu_lbr_init_knl(void);
+void intel_pmu_pebs_data_source_nhm(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index fed2ab1f1..760aec1e8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1502,7 +1502,15 @@ static __initconst const u64 knl_hw_cache_extra_regs
};
/*
- * Use from PMIs where the LBRs are already disabled.
+ * Used from PMIs where the LBRs are already disabled.
+ *
+ * This function could be called consecutively. It is required to remain in
+ * disabled state if called consecutively.
+ *
+ * During consecutive calls, the same disable value will be written to related
+ * registers, so the PMU state remains unchanged. hw.state in
+ * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive
+ * calls.
*/
static void __intel_pmu_disable_all(void)
{
@@ -1884,6 +1892,16 @@ again:
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
handled++;
x86_pmu.drain_pebs(regs);
+ /*
+ * There are cases where, even though, the PEBS ovfl bit is set
+ * in GLOBAL_OVF_STATUS, the PEBS events may also have their
+ * overflow bits set for their counters. We must clear them
+ * here because they have been processed as exact samples in
+ * the drain_pebs() routine. They must not be processed again
+ * in the for_each_bit_set() loop for regular samples below.
+ */
+ status &= ~cpuc->pebs_enabled;
+ status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
}
/*
@@ -1929,7 +1947,10 @@ again:
goto again;
done:
- __intel_pmu_enable_all(0, true);
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ if (cpuc->enabled)
+ __intel_pmu_enable_all(0, true);
+
/*
* Only unmask the NMI after the overflow counters
* have been reset. This avoids spurious NMIs on
@@ -3396,6 +3417,7 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+ intel_pmu_pebs_data_source_nhm();
x86_add_quirk(intel_nehalem_quirk);
pr_cont("Nehalem events, ");
@@ -3459,6 +3481,7 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+ intel_pmu_pebs_data_source_nhm();
pr_cont("Westmere events, ");
break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 10602f0a4..955140140 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -51,7 +51,8 @@ union intel_x86_pebs_dse {
#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
-static const u64 pebs_data_source[] = {
+/* Version for Sandy Bridge and later */
+static u64 pebs_data_source[] = {
P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
@@ -70,6 +71,14 @@ static const u64 pebs_data_source[] = {
OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
};
+/* Patch up minor differences in the bits */
+void __init intel_pmu_pebs_data_source_nhm(void)
+{
+ pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT);
+ pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+ pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+}
+
static u64 precise_store_data(u64 status)
{
union intel_x86_pebs_dse dse;
@@ -269,7 +278,7 @@ static int alloc_pebs_buffer(int cpu)
if (!x86_pmu.pebs)
return 0;
- buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
+ buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -286,7 +295,7 @@ static int alloc_pebs_buffer(int cpu)
per_cpu(insn_buffer, cpu) = ibuffer;
}
- max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+ max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
ds->pebs_buffer_base = (u64)(unsigned long)buffer;
ds->pebs_index = ds->pebs_buffer_base;
@@ -1319,6 +1328,7 @@ void __init intel_ds_init(void)
x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+ x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
if (x86_pmu.pebs) {
char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
int format = x86_pmu.intel_cap.pebs_format;
@@ -1327,6 +1337,14 @@ void __init intel_ds_init(void)
case 0:
printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+ /*
+ * Using >PAGE_SIZE buffers makes the WRMSR to
+ * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
+ * mysteriously hang on Core2.
+ *
+ * As a workaround, we don't do this.
+ */
+ x86_pmu.pebs_buffer_size = PAGE_SIZE;
x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 33acb884c..4547b2cca 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -2875,11 +2875,13 @@ static struct intel_uncore_type bdx_uncore_sbox = {
.format_group = &hswep_uncore_sbox_format_group,
};
+#define BDX_MSR_UNCORE_SBOX 3
+
static struct intel_uncore_type *bdx_msr_uncores[] = {
&bdx_uncore_ubox,
&bdx_uncore_cbox,
- &bdx_uncore_sbox,
&hswep_uncore_pcu,
+ &bdx_uncore_sbox,
NULL,
};
@@ -2888,6 +2890,10 @@ void bdx_uncore_cpu_init(void)
if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
uncore_msr_uncores = bdx_msr_uncores;
+
+ /* BDX-DE doesn't have SBOX */
+ if (boot_cpu_data.x86_model == 86)
+ uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
}
static struct intel_uncore_type bdx_uncore_ha = {
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
index 5b0c232d1..b931095e8 100644
--- a/arch/x86/kernel/cpu/perf_event_knc.c
+++ b/arch/x86/kernel/cpu/perf_event_knc.c
@@ -263,7 +263,9 @@ again:
goto again;
done:
- knc_pmu_enable_all(0);
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ if (cpuc->enabled)
+ knc_pmu_enable_all(0);
return handled;
}
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 37dae792d..435466fbd 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -28,8 +28,18 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
return -EINVAL;
+#ifdef CONFIG_SCHED_BFS_AUTOISO
+ if (turn_on) {
+ struct sched_param param = { .sched_priority = 0 };
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ /* Start X as SCHED_ISO */
+ sched_setscheduler_nocheck(current, SCHED_ISO, &param);
+ }
+#else
if (turn_on && !capable(CAP_SYS_RAWIO))
return -EPERM;
+#endif
/*
* If it's the first ioperm() call in this thread's lifetime, set the
@@ -96,18 +106,31 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
SYSCALL_DEFINE1(iopl, unsigned int, level)
{
struct pt_regs *regs = current_pt_regs();
- unsigned int old = (regs->flags >> 12) & 3;
struct thread_struct *t = &current->thread;
+ /*
+ * Careful: the IOPL bits in regs->flags are undefined under Xen PV
+ * and changing them has no effect.
+ */
+ unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+
if (level > 3)
return -EINVAL;
/* Trying to gain more privileges? */
if (level > old) {
+#ifdef CONFIG_SCHED_BFS_AUTOISO
+ struct sched_param param = { .sched_priority = 0 };
+#endif
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
+#ifdef CONFIG_SCHED_BFS_AUTOISO
+ /* Start X as SCHED_ISO */
+ sched_setscheduler_nocheck(current, SCHED_ISO, &param);
+#endif
}
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
- t->iopl = level << 12;
+ regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
+ (level << X86_EFLAGS_IOPL_BIT);
+ t->iopl = level << X86_EFLAGS_IOPL_BIT;
set_iopl_mask(t->iopl);
return 0;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b9d99e0f8..9f7518760 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -48,6 +48,7 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
+#include <asm/xen/hypervisor.h>
asmlinkage extern void ret_from_fork(void);
@@ -411,6 +412,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
+#ifdef CONFIG_XEN
+ /*
+ * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
+ * current_pt_regs()->flags may not match the current task's
+ * intended IOPL. We need to switch it manually.
+ */
+ if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
+ prev->iopl != next->iopl))
+ xen_set_iopl_mask(next->iopl);
+#endif
+
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
* AMD CPUs have a misfeature: SYSRET sets the SS selector but
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index b0ea42b78..ab5318727 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -245,7 +245,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
* PIC is being reset. Handle it gracefully here
*/
atomic_inc(&ps->pending);
- else if (value > 0)
+ else if (value > 0 && ps->reinject)
/* in this case, we had multiple outstanding pit interrupts
* that we needed to inject. Reinject
*/
@@ -288,7 +288,9 @@ static void pit_do_work(struct kthread_work *work)
* last one has been acked.
*/
spin_lock(&ps->inject_lock);
- if (ps->irq_ack) {
+ if (!ps->reinject)
+ inject = 1;
+ else if (ps->irq_ack) {
ps->irq_ack = 0;
inject = 1;
}
@@ -317,10 +319,10 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
struct kvm_pit *pt = ps->kvm->arch.vpit;
- if (ps->reinject || !atomic_read(&ps->pending)) {
+ if (ps->reinject)
atomic_inc(&ps->pending);
- queue_kthread_work(&pt->worker, &pt->expired);
- }
+
+ queue_kthread_work(&pt->worker, &pt->expired);
if (ps->is_periodic) {
hrtimer_add_expires_ns(&ps->timer, ps->period);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9bd8f44ba..539062e24 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2702,8 +2702,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
+ /*
+ * Old versions of KVM use the single-context version without
+ * checking for support, so declare that it is supported even
+ * though it is treated as global context. The alternative is
+ * not failing the single-context invvpid, and it is worse.
+ */
if (enable_vpid)
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
else
vmx->nested.nested_vmx_vpid_caps = 0;
@@ -7398,6 +7405,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7456,6 +7464,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7472,12 +7481,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
}
switch (type) {
+ case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+ /*
+ * Old versions of KVM use the single-context version so we
+ * have to support it; just treat it the same as all-context.
+ */
case VMX_VPID_EXTENT_ALL_CONTEXT:
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
nested_vmx_succeed(vcpu);
break;
default:
- /* Trap single context invalidation invvpid calls */
+ /* Trap individual address invalidation invvpid calls */
BUG_ON(1);
break;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eaf6ee8c2..d47d231e0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2752,6 +2752,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8f4cc3dfa..5fb6adaaa 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -106,8 +106,6 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
- if (!f->flush_end)
- f->flush_end = f->flush_start + PAGE_SIZE;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
@@ -135,12 +133,20 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
unsigned long end)
{
struct flush_tlb_info info;
+
+ if (end == 0)
+ end = start + PAGE_SIZE;
info.flush_mm = mm;
info.flush_start = start;
info.flush_end = end;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
- trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
+ if (end == TLB_FLUSH_ALL)
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
+ else
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI,
+ (end - start) >> PAGE_SHIFT);
+
if (is_uv_system()) {
unsigned int cpu;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index e58565556..0ae7e9fa3 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -540,3 +540,10 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev)
}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone);
+
+static void pci_bdwep_bar(struct pci_dev *dev)
+{
+ dev->non_compliant_bars = 1;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d09e4c9d7..e3679db17 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -961,7 +961,7 @@ static void xen_load_sp0(struct tss_struct *tss,
tss->x86_tss.sp0 = thread->sp0;
}
-static void xen_set_iopl_mask(unsigned mask)
+void xen_set_iopl_mask(unsigned mask)
{
struct physdev_set_iopl set_iopl;