From 8d91c1e411f55d7ea91b1183a2e9f8088fb4d5be Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Tue, 15 Dec 2015 14:52:16 -0300 Subject: Linux-libre 4.3.2-gnu --- arch/x86/kernel/cpu/Makefile | 2 + arch/x86/kernel/cpu/amd.c | 10 +- arch/x86/kernel/cpu/common.c | 31 ++- arch/x86/kernel/cpu/cpu.h | 1 + arch/x86/kernel/cpu/intel.c | 47 +++- arch/x86/kernel/cpu/intel_pt.h | 39 +-- arch/x86/kernel/cpu/mcheck/Makefile | 2 +- arch/x86/kernel/cpu/mcheck/mce-apei.c | 1 - arch/x86/kernel/cpu/mcheck/mce-genpool.c | 99 +++++++ arch/x86/kernel/cpu/mcheck/mce-internal.h | 14 + arch/x86/kernel/cpu/mcheck/mce.c | 244 ++++++++--------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 20 +- arch/x86/kernel/cpu/mcheck/p5.c | 5 +- arch/x86/kernel/cpu/mcheck/winchip.c | 4 +- arch/x86/kernel/cpu/microcode/core.c | 7 +- arch/x86/kernel/cpu/microcode/intel_early.c | 2 +- arch/x86/kernel/cpu/mshyperv.c | 52 +++- arch/x86/kernel/cpu/mtrr/main.c | 2 - arch/x86/kernel/cpu/perf_event.c | 8 +- arch/x86/kernel/cpu/perf_event.h | 26 +- arch/x86/kernel/cpu/perf_event_intel.c | 304 +++++++++++++++++++-- arch/x86/kernel/cpu/perf_event_intel_bts.c | 4 +- arch/x86/kernel/cpu/perf_event_intel_ds.c | 106 +++++-- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 58 +++- arch/x86/kernel/cpu/perf_event_intel_pt.c | 85 +++++- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 20 ++ arch/x86/kernel/cpu/perf_event_intel_uncore.c | 11 + arch/x86/kernel/cpu/perf_event_intel_uncore.h | 2 + arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 23 +- .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 166 ++++++++++- arch/x86/kernel/cpu/perf_event_msr.c | 242 ++++++++++++++++ arch/x86/kernel/cpu/scattered.c | 2 +- 32 files changed, 1361 insertions(+), 278 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce-genpool.c create mode 100644 arch/x86/kernel/cpu/perf_event_msr.c (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9bff68798..4eb065c6b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ perf_event_intel_uncore_snbep.o \ perf_event_intel_uncore_nhmex.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dd3a4baff..4a70fc6d4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -11,6 +11,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -114,7 +115,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) const int K6_BUG_LOOP = 1000000; int n; void (*f_vide)(void); - unsigned long d, d2; + u64 d, d2; printk(KERN_INFO "AMD K6 stepping B detected - "); @@ -125,10 +126,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c) n = K6_BUG_LOOP; f_vide = vide; - rdtscl(d); + d = rdtsc(); while (n--) f_vide(); - rdtscl(d2); + d2 = rdtsc(); d = d2-d; if (d > 20*K6_BUG_LOOP) @@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) /* A random value per boot for bit slice [12:upper_bit) */ va_align.bits = get_random_int() & va_align.mask; } + + if (cpu_has(c, X86_FEATURE_MWAITX)) + use_mwaitx_delay(); } static void early_init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb9e5df42..1a292573d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -272,10 +273,9 @@ __setup("nosmap", setup_disable_smap); static __always_inline void setup_smap(struct cpuinfo_x86 *c) { - unsigned long eflags; + unsigned long eflags = native_save_fl(); /* This should have been cleared long ago */ - raw_local_save_flags(eflags); BUG_ON(eflags & X86_EFLAGS_AC); if (cpu_has(c, X86_FEATURE_SMAP)) { @@ -1109,10 +1109,10 @@ void print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "%d86", c->x86); - printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); + printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model); if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); + printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask); else printk(KERN_CONT ")\n"); @@ -1185,10 +1185,10 @@ void syscall_init(void) * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. @@ -1199,7 +1199,7 @@ void syscall_init(void) wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else - wrmsrl(MSR_CSTAR, ignore_sysret); + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); @@ -1488,3 +1488,20 @@ inline bool __static_cpu_has_safe(u16 bit) return boot_cpu_has(bit); } EXPORT_SYMBOL_GPL(__static_cpu_has_safe); + +static void bsp_resume(void) +{ + if (this_cpu->c_bsp_resume) + this_cpu->c_bsp_resume(&boot_cpu_data); +} + +static struct syscore_ops cpu_syscore_ops = { + .resume = bsp_resume, +}; + +static int __init init_cpu_syscore(void) +{ + register_syscore_ops(&cpu_syscore_ops); + return 0; +} +core_initcall(init_cpu_syscore); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c37dc37e8..2584265d4 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -13,6 +13,7 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); + void (*c_bsp_resume)(struct cpuinfo_x86 *); int c_x86_vendor; #ifdef CONFIG_X86_32 /* Optional vendor specific routine to obtain the cache size. */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 50163fa90..98a13db5f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -371,6 +371,36 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) } } +static void init_intel_energy_perf(struct cpuinfo_x86 *c) +{ + u64 epb; + + /* + * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. + * (x86_energy_perf_policy(8) is available to change it at run-time.) + */ + if (!cpu_has(c, X86_FEATURE_EPB)) + return; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) + return; + + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); + epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); +} + +static void intel_bsp_resume(struct cpuinfo_x86 *c) +{ + /* + * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, + * so reinitialize it properly like during bootup: + */ + init_intel_energy_perf(c); +} + static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -478,21 +508,7 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); - /* - * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. - * x86_energy_perf_policy(8) is available to change it at run-time - */ - if (cpu_has(c, X86_FEATURE_EPB)) { - u64 epb; - - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) { - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); - epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - } - } + init_intel_energy_perf(c); } #ifdef CONFIG_X86_32 @@ -747,6 +763,7 @@ static const struct cpu_dev intel_cpu_dev = { .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, + .c_bsp_resume = intel_bsp_resume, .c_x86_vendor = X86_VENDOR_INTEL, }; diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h index 1c338b0eb..336878a5d 100644 --- a/arch/x86/kernel/cpu/intel_pt.h +++ b/arch/x86/kernel/cpu/intel_pt.h @@ -25,32 +25,11 @@ */ #define TOPA_PMI_MARGIN 512 -/* - * Table of Physical Addresses bits - */ -enum topa_sz { - TOPA_4K = 0, - TOPA_8K, - TOPA_16K, - TOPA_32K, - TOPA_64K, - TOPA_128K, - TOPA_256K, - TOPA_512K, - TOPA_1MB, - TOPA_2MB, - TOPA_4MB, - TOPA_8MB, - TOPA_16MB, - TOPA_32MB, - TOPA_64MB, - TOPA_128MB, - TOPA_SZ_END, -}; +#define TOPA_SHIFT 12 -static inline unsigned int sizes(enum topa_sz tsz) +static inline unsigned int sizes(unsigned int tsz) { - return 1 << (tsz + 12); + return 1 << (tsz + TOPA_SHIFT); }; struct topa_entry { @@ -66,20 +45,26 @@ struct topa_entry { u64 rsvd4 : 16; }; -#define TOPA_SHIFT 12 -#define PT_CPUID_LEAVES 2 +#define PT_CPUID_LEAVES 2 +#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, + PT_CAP_psb_cyc, + PT_CAP_mtc, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, + PT_CAP_single_range_output, PT_CAP_payloads_lip, + PT_CAP_mtc_periods, + PT_CAP_cycle_thresholds, + PT_CAP_psb_periods, }; struct pt_pmu { struct pmu pmu; - u32 caps[4 * PT_CPUID_LEAVES]; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; }; /** diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index bb34b03af..a3311c886 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,4 +1,4 @@ -obj-y = mce.o mce-severity.o +obj-y = mce.o mce-severity.o mce-genpool.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index a1aef9533..34c89a3e8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) m.addr = mem_err->physical_addr; mce_log(&m); - mce_notify_irq(); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c new file mode 100644 index 000000000..0a850100c --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -0,0 +1,99 @@ +/* + * MCE event pool management in MCE context + * + * Copyright (C) 2015 Intel Corp. + * Author: Chen, Gong + * + * This file is licensed under GPLv2. + */ +#include +#include +#include +#include +#include "mce-internal.h" + +/* + * printk() is not safe in MCE context. This is a lock-less memory allocator + * used to save error information organized in a lock-less list. + * + * This memory pool is only to be used to save MCE records in MCE context. + * MCE events are rare, so a fixed size memory pool should be enough. Use + * 2 pages to save MCE events for now (~80 MCE records at most). + */ +#define MCE_POOLSZ (2 * PAGE_SIZE) + +static struct gen_pool *mce_evt_pool; +static LLIST_HEAD(mce_event_llist); +static char gen_pool_buf[MCE_POOLSZ]; + +void mce_gen_pool_process(void) +{ + struct llist_node *head; + struct mce_evt_llist *node; + struct mce *mce; + + head = llist_del_all(&mce_event_llist); + if (!head) + return; + + head = llist_reverse_order(head); + llist_for_each_entry(node, head, llnode) { + mce = &node->mce; + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); + } +} + +bool mce_gen_pool_empty(void) +{ + return llist_empty(&mce_event_llist); +} + +int mce_gen_pool_add(struct mce *mce) +{ + struct mce_evt_llist *node; + + if (!mce_evt_pool) + return -EINVAL; + + node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); + if (!node) { + pr_warn_ratelimited("MCE records pool full!\n"); + return -ENOMEM; + } + + memcpy(&node->mce, mce, sizeof(*mce)); + llist_add(&node->llnode, &mce_event_llist); + + return 0; +} + +static int mce_gen_pool_create(void) +{ + struct gen_pool *tmpp; + int ret = -ENOMEM; + + tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); + if (!tmpp) + goto out; + + ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); + if (ret) { + gen_pool_destroy(tmpp); + goto out; + } + + mce_evt_pool = tmpp; + +out: + return ret; +} + +int mce_gen_pool_init(void) +{ + /* Just init mce_gen_pool once. */ + if (mce_evt_pool) + return 0; + + return mce_gen_pool_create(); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fe32074b8..547720efd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,6 +13,8 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +extern struct atomic_notifier_head x86_mce_decoder_chain; + #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ @@ -24,6 +26,16 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; +struct mce_evt_llist { + struct llist_node llnode; + struct mce mce; +}; + +void mce_gen_pool_process(void); +bool mce_gen_pool_empty(void); +int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_init(void); + extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); @@ -67,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id) return -EINVAL; } #endif + +void mce_inject_log(struct mce *m); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index df919ff10..9d014b82a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -52,11 +52,11 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); -#define rcu_dereference_check_mce(p) \ +#define mce_log_get_idx_check(p) \ ({ \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious rcu_dereference_check_mce() usage"); \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious mce_log_get_idx_check() usage"); \ smp_load_acquire(&(p)); \ }) @@ -110,22 +110,24 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { */ mce_banks_t mce_banks_ce_disabled; -static DEFINE_PER_CPU(struct work_struct, mce_work); +static struct work_struct mce_work; +static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); +static int mce_usable_address(struct mce *m); /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. */ -static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - rdtscll(m->tsc); + m->tsc = rdtsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -157,12 +159,13 @@ void mce_log(struct mce *mce) /* Emit the trace record: */ trace_mce_record(mce); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (!mce_gen_pool_add(mce)) + irq_work_queue(&mce_irq_work); mce->finished = 0; wmb(); for (;;) { - entry = rcu_dereference_check_mce(mcelog.next); + entry = mce_log_get_idx_check(mcelog.next); for (;;) { /* @@ -196,48 +199,23 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } -static void drain_mcelog_buffer(void) +void mce_inject_log(struct mce *m) { - unsigned int next, i, prev = 0; - - next = ACCESS_ONCE(mcelog.next); - - do { - struct mce *m; - - /* drain what was logged during boot */ - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - unsigned retries = 1; - - m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2*retries)) - retries++; - - cpu_relax(); - - if (!m->finished && retries >= 4) { - pr_err("skipping error being logged currently!\n"); - break; - } - } - smp_rmb(); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - } - - memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); + mutex_lock(&mce_chrdev_read_mutex); + mce_log(m); + mutex_unlock(&mce_chrdev_read_mutex); } +EXPORT_SYMBOL_GPL(mce_inject_log); +static struct notifier_block mce_srao_nb; void mce_register_decode_chain(struct notifier_block *nb) { + /* Ensure SRAO notifier has the highest priority in the decode chain. */ + if (nb != &mce_srao_nb && nb->priority == INT_MAX) + nb->priority -= 1; + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); - drain_mcelog_buffer(); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -461,61 +439,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) } } -/* - * Simple lockless ring to communicate PFNs from the exception handler with the - * process context work function. This is vastly simplified because there's - * only a single reader and a single writer. - */ -#define MCE_RING_SIZE 16 /* we use one entry less */ - -struct mce_ring { - unsigned short start; - unsigned short end; - unsigned long ring[MCE_RING_SIZE]; -}; -static DEFINE_PER_CPU(struct mce_ring, mce_ring); - -/* Runs with CPU affinity in workqueue */ -static int mce_ring_empty(void) -{ - struct mce_ring *r = this_cpu_ptr(&mce_ring); - - return r->start == r->end; -} - -static int mce_ring_get(unsigned long *pfn) -{ - struct mce_ring *r; - int ret = 0; - - *pfn = 0; - get_cpu(); - r = this_cpu_ptr(&mce_ring); - if (r->start == r->end) - goto out; - *pfn = r->ring[r->start]; - r->start = (r->start + 1) % MCE_RING_SIZE; - ret = 1; -out: - put_cpu(); - return ret; -} - -/* Always runs in MCE context with preempt off */ -static int mce_ring_add(unsigned long pfn) -{ - struct mce_ring *r = this_cpu_ptr(&mce_ring); - unsigned next; - - next = (r->end + 1) % MCE_RING_SIZE; - if (next == r->start) - return -1; - r->ring[r->end] = pfn; - wmb(); - r->end = next; - return 0; -} - int mce_available(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) @@ -525,12 +448,10 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { - if (!mce_ring_empty()) - schedule_work(this_cpu_ptr(&mce_work)); + if (!mce_gen_pool_empty() && keventd_up()) + schedule_work(&mce_work); } -static DEFINE_PER_CPU(struct irq_work, mce_irq_work); - static void mce_irq_work_cb(struct irq_work *entry) { mce_notify_irq(); @@ -551,8 +472,29 @@ static void mce_report_event(struct pt_regs *regs) return; } - irq_work_queue(this_cpu_ptr(&mce_irq_work)); + irq_work_queue(&mce_irq_work); +} + +static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned long pfn; + + if (!mce) + return NOTIFY_DONE; + + if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) { + pfn = mce->addr >> PAGE_SHIFT; + memory_failure(pfn, MCE_VECTOR, 0); + } + + return NOTIFY_OK; } +static struct notifier_block mce_srao_nb = { + .notifier_call = srao_decode_notifier, + .priority = INT_MAX, +}; /* * Read ADDR and MISC registers. @@ -672,8 +614,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { if (m.status & MCI_STATUS_ADDRV) { - mce_ring_add(m.addr >> PAGE_SHIFT); - mce_schedule_work(); + m.severity = severity; + m.usable_addr = mce_usable_address(&m); + + if (!mce_gen_pool_add(&m)) + mce_schedule_work(); } } @@ -1029,7 +974,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) { struct mca_config *cfg = &mca_cfg; struct mce m, *final; - enum ctx_state prev_state; int i; int worst = 0; int severity; @@ -1055,7 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) int flags = MF_ACTION_REQUIRED; int lmce = 0; - prev_state = ist_enter(regs); + ist_enter(regs); this_cpu_inc(mce_exception_count); @@ -1143,15 +1087,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_read_aux(&m, i); - /* - * Action optional error. Queue address for later processing. - * When the ring overflows we just ignore the AO error. - * RED-PEN add some logging mechanism when - * usable_address or mce_add_ring fails. - * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 - */ - if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) - mce_ring_add(m.addr >> PAGE_SHIFT); + /* assuming valid severity level != 0 */ + m.severity = severity; + m.usable_addr = mce_usable_address(&m); mce_log(&m); @@ -1227,7 +1165,7 @@ out: local_irq_disable(); ist_end_non_atomic(); done: - ist_exit(regs, prev_state); + ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1247,14 +1185,11 @@ int memory_failure(unsigned long pfn, int vector, int flags) /* * Action optional processing happens here (picking up * from the list of faulting pages that do_machine_check() - * placed into the "ring"). + * placed into the genpool). */ static void mce_process_work(struct work_struct *dummy) { - unsigned long pfn; - - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR, 0); + mce_gen_pool_process(); } #ifdef CONFIG_X86_MCE_INTEL @@ -1678,6 +1613,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } +static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_clear(c); + break; + default: + break; + } +} + static void mce_start_timer(unsigned int cpu, struct timer_list *t) { unsigned long iv = check_interval * HZ; @@ -1731,13 +1677,36 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; } + if (mce_gen_pool_init()) { + mca_cfg.disabled = true; + pr_emerg("Couldn't allocate MCE records pool!\n"); + return; + } + machine_check_vector = do_machine_check; __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_timer(); - INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work); - init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb); +} + +/* + * Called for each booted CPU to clear some machine checks opt-ins + */ +void mcheck_cpu_clear(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (!mce_available(c)) + return; + + /* + * Possibly to clear general settings generic to x86 + * __mcheck_cpu_clear_generic(c); + */ + __mcheck_cpu_clear_vendor(c); + } /* @@ -1784,7 +1753,7 @@ static void collect_tscs(void *data) { unsigned long *cpu_tsc = (unsigned long *)data; - rdtscll(cpu_tsc[smp_processor_id()]); + cpu_tsc[smp_processor_id()] = rdtsc(); } static int mce_apei_read_done; @@ -1850,7 +1819,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, goto out; } - next = rcu_dereference_check_mce(mcelog.next); + next = mce_log_get_idx_check(mcelog.next); /* Only supports full reads right now */ err = -EINVAL; @@ -2056,8 +2025,12 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&mce_srao_nb); mcheck_vendor_init_severity(); + INIT_WORK(&mce_work, mce_process_work); + init_irq_work(&mce_irq_work, mce_irq_work_cb); + return 0; } @@ -2591,5 +2564,20 @@ static int __init mcheck_debugfs_init(void) return 0; } -late_initcall(mcheck_debugfs_init); +#else +static int __init mcheck_debugfs_init(void) { return -EINVAL; } #endif + +static int __init mcheck_late_init(void) +{ + mcheck_debugfs_init(); + + /* + * Flush out everything that has been logged during early boot, now that + * everything has been initialized (workqueues, decoders, ...). + */ + mce_schedule_work(); + + return 0; +} +late_initcall(mcheck_late_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index c93c27df9..1e8bb6c94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -251,7 +251,6 @@ static void intel_threshold_interrupt(void) return; machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); - mce_notify_irq(); } /* @@ -440,7 +439,7 @@ static void intel_init_cmci(void) cmci_recheck(); } -void intel_init_lmce(void) +static void intel_init_lmce(void) { u64 val; @@ -453,9 +452,26 @@ void intel_init_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } +static void intel_clear_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + val &= ~MCG_EXT_CTL_LMCE_EN; + wrmsrl(MSR_IA32_MCG_EXT_CTL, val); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); } + +void mce_intel_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 737b0ad4e..12402e10a 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; u32 loaddr, hi, lotype; - prev_state = ist_enter(regs); + ist_enter(regs); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs, prev_state); + ist_exit(regs); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 44f138296..01dd87028 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,12 +15,12 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state = ist_enter(regs); + ist_enter(regs); printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs, prev_state); + ist_exit(regs); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 6236a54a6..9e3f3c7dd 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -377,17 +377,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif) return err; } -static int mc_device_remove(struct device *dev, struct subsys_interface *sif) +static void mc_device_remove(struct device *dev, struct subsys_interface *sif) { int cpu = dev->id; if (!cpu_online(cpu)) - return 0; + return; pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); sysfs_remove_group(&dev->kobj, &mc_attr_group); - return 0; } static struct subsys_interface mc_cpu_interface = { @@ -460,7 +459,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __refdata mc_cpu_notifier = { +static struct notifier_block mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 6b5021fc3..66ed31939 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -390,7 +390,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) } #ifdef DEBUG -static void __ref show_saved_mc(void) +static void show_saved_mc(void) { int i, j; unsigned int sig, pf, rev, total_size, data_size, date; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index aad4bd84b..20e242ea1 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -28,12 +29,15 @@ #include #include #include +#include struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); #if IS_ENABLED(CONFIG_HYPERV) static void (*vmbus_handler)(void); +static void (*hv_kexec_handler)(void); +static void (*hv_crash_handler)(struct pt_regs *regs); void hyperv_vector_handler(struct pt_regs *regs) { @@ -67,7 +71,47 @@ void hv_remove_vmbus_irq(void) } EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); -#endif + +void hv_setup_kexec_handler(void (*handler)(void)) +{ + hv_kexec_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); + +void hv_remove_kexec_handler(void) +{ + hv_kexec_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); + +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) +{ + hv_crash_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_crash_handler); + +void hv_remove_crash_handler(void) +{ + hv_crash_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_crash_handler); + +#ifdef CONFIG_KEXEC_CORE +static void hv_machine_shutdown(void) +{ + if (kexec_in_progress && hv_kexec_handler) + hv_kexec_handler(); + native_machine_shutdown(); +} + +static void hv_machine_crash_shutdown(struct pt_regs *regs) +{ + if (hv_crash_handler) + hv_crash_handler(regs); + native_machine_crash_shutdown(regs); +} +#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) { @@ -114,6 +158,7 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", @@ -141,6 +186,11 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif +#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) + machine_ops.shutdown = hv_machine_shutdown; + machine_ops.crash_shutdown = hv_machine_crash_shutdown; +#endif + mark_tsc_unstable("running on Hyper-V"); } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index e7ed0d8eb..f891b4750 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -448,7 +448,6 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type, return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } -EXPORT_SYMBOL(mtrr_add); /** * mtrr_del_page - delete a memory type region @@ -537,7 +536,6 @@ int mtrr_del(int reg, unsigned long base, unsigned long size) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } -EXPORT_SYMBOL(mtrr_del); /** * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9469dfa55..66dd3fe99 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1551,7 +1551,7 @@ static void __init filter_events(struct attribute **attrs) } /* Merge two pointer arrays */ -static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) { struct attribute **new; int j, i; @@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment) int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; if (idx > LDT_ENTRIES) @@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment) return 0; desc = &ldt->entries[idx]; +#else + return 0; +#endif } else { if (idx > GDT_ENTRIES) return 0; @@ -2200,7 +2204,7 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_IA32_EMULATION #include diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3e7fd27df..165be83a7 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -47,6 +47,7 @@ enum extra_reg_type { EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ EXTRA_REG_LBR = 2, /* lbr_select */ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ + EXTRA_REG_FE = 4, /* fe_* */ EXTRA_REG_MAX /* number of entries needed */ }; @@ -165,7 +166,7 @@ struct intel_excl_cntrs { unsigned core_id; /* per-core: core id */ }; -#define MAX_LBR_ENTRIES 16 +#define MAX_LBR_ENTRIES 32 enum { X86_PERF_KFREE_SHARED = 0, @@ -594,6 +595,7 @@ struct x86_pmu { struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); int max_pebs_events; + unsigned long free_running_flags; /* * Intel LBR @@ -624,6 +626,7 @@ struct x86_pmu { struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; + u64 lbr_info[MAX_LBR_ENTRIES]; int lbr_callstack_users; int lbr_stack_state; }; @@ -793,6 +796,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); +struct attribute **merge_attr(struct attribute **a, struct attribute **b); + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); @@ -808,20 +813,6 @@ static inline int amd_pmu_init(void) #ifdef CONFIG_CPU_SUP_INTEL -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) -{ - /* user explicitly requested branch sampling */ - if (has_branch_stack(event)) - return true; - - /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) - return true; - - return false; -} - static inline bool intel_pmu_has_bts(struct perf_event *event) { if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && @@ -873,6 +864,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[]; extern struct event_constraint intel_hsw_pebs_event_constraints[]; +extern struct event_constraint intel_skl_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_enable(struct perf_event *event); @@ -911,6 +904,8 @@ void intel_pmu_lbr_init_snb(void); void intel_pmu_lbr_init_hsw(void); +void intel_pmu_lbr_init_skl(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); @@ -934,6 +929,7 @@ static inline int is_ht_workaround_enabled(void) { return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); } + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 1b09c420c..f63360be2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +struct event_constraint intel_skl_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + EVENT_CONSTRAINT_END +}; + static struct extra_reg intel_snb_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), @@ -193,6 +201,18 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_skl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + /* + * Note the low 8 bits eventsel code is not a continuous field, containing + * some #GPing bits. These are masked out. + */ + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -235,7 +255,7 @@ struct event_constraint intel_bdw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */ + INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ EVENT_CONSTRAINT_END }; @@ -244,6 +264,200 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts. + * - icache miss does not include decoded icache + */ + +#define SKL_DEMAND_DATA_RD BIT_ULL(0) +#define SKL_DEMAND_RFO BIT_ULL(1) +#define SKL_ANY_RESPONSE BIT_ULL(16) +#define SKL_SUPPLIER_NONE BIT_ULL(17) +#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26) +#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27) +#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28) +#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29) +#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \ + SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) +#define SKL_SPL_HIT BIT_ULL(30) +#define SKL_SNOOP_NONE BIT_ULL(31) +#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32) +#define SKL_SNOOP_MISS BIT_ULL(33) +#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define SKL_SNOOP_HITM BIT_ULL(36) +#define SKL_SNOOP_NON_DRAM BIT_ULL(37) +#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) +#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD +#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SPL_HIT) +#define SKL_DEMAND_WRITE SKL_DEMAND_RFO +#define SKL_LLC_ACCESS SKL_ANY_RESPONSE +#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) + +static __initconst const u64 skl_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 skl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + #define SNB_DMND_DATA_RD (1ULL << 0) #define SNB_DMND_RFO (1ULL << 1) #define SNB_DMND_IFETCH (1ULL << 2) @@ -1114,7 +1328,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1), EVENT_EXTRA_END }; @@ -1594,6 +1808,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: + intel_pmu_lbr_read(); intel_pmu_ack_status(status); if (++loops > 100) { static bool warned = false; @@ -1608,16 +1823,16 @@ again: inc_irq_stat(apic_perf_irqs); - intel_pmu_lbr_read(); /* - * CondChgd bit 63 doesn't mean any overflow status. Ignore - * and clear the bit. + * Ignore a range of extra bits in status that do not indicate + * overflow by themselves. */ - if (__test_and_clear_bit(63, (unsigned long *)&status)) { - if (!status) - goto done; - } + status &= ~(GLOBAL_STATUS_COND_CHG | + GLOBAL_STATUS_ASIF | + GLOBAL_STATUS_LBRS_FROZEN); + if (!status) + goto done; /* * PEBS overflow sets bit 62 in the global status register @@ -1699,18 +1914,22 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static int intel_alt_er(int idx) +static int intel_alt_er(int idx, u64 config) { + int alt_idx; if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) return idx; if (idx == EXTRA_REG_RSP_0) - return EXTRA_REG_RSP_1; + alt_idx = EXTRA_REG_RSP_1; if (idx == EXTRA_REG_RSP_1) - return EXTRA_REG_RSP_0; + alt_idx = EXTRA_REG_RSP_0; + + if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) + return idx; - return idx; + return alt_idx; } static void intel_fixup_er(struct perf_event *event, int idx) @@ -1799,7 +2018,7 @@ again: */ c = NULL; } else { - idx = intel_alt_er(idx); + idx = intel_alt_er(idx, reg->config); if (idx != reg->idx) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; @@ -2256,6 +2475,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event) } } +static unsigned long intel_pmu_free_running_flags(struct perf_event *event) +{ + unsigned long flags = x86_pmu.free_running_flags; + + if (event->attr.use_clockid) + flags &= ~PERF_SAMPLE_TIME; + return flags; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -2266,7 +2494,8 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip) { if (!event->attr.freq) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; - if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) + if (!(event->attr.sample_type & + ~intel_pmu_free_running_flags(event))) event->hw.flags |= PERF_X86_EVENT_FREERUNNING; } if (x86_pmu.pebs_aliases) @@ -2667,6 +2896,8 @@ PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); +PMU_FORMAT_ATTR(frontend, "config1:0-23"); + static struct attribute *intel_arch3_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -2683,6 +2914,11 @@ static struct attribute *intel_arch3_formats_attr[] = { NULL, }; +static struct attribute *skl_format_attr[] = { + &format_attr_frontend.attr, + NULL, +}; + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -2697,6 +2933,8 @@ static __initconst const struct x86_pmu core_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, + /* * Intel PMCs cannot be accessed sanely above 32-bit width, * so we install an artificial 1<<31 period regardless of @@ -2735,6 +2973,7 @@ static __initconst const struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -3272,6 +3511,30 @@ __init int intel_pmu_init(void) pr_cont("Broadwell events, "); break; + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_skl_event_constraints; + x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; + x86_pmu.extra_regs = intel_skl_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + skl_format_attr); + WARN_ON(!x86_pmu.format_attrs); + x86_pmu.cpu_events = hsw_events_attrs; + pr_cont("Skylake events, "); + break; + default: switch (x86_pmu.version) { case 1: @@ -3341,7 +3604,7 @@ __init int intel_pmu_init(void) */ if (x86_pmu.extra_regs) { for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x1ffUL); + er->extra_msr_access = check_msr(er->msr, 0x11UL); /* Disable LBR select mapping */ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) x86_pmu.lbr_sel_map = NULL; @@ -3380,7 +3643,10 @@ static __init int fixup_ht_bug(void) return 0; } - watchdog_nmi_disable_all(); + if (lockup_detector_suspend() != 0) { + pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); + return 0; + } x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -3388,7 +3654,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - watchdog_nmi_enable_all(); + lockup_detector_resume(); get_online_cpus(); diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index 43dd672d7..d1c0f254a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -62,9 +62,6 @@ struct bts_buffer { struct pmu bts_pmu; -void intel_pmu_enable_bts(u64 config); -void intel_pmu_disable_bts(void); - static size_t buf_size(struct page *page) { return 1 << (PAGE_SHIFT + page_private(page)); @@ -225,6 +222,7 @@ static void __bts_event_start(struct perf_event *event) if (!buf || bts_buffer_is_full(buf, bts)) return; + event->hw.itrace_started = 1; event->hw.state = 0; if (!buf->snapshot) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 71fc40238..84f236ab9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -224,6 +224,19 @@ union hsw_tsx_tuning { #define PEBS_HSW_TSX_FLAGS 0xff00000000ULL +/* Same as HSW, plus TSC */ + +struct pebs_record_skl { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; + u64 tsc; +}; + void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -675,6 +688,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_skl_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -754,6 +789,11 @@ void intel_pmu_pebs_disable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; + bool large_pebs = ds->pebs_interrupt_threshold > + ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + if (large_pebs) + intel_pmu_drain_pebs_buffer(); cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -762,12 +802,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); - if (ds->pebs_interrupt_threshold > - ds->pebs_buffer_base + x86_pmu.pebs_record_size) { - intel_pmu_drain_pebs_buffer(); - if (!pebs_is_enabled(cpuc)) - perf_sched_cb_dec(event->ctx->pmu); - } + if (large_pebs && !pebs_is_enabled(cpuc)) + perf_sched_cb_dec(event->ctx->pmu); if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -885,7 +921,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) { if (pebs->tsx_tuning) { union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; @@ -894,7 +930,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) return 0; } -static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) { u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; @@ -918,7 +954,7 @@ static void setup_pebs_sample_data(struct perf_event *event, * unconditionally access the 'extra' entries. */ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct pebs_record_hsw *pebs = __pebs; + struct pebs_record_skl *pebs = __pebs; u64 sample_type; int fll, fst, dsrc; int fl = event->hw.flags; @@ -1016,6 +1052,16 @@ static void setup_pebs_sample_data(struct perf_event *event, data->txn = intel_hsw_transaction(pebs); } + /* + * v3 supplies an accurate time stamp, so we use that + * for the time stamp. + * + * We can only do this for the default trace clock. + */ + if (x86_pmu.intel_cap.pebs_format >= 3 && + event->attr.use_clockid == 0) + data->time = native_sched_clock_from_tsc(pebs->tsc); + if (has_branch_stack(event)) data->br_stack = &cpuc->lbr_stack; } @@ -1142,6 +1188,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; + u64 pebs_status; /* PEBS v3 has accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { @@ -1152,12 +1199,17 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; } - bit = find_first_bit((unsigned long *)&p->status, + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + + bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); - if (bit >= x86_pmu.max_pebs_events) - continue; - if (!test_bit(bit, cpuc->active_mask)) + if (WARN(bit >= x86_pmu.max_pebs_events, + "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx", + (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled, + *(unsigned long long *)cpuc->active_mask)) continue; + /* * The PEBS hardware does not deal well with the situation * when events happen near to each other and multiple bits @@ -1172,27 +1224,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * one, and it's not possible to reconstruct all events * that caused the PEBS record. It's called collision. * If collision happened, the record will be dropped. - * */ - if (p->status != (1 << bit)) { - u64 pebs_status; - - /* slow path */ - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; - if (pebs_status != (1 << bit)) { - for_each_set_bit(i, (unsigned long *)&pebs_status, - MAX_PEBS_EVENTS) - error[i]++; - continue; - } + if (p->status != (1ULL << bit)) { + for_each_set_bit(i, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) + error[i]++; + continue; } + counts[bit]++; } for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { if ((counts[bit] == 0) && (error[bit] == 0)) continue; + event = cpuc->events[bit]; WARN_ON_ONCE(!event); WARN_ON_ONCE(!event->attr.precise_ip); @@ -1245,6 +1291,14 @@ void __init intel_ds_init(void) x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; + case 3: + pr_cont("PEBS fmt3%c, ", pebs_type); + x86_pmu.pebs_record_size = + sizeof(struct pebs_record_skl); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; + break; + default: printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 452a7bd2d..b2c9475b7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -13,7 +13,8 @@ enum { LBR_FORMAT_EIP = 0x02, LBR_FORMAT_EIP_FLAGS = 0x03, LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, }; static enum { @@ -139,6 +140,13 @@ static void __intel_pmu_lbr_enable(bool pmi) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); u64 debugctl, lbr_select = 0, orig_debugctl; + /* + * No need to unfreeze manually, as v4 can do that as part + * of the GLOBAL_STATUS ack. + */ + if (pmi && x86_pmu.version >= 4) + return; + /* * No need to reprogram LBR_SELECT in a PMI, as it * did not change. @@ -186,6 +194,8 @@ static void intel_pmu_lbr_reset_64(void) for (i = 0; i < x86_pmu.lbr_nr; i++) { wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + i, 0); } } @@ -230,10 +240,12 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { + for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->lbr_stack_state = LBR_NONE; } @@ -251,10 +263,12 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { + for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->lbr_stack_state = LBR_VALID; } @@ -411,16 +425,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; + int num = x86_pmu.lbr_nr; - for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (cpuc->lbr_sel->config & LBR_CALL_STACK) + num = tos; + + for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; int skip = 0; + u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + if (lbr_format == LBR_FORMAT_INFO) { + u64 info; + + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + mis = !!(info & LBR_INFO_MISPRED); + pred = !mis; + in_tx = !!(info & LBR_INFO_IN_TX); + abort = !!(info & LBR_INFO_ABORT); + cycles = (info & LBR_INFO_CYCLES); + } if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; @@ -450,6 +479,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_entries[out].predicted = pred; cpuc->lbr_entries[out].in_tx = in_tx; cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].cycles = cycles; cpuc->lbr_entries[out].reserved = 0; out++; } @@ -947,6 +977,26 @@ void intel_pmu_lbr_init_hsw(void) pr_cont("16-deep LBR, "); } +/* skylake */ +__init void intel_pmu_lbr_init_skl(void) +{ + x86_pmu.lbr_nr = 32; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("32-deep LBR, "); +} + /* atom */ void __init intel_pmu_lbr_init_atom(void) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 183de7196..421692834 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -65,15 +65,21 @@ static struct pt_cap_desc { } pt_caps[] = { PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), + PT_CAP(mtc, 0, CR_EBX, BIT(3)), PT_CAP(topa_output, 0, CR_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), + PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), + PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), }; static u32 pt_cap_get(enum pt_capabilities cap) { struct pt_cap_desc *cd = &pt_caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg]; + u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; @@ -94,12 +100,22 @@ static struct attribute_group pt_cap_group = { .name = "caps", }; +PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); +PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); +PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { + &format_attr_cyc.attr, + &format_attr_mtc.attr, &format_attr_tsc.attr, &format_attr_noretcomp.attr, + &format_attr_mtc_period.attr, + &format_attr_cyc_thresh.attr, + &format_attr_psb_period.attr, NULL, }; @@ -129,10 +145,10 @@ static int __init pt_pmu_hw_init(void) for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*4], - &pt_pmu.caps[CR_EBX + i*4], - &pt_pmu.caps[CR_ECX + i*4], - &pt_pmu.caps[CR_EDX + i*4]); + &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); } ret = -ENOMEM; @@ -170,15 +186,65 @@ fail: return ret; } -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC) +#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ + RTIT_CTL_CYC_THRESH | \ + RTIT_CTL_PSB_FREQ) + +#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ + RTIT_CTL_MTC_RANGE) + +#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ + RTIT_CTL_DISRETC | \ + RTIT_CTL_CYC_PSB | \ + RTIT_CTL_MTC) static bool pt_event_valid(struct perf_event *event) { u64 config = event->attr.config; + u64 allowed, requested; if ((config & PT_CONFIG_MASK) != config) return false; + if (config & RTIT_CTL_CYC_PSB) { + if (!pt_cap_get(PT_CAP_psb_cyc)) + return false; + + allowed = pt_cap_get(PT_CAP_psb_periods); + requested = (config & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + + allowed = pt_cap_get(PT_CAP_cycle_thresholds); + requested = (config & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + } + + if (config & RTIT_CTL_MTC) { + /* + * In the unlikely case that CPUID lists valid mtc periods, + * but not the mtc capability, drop out here. + * + * Spec says that setting mtc period bits while mtc bit in + * CPUID is 0 will #GP, so better safe than sorry. + */ + if (!pt_cap_get(PT_CAP_mtc)) + return false; + + allowed = pt_cap_get(PT_CAP_mtc_periods); + if (!allowed) + return false; + + requested = (config & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET; + + if (!(allowed & BIT(requested))) + return false; + } + return true; } @@ -191,6 +257,11 @@ static void pt_config(struct perf_event *event) { u64 reg; + if (!event->hw.itrace_started) { + event->hw.itrace_started = 1; + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + } + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; if (!event->attr.exclude_kernel) @@ -910,7 +981,6 @@ void intel_pt_interrupt(void) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } } @@ -934,7 +1004,6 @@ static void pt_event_start(struct perf_event *event, int mode) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 5cbd4e64f..81431c0f0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -86,6 +86,10 @@ static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 1<config * any other bit is reserved @@ -486,6 +490,18 @@ static struct attribute *rapl_events_hsw_attr[] = { NULL, }; +static struct attribute *rapl_events_knl_attr[] = { + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = NULL, /* patched at runtime */ @@ -730,6 +746,10 @@ static int __init rapl_pmu_init(void) rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; + case 87: /* Knights Landing */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_KNL; + rapl_pmu_events_group.attrs = rapl_events_knl_attr; default: /* unsupported */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 21b5e38c9..560e5255b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -911,6 +911,9 @@ static int __init uncore_pci_init(void) case 63: /* Haswell-EP */ ret = hswep_uncore_pci_init(); break; + case 86: /* BDX-DE */ + ret = bdx_uncore_pci_init(); + break; case 42: /* Sandy Bridge */ ret = snb_uncore_pci_init(); break; @@ -1209,6 +1212,11 @@ static int __init uncore_cpu_init(void) break; case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + case 69: /* Haswell */ + case 70: /* Haswell */ + case 61: /* Broadwell */ + case 71: /* Broadwell */ snb_uncore_cpu_init(); break; case 45: /* Sandy Bridge-EP */ @@ -1224,6 +1232,9 @@ static int __init uncore_cpu_init(void) case 63: /* Haswell-EP */ hswep_uncore_cpu_init(); break; + case 86: /* BDX-DE */ + bdx_uncore_cpu_init(); + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 0f77f0a19..72c54c2e5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -336,6 +336,8 @@ int ivbep_uncore_pci_init(void); void ivbep_uncore_cpu_init(void); int hswep_uncore_pci_init(void); void hswep_uncore_cpu_init(void); +int bdx_uncore_pci_init(void); +void bdx_uncore_cpu_init(void); /* perf_event_intel_uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index b005a78c7..f78574b3c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -45,6 +45,11 @@ #define SNB_UNC_CBO_0_PER_CTR0 0x706 #define SNB_UNC_CBO_MSR_OFFSET 0x10 +/* SNB ARB register */ +#define SNB_UNC_ARB_PER_CTR0 0x3b0 +#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2 +#define SNB_UNC_ARB_MSR_OFFSET 0x10 + /* NHM global control register */ #define NHM_UNC_PERF_GLOBAL_CTL 0x391 #define NHM_UNC_FIXED_CTR 0x394 @@ -115,7 +120,7 @@ static struct intel_uncore_ops snb_uncore_msr_ops = { .read_counter = uncore_msr_read_counter, }; -static struct event_constraint snb_uncore_cbox_constraints[] = { +static struct event_constraint snb_uncore_arb_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x80, 0x1), UNCORE_EVENT_CONSTRAINT(0x83, 0x1), EVENT_CONSTRAINT_END @@ -134,14 +139,28 @@ static struct intel_uncore_type snb_uncore_cbox = { .single_fixed = 1, .event_mask = SNB_UNC_RAW_EVENT_MASK, .msr_offset = SNB_UNC_CBO_MSR_OFFSET, - .constraints = snb_uncore_cbox_constraints, .ops = &snb_uncore_msr_ops, .format_group = &snb_uncore_format_group, .event_descs = snb_uncore_events, }; +static struct intel_uncore_type snb_uncore_arb = { + .name = "arb", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = SNB_UNC_ARB_PER_CTR0, + .event_ctl = SNB_UNC_ARB_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_ARB_MSR_OFFSET, + .constraints = snb_uncore_arb_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + static struct intel_uncore_type *snb_msr_uncores[] = { &snb_uncore_cbox, + &snb_uncore_arb, NULL, }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 6d6e85dd5..694510a88 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -2215,7 +2215,7 @@ static struct intel_uncore_type *hswep_pci_uncores[] = { NULL, }; -static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = { +static const struct pci_device_id hswep_uncore_pci_ids[] = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), @@ -2321,3 +2321,167 @@ int hswep_uncore_pci_init(void) return 0; } /* end of Haswell-EP uncore support */ + +/* BDX-DE uncore support */ + +static struct intel_uncore_type bdx_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct event_constraint bdx_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = bdx_uncore_cbox_constraints, + .ops = &hswep_uncore_cbox_ops, + .format_group = &hswep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type *bdx_msr_uncores[] = { + &bdx_uncore_ubox, + &bdx_uncore_cbox, + &hswep_uncore_pcu, + NULL, +}; + +void bdx_uncore_cpu_init(void) +{ + if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = bdx_msr_uncores; +} + +static struct intel_uncore_type bdx_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &hswep_uncore_irp_ops, + .format_group = &snbep_uncore_format_group, +}; + + +static struct event_constraint bdx_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + BDX_PCI_UNCORE_HA, + BDX_PCI_UNCORE_IMC, + BDX_PCI_UNCORE_IRP, + BDX_PCI_UNCORE_R2PCIE, +}; + +static struct intel_uncore_type *bdx_pci_uncores[] = { + [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, + [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, + [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, + [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver bdx_uncore_pci_driver = { + .name = "bdx_uncore", + .id_table = bdx_uncore_pci_ids, +}; + +int bdx_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x6f1e); + + if (ret) + return ret; + uncore_pci_uncores = bdx_pci_uncores; + uncore_pci_driver = &bdx_uncore_pci_driver; + return 0; +} + +/* end of BDX-DE uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c new file mode 100644 index 000000000..f32ac1393 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -0,0 +1,242 @@ +#include + +enum perf_msr_id { + PERF_MSR_TSC = 0, + PERF_MSR_APERF = 1, + PERF_MSR_MPERF = 2, + PERF_MSR_PPERF = 3, + PERF_MSR_SMI = 4, + + PERF_MSR_EVENT_MAX, +}; + +static bool test_aperfmperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_APERFMPERF); +} + +static bool test_intel(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_MSR_SMI) + return true; + break; + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) + return true; + break; + } + + return false; +} + +struct perf_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); + +static struct perf_msr msr[] = { + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, +}; + +static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group events_attr_group = { + .name = "events", + .attrs = events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); +static struct attribute *format_attrs[] = { + &format_attr_event.attr, + NULL, +}; +static struct attribute_group format_attr_group = { + .name = "format", + .attrs = format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { + &events_attr_group, + &format_attr_group, + NULL, +}; + +static int msr_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (cfg >= PERF_MSR_EVENT_MAX) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (!msr[cfg].attr) + return -EINVAL; + + event->hw.idx = -1; + event->hw.event_base = msr[cfg].msr; + event->hw.config = cfg; + + return 0; +} + +static inline u64 msr_read_counter(struct perf_event *event) +{ + u64 now; + + if (event->hw.event_base) + rdmsrl(event->hw.event_base, now); + else + rdtscll(now); + + return now; +} +static void msr_event_update(struct perf_event *event) +{ + u64 prev, now; + s64 delta; + + /* Careful, an NMI might modify the previous event value. */ +again: + prev = local64_read(&event->hw.prev_count); + now = msr_read_counter(event); + + if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { + delta <<= 32; + delta >>= 32; /* sign extend */ + } + local64_add(now - prev, &event->count); +} + +static void msr_event_start(struct perf_event *event, int flags) +{ + u64 now; + + now = msr_read_counter(event); + local64_set(&event->hw.prev_count, now); +} + +static void msr_event_stop(struct perf_event *event, int flags) +{ + msr_event_update(event); +} + +static void msr_event_del(struct perf_event *event, int flags) +{ + msr_event_stop(event, PERF_EF_UPDATE); +} + +static int msr_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + msr_event_start(event, flags); + + return 0; +} + +static struct pmu pmu_msr = { + .task_ctx_nr = perf_sw_context, + .attr_groups = attr_groups, + .event_init = msr_event_init, + .add = msr_event_add, + .del = msr_event_del, + .start = msr_event_start, + .stop = msr_event_stop, + .read = msr_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static int __init msr_init(void) +{ + int i, j = 0; + + if (!boot_cpu_has(X86_FEATURE_TSC)) { + pr_cont("no MSR PMU driver.\n"); + return 0; + } + + /* Probe the MSRs. */ + for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { + u64 val; + + /* + * Virt sucks arse; you cannot tell if a R/O MSR is present :/ + */ + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining MSRs in the sysfs attrs. */ + for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + perf_pmu_register(&pmu_msr, "msr", -1); + + return 0; +} +device_initcall(msr_init); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 3d423a101..608fb26c7 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -37,7 +37,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 }, - { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 }, + { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 }, { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, -- cgit v1.2.3-54-g00ecf