6 files changed, 3273 insertions, 0 deletions
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
new file mode 100644
index 000000000..bd3e8421b
--- /dev/null
+++ b/arch/x86/events/amd/core.c
@@ -0,0 +1,746 @@
+#include <linux/perf_event.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/apicdef.h>
+
+#include "../perf_event.h"
+
+static __initconst const u64 amd_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+		[ C(RESULT_MISS)   ] = 0x0141, /* Data Cache Misses          */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
+		[ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+		[ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+		[ C(RESULT_MISS)   ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
+		[ C(RESULT_MISS)   ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+		[ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]			= 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]			= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]		= 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]			= 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]		= 0x00c2,
+  [PERF_COUNT_HW_BRANCH_MISSES]			= 0x00c3,
+  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= 0x00d0, /* "Decoder empty" event */
+  [PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= 0x00d1, /* "Dispatch stalls" event */
+};
+
+static u64 amd_pmu_event_map(int hw_event)
+{
+	return amd_perfmon_event_map[hw_event];
+}
+
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ *   4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ *   6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index, bool eventsel)
+{
+	int offset;
+
+	if (!index)
+		return index;
+
+	if (eventsel)
+		offset = event_offsets[index];
+	else
+		offset = count_offsets[index];
+
+	if (offset)
+		return offset;
+
+	if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+		offset = index;
+	else
+		offset = index << 1;
+
+	if (eventsel)
+		event_offsets[index] = offset;
+	else
+		count_offsets[index] = offset;
+
+	return offset;
+}
+
+static int amd_core_hw_config(struct perf_event *event)
+{
+	if (event->attr.exclude_host && event->attr.exclude_guest)
+		/*
+		 * When HO == GO == 1 the hardware treats that as GO == HO == 0
+		 * and will count in both modes. We don't want to count in that
+		 * case so we emulate no-counting by setting US = OS = 0.
+		 */
+		event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
+				      ARCH_PERFMON_EVENTSEL_OS);
+	else if (event->attr.exclude_host)
+		event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
+	else if (event->attr.exclude_guest)
+		event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
+
+	return 0;
+}
+
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
+{
+	return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+}
+
+static inline int amd_is_nb_event(struct hw_perf_event *hwc)
+{
+	return (hwc->config & 0xe0) == 0xe0;
+}
+
+static inline int amd_has_nb(struct cpu_hw_events *cpuc)
+{
+	struct amd_nb *nb = cpuc->amd_nb;
+
+	return nb && nb->nb_id != -1;
+}
+
+static int amd_pmu_hw_config(struct perf_event *event)
+{
+	int ret;
+
+	/* pass precise event sampling to ibs: */
+	if (event->attr.precise_ip && get_ibs_caps())
+		return -ENOENT;
+
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
+	ret = x86_pmu_hw_config(event);
+	if (ret)
+		return ret;
+
+	if (event->attr.type == PERF_TYPE_RAW)
+		event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+	return amd_core_hw_config(event);
+}
+
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+					   struct perf_event *event)
+{
+	struct amd_nb *nb = cpuc->amd_nb;
+	int i;
+
+	/*
+	 * need to scan whole list because event may not have
+	 * been assigned during scheduling
+	 *
+	 * no race condition possible because event can only
+	 * be removed on one CPU at a time AND PMU is disabled
+	 * when we come here
+	 */
+	for (i = 0; i < x86_pmu.num_counters; i++) {
+		if (cmpxchg(nb->owners + i, event, NULL) == event)
+			break;
+	}
+}
+
+ /*
+  * AMD64 NorthBridge events need special treatment because
+  * counter access needs to be synchronized across all cores
+  * of a package. Refer to BKDG section 3.12
+  *
+  * NB events are events measuring L3 cache, Hypertransport
+  * traffic. They are identified by an event code >= 0xe00.
+  * They measure events on the NorthBride which is shared
+  * by all cores on a package. NB events are counted on a
+  * shared set of counters. When a NB event is programmed
+  * in a counter, the data actually comes from a shared
+  * counter. Thus, access to those counters needs to be
+  * synchronized.
+  *
+  * We implement the synchronization such that no two cores
+  * can be measuring NB events using the same counters. Thus,
+  * we maintain a per-NB allocation table. The available slot
+  * is propagated using the event_constraint structure.
+  *
+  * We provide only one choice for each NB event based on
+  * the fact that only NB events have restrictions. Consequently,
+  * if a counter is available, there is a guarantee the NB event
+  * will be assigned to it. If no slot is available, an empty
+  * constraint is returned and scheduling will eventually fail
+  * for this event.
+  *
+  * Note that all cores attached the same NB compete for the same
+  * counters to host NB events, this is why we use atomic ops. Some
+  * multi-chip CPUs may have more than one NB.
+  *
+  * Given that resources are allocated (cmpxchg), they must be
+  * eventually freed for others to use. This is accomplished by
+  * calling __amd_put_nb_event_constraints()
+  *
+  * Non NB events are not impacted by this restriction.
+  */
+static struct event_constraint *
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+			       struct event_constraint *c)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct amd_nb *nb = cpuc->amd_nb;
+	struct perf_event *old;
+	int idx, new = -1;
+
+	if (!c)
+		c = &unconstrained;
+
+	if (cpuc->is_fake)
+		return c;
+
+	/*
+	 * detect if already present, if so reuse
+	 *
+	 * cannot merge with actual allocation
+	 * because of possible holes
+	 *
+	 * event can already be present yet not assigned (in hwc->idx)
+	 * because of successive calls to x86_schedule_events() from
+	 * hw_perf_group_sched_in() without hw_perf_enable()
+	 */
+	for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
+		if (new == -1 || hwc->idx == idx)
+			/* assign free slot, prefer hwc->idx */
+			old = cmpxchg(nb->owners + idx, NULL, event);
+		else if (nb->owners[idx] == event)
+			/* event already present */
+			old = event;
+		else
+			continue;
+
+		if (old && old != event)
+			continue;
+
+		/* reassign to this slot */
+		if (new != -1)
+			cmpxchg(nb->owners + new, event, NULL);
+		new = idx;
+
+		/* already present, reuse */
+		if (old == event)
+			break;
+	}
+
+	if (new == -1)
+		return &emptyconstraint;
+
+	return &nb->event_constraints[new];
+}
+
+static struct amd_nb *amd_alloc_nb(int cpu)
+{
+	struct amd_nb *nb;
+	int i;
+
+	nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
+	if (!nb)
+		return NULL;
+
+	nb->nb_id = -1;
+
+	/*
+	 * initialize all possible NB constraints
+	 */
+	for (i = 0; i < x86_pmu.num_counters; i++) {
+		__set_bit(i, nb->event_constraints[i].idxmsk);
+		nb->event_constraints[i].weight = 1;
+	}
+	return nb;
+}
+
+static int amd_pmu_cpu_prepare(int cpu)
+{
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+	WARN_ON_ONCE(cpuc->amd_nb);
+
+	if (!x86_pmu.amd_nb_constraints)
+		return NOTIFY_OK;
+
+	cpuc->amd_nb = amd_alloc_nb(cpu);
+	if (!cpuc->amd_nb)
+		return NOTIFY_BAD;
+
+	return NOTIFY_OK;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
+{
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+	struct amd_nb *nb;
+	int i, nb_id;
+
+	cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+	if (!x86_pmu.amd_nb_constraints)
+		return;
+
+	nb_id = amd_get_nb_id(cpu);
+	WARN_ON_ONCE(nb_id == BAD_APICID);
+
+	for_each_online_cpu(i) {
+		nb = per_cpu(cpu_hw_events, i).amd_nb;
+		if (WARN_ON_ONCE(!nb))
+			continue;
+
+		if (nb->nb_id == nb_id) {
+			*onln = cpuc->amd_nb;
+			cpuc->amd_nb = nb;
+			break;
+		}
+	}
+
+	cpuc->amd_nb->nb_id = nb_id;
+	cpuc->amd_nb->refcnt++;
+}
+
+static void amd_pmu_cpu_dead(int cpu)
+{
+	struct cpu_hw_events *cpuhw;
+
+	if (!x86_pmu.amd_nb_constraints)
+		return;
+
+	cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+	if (cpuhw->amd_nb) {
+		struct amd_nb *nb = cpuhw->amd_nb;
+
+		if (nb->nb_id == -1 || --nb->refcnt == 0)
+			kfree(nb);
+
+		cpuhw->amd_nb = NULL;
+	}
+}
+
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
+{
+	/*
+	 * if not NB event or no NB, then no constraints
+	 */
+	if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+		return &unconstrained;
+
+	return __amd_get_nb_event_constraints(cpuc, event, NULL);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+				      struct perf_event *event)
+{
+	if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+		__amd_put_nb_event_constraints(cpuc, event);
+}
+
+PMU_FORMAT_ATTR(event,	"config:0-7,32-35");
+PMU_FORMAT_ATTR(umask,	"config:8-15"	);
+PMU_FORMAT_ATTR(edge,	"config:18"	);
+PMU_FORMAT_ATTR(inv,	"config:23"	);
+PMU_FORMAT_ATTR(cmask,	"config:24-31"	);
+
+static struct attribute *amd_format_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
+/* AMD Family 15h */
+
+#define AMD_EVENT_TYPE_MASK	0x000000F0ULL
+
+#define AMD_EVENT_FP		0x00000000ULL ... 0x00000010ULL
+#define AMD_EVENT_LS		0x00000020ULL ... 0x00000030ULL
+#define AMD_EVENT_DC		0x00000040ULL ... 0x00000050ULL
+#define AMD_EVENT_CU		0x00000060ULL ... 0x00000070ULL
+#define AMD_EVENT_IC_DE		0x00000080ULL ... 0x00000090ULL
+#define AMD_EVENT_EX_LS		0x000000C0ULL
+#define AMD_EVENT_DE		0x000000D0ULL
+#define AMD_EVENT_NB		0x000000E0ULL ... 0x000000F0ULL
+
+/*
+ * AMD family 15h event code/PMC mappings:
+ *
+ * type = event_code & 0x0F0:
+ *
+ * 0x000	FP	PERF_CTL[5:3]
+ * 0x010	FP	PERF_CTL[5:3]
+ * 0x020	LS	PERF_CTL[5:0]
+ * 0x030	LS	PERF_CTL[5:0]
+ * 0x040	DC	PERF_CTL[5:0]
+ * 0x050	DC	PERF_CTL[5:0]
+ * 0x060	CU	PERF_CTL[2:0]
+ * 0x070	CU	PERF_CTL[2:0]
+ * 0x080	IC/DE	PERF_CTL[2:0]
+ * 0x090	IC/DE	PERF_CTL[2:0]
+ * 0x0A0	---
+ * 0x0B0	---
+ * 0x0C0	EX/LS	PERF_CTL[5:0]
+ * 0x0D0	DE	PERF_CTL[2:0]
+ * 0x0E0	NB	NB_PERF_CTL[3:0]
+ * 0x0F0	NB	NB_PERF_CTL[3:0]
+ *
+ * Exceptions:
+ *
+ * 0x000	FP	PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x003	FP	PERF_CTL[3]
+ * 0x004	FP	PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x00B	FP	PERF_CTL[3]
+ * 0x00D	FP	PERF_CTL[3]
+ * 0x023	DE	PERF_CTL[2:0]
+ * 0x02D	LS	PERF_CTL[3]
+ * 0x02E	LS	PERF_CTL[3,0]
+ * 0x031	LS	PERF_CTL[2:0] (**)
+ * 0x043	CU	PERF_CTL[2:0]
+ * 0x045	CU	PERF_CTL[2:0]
+ * 0x046	CU	PERF_CTL[2:0]
+ * 0x054	CU	PERF_CTL[2:0]
+ * 0x055	CU	PERF_CTL[2:0]
+ * 0x08F	IC	PERF_CTL[0]
+ * 0x187	DE	PERF_CTL[0]
+ * 0x188	DE	PERF_CTL[0]
+ * 0x0DB	EX	PERF_CTL[5:0]
+ * 0x0DC	LS	PERF_CTL[5:0]
+ * 0x0DD	LS	PERF_CTL[5:0]
+ * 0x0DE	LS	PERF_CTL[5:0]
+ * 0x0DF	LS	PERF_CTL[5:0]
+ * 0x1C0	EX	PERF_CTL[5:3]
+ * 0x1D6	EX	PERF_CTL[5:0]
+ * 0x1D8	EX	PERF_CTL[5:0]
+ *
+ * (*)  depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
+ */
+
+static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
+static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
+static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
+static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
+
+static struct event_constraint *
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+			       struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned int event_code = amd_get_event_code(hwc);
+
+	switch (event_code & AMD_EVENT_TYPE_MASK) {
+	case AMD_EVENT_FP:
+		switch (event_code) {
+		case 0x000:
+			if (!(hwc->config & 0x0000F000ULL))
+				break;
+			if (!(hwc->config & 0x00000F00ULL))
+				break;
+			return &amd_f15_PMC3;
+		case 0x004:
+			if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+				break;
+			return &amd_f15_PMC3;
+		case 0x003:
+		case 0x00B:
+		case 0x00D:
+			return &amd_f15_PMC3;
+		}
+		return &amd_f15_PMC53;
+	case AMD_EVENT_LS:
+	case AMD_EVENT_DC:
+	case AMD_EVENT_EX_LS:
+		switch (event_code) {
+		case 0x023:
+		case 0x043:
+		case 0x045:
+		case 0x046:
+		case 0x054:
+		case 0x055:
+			return &amd_f15_PMC20;
+		case 0x02D:
+			return &amd_f15_PMC3;
+		case 0x02E:
+			return &amd_f15_PMC30;
+		case 0x031:
+			if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+				return &amd_f15_PMC20;
+			return &emptyconstraint;
+		case 0x1C0:
+			return &amd_f15_PMC53;
+		default:
+			return &amd_f15_PMC50;
+		}
+	case AMD_EVENT_CU:
+	case AMD_EVENT_IC_DE:
+	case AMD_EVENT_DE:
+		switch (event_code) {
+		case 0x08F:
+		case 0x187:
+		case 0x188:
+			return &amd_f15_PMC0;
+		case 0x0DB ... 0x0DF:
+		case 0x1D6:
+		case 0x1D8:
+			return &amd_f15_PMC50;
+		default:
+			return &amd_f15_PMC20;
+		}
+	case AMD_EVENT_NB:
+		/* moved to perf_event_amd_uncore.c */
+		return &emptyconstraint;
+	default:
+		return &emptyconstraint;
+	}
+}
+
+static ssize_t amd_event_sysfs_show(char *page, u64 config)
+{
+	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
+		    (config & AMD64_EVENTSEL_EVENT) >> 24;
+
+	return x86_event_sysfs_show(page, config, event);
+}
+
+static __initconst const struct x86_pmu amd_pmu = {
+	.name			= "AMD",
+	.handle_irq		= x86_pmu_handle_irq,
+	.disable_all		= x86_pmu_disable_all,
+	.enable_all		= x86_pmu_enable_all,
+	.enable			= x86_pmu_enable_event,
+	.disable		= x86_pmu_disable_event,
+	.hw_config		= amd_pmu_hw_config,
+	.schedule_events	= x86_schedule_events,
+	.eventsel		= MSR_K7_EVNTSEL0,
+	.perfctr		= MSR_K7_PERFCTR0,
+	.addr_offset            = amd_pmu_addr_offset,
+	.event_map		= amd_pmu_event_map,
+	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+	.num_counters		= AMD64_NUM_COUNTERS,
+	.cntval_bits		= 48,
+	.cntval_mask		= (1ULL << 48) - 1,
+	.apic			= 1,
+	/* use highest bit to detect overflow */
+	.max_period		= (1ULL << 47) - 1,
+	.get_event_constraints	= amd_get_event_constraints,
+	.put_event_constraints	= amd_put_event_constraints,
+
+	.format_attrs		= amd_format_attr,
+	.events_sysfs_show	= amd_event_sysfs_show,
+
+	.cpu_prepare		= amd_pmu_cpu_prepare,
+	.cpu_starting		= amd_pmu_cpu_starting,
+	.cpu_dead		= amd_pmu_cpu_dead,
+
+	.amd_nb_constraints	= 1,
+};
+
+static int __init amd_core_pmu_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+		return 0;
+
+	switch (boot_cpu_data.x86) {
+	case 0x15:
+		pr_cont("Fam15h ");
+		x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+		break;
+
+	default:
+		pr_err("core perfctr but no constraints; unknown hardware!\n");
+		return -ENODEV;
+	}
+
+	/*
+	 * If core performance counter extensions exists, we must use
+	 * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
+	 * amd_pmu_addr_offset().
+	 */
+	x86_pmu.eventsel	= MSR_F15H_PERF_CTL;
+	x86_pmu.perfctr		= MSR_F15H_PERF_CTR;
+	x86_pmu.num_counters	= AMD64_NUM_COUNTERS_CORE;
+	/*
+	 * AMD Core perfctr has separate MSRs for the NB events, see
+	 * the amd/uncore.c driver.
+	 */
+	x86_pmu.amd_nb_constraints = 0;
+
+	pr_cont("core perfctr, ");
+	return 0;
+}
+
+__init int amd_pmu_init(void)
+{
+	int ret;
+
+	/* Performance-monitoring supported from K7 and later: */
+	if (boot_cpu_data.x86 < 6)
+		return -ENODEV;
+
+	x86_pmu = amd_pmu;
+
+	ret = amd_core_pmu_init();
+	if (ret)
+		return ret;
+
+	if (num_possible_cpus() == 1) {
+		/*
+		 * No point in allocating data structures to serialize
+		 * against other CPUs, when there is only the one CPU.
+		 */
+		x86_pmu.amd_nb_constraints = 0;
+	}
+
+	/* Events are common for all AMDs */
+	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+	       sizeof(hw_cache_event_ids));
+
+	return 0;
+}
+
+void amd_pmu_enable_virt(void)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	cpuc->perf_ctr_virt_mask = 0;
+
+	/* Reload all events */
+	x86_pmu_disable_all();
+	x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	/*
+	 * We only mask out the Host-only bit so that host-only counting works
+	 * when SVM is disabled. If someone sets up a guest-only counter when
+	 * SVM is disabled the Guest-only bits still gets set and the counter
+	 * will not count anything.
+	 */
+	cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+	/* Reload all events */
+	x86_pmu_disable_all();
+	x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
new file mode 100644
index 000000000..feb90f673
--- /dev/null
+++ b/arch/x86/events/amd/ibs.c
@@ -0,0 +1,1026 @@
+/*
+ * Performance events - AMD IBS
+ *
+ *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+static u32 ibs_caps;
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
+#define IBS_FETCH_CONFIG_MASK	(IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK	IBS_OP_MAX_CNT
+
+
+/*
+ * IBS states:
+ *
+ * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken
+ * and any further add()s must fail.
+ *
+ * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
+ * complicated by the fact that the IBS hardware can send late NMIs (ie. after
+ * we've cleared the EN bit).
+ *
+ * In order to consume these late NMIs we have the STOPPED state, any NMI that
+ * happens after we've cleared the EN state will clear this bit and report the
+ * NMI handled (this is fundamentally racy in the face or multiple NMI sources,
+ * someone else can consume our BIT and our NMI will go unhandled).
+ *
+ * And since we cannot set/clear this separate bit together with the EN bit,
+ * there are races; if we cleared STARTED early, an NMI could land in
+ * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
+ * could happen if the period is small enough), and consume our STOPPED bit
+ * and trigger streams of unhandled NMIs.
+ *
+ * If, however, we clear STARTED late, an NMI can hit between clearing the
+ * EN bit and clearing STARTED, still see STARTED set and process the event.
+ * If this event will have the VALID bit clear, we bail properly, but this
+ * is not a given. With VALID set we can end up calling pmu::stop() again
+ * (the throttle logic) and trigger the WARNs in there.
+ *
+ * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
+ * nesting, and clear STARTED late, so that we have a well defined state over
+ * the clearing of the EN bit.
+ *
+ * XXX: we could probably be using !atomic bitops for all this.
+ */
+
+enum ibs_states {
+	IBS_ENABLED	= 0,
+	IBS_STARTED	= 1,
+	IBS_STOPPING	= 2,
+	IBS_STOPPED	= 3,
+
+	IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+	struct perf_event	*event;
+	unsigned long		state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+	struct pmu			pmu;
+	unsigned int			msr;
+	u64				config_mask;
+	u64				cnt_mask;
+	u64				enable_mask;
+	u64				valid_mask;
+	u64				max_period;
+	unsigned long			offset_mask[1];
+	int				offset_max;
+	struct cpu_perf_ibs __percpu	*pcpu;
+
+	struct attribute		**format_attrs;
+	struct attribute_group		format_group;
+	const struct attribute_group	*attr_groups[2];
+
+	u64				(*get_count)(u64 config);
+};
+
+struct perf_ibs_data {
+	u32		size;
+	union {
+		u32	data[0];	/* data buffer starts here */
+		u32	caps;
+	};
+	u64		regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+	s64 left = local64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	int overflow = 0;
+
+	/*
+	 * If we are way outside a reasonable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		overflow = 1;
+	}
+
+	if (unlikely(left < (s64)min)) {
+		left += period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		overflow = 1;
+	}
+
+	/*
+	 * If the hw period that triggers the sw overflow is too short
+	 * we might hit the irq handler. This biases the results.
+	 * Thus we shorten the next-to-last period and set the last
+	 * period to the max period.
+	 */
+	if (left > max) {
+		left -= max;
+		if (left > max)
+			left = max;
+		else if (left < min)
+			left = min;
+	}
+
+	*hw_period = (u64)left;
+
+	return overflow;
+}
+
+static  int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int shift = 64 - width;
+	u64 prev_raw_count;
+	u64 delta;
+
+	/*
+	 * Careful: an NMI might modify the previous event value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic event atomically:
+	 */
+	prev_raw_count = local64_read(&hwc->prev_count);
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					new_raw_count) != prev_raw_count)
+		return 0;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (event-)time and add that to the generic event.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count.
+	 */
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
+
+	return 1;
+}
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+	if (perf_ibs_fetch.pmu.type == type)
+		return &perf_ibs_fetch;
+	if (perf_ibs_op.pmu.type == type)
+		return &perf_ibs_op;
+	return NULL;
+}
+
+/*
+ * Use IBS for precise event sampling:
+ *
+ *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
+ *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
+ *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ *
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+	switch (event->attr.precise_ip) {
+	case 0:
+		return -ENOENT;
+	case 1:
+	case 2:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	switch (event->attr.type) {
+	case PERF_TYPE_HARDWARE:
+		switch (event->attr.config) {
+		case PERF_COUNT_HW_CPU_CYCLES:
+			*config = 0;
+			return 0;
+		}
+		break;
+	case PERF_TYPE_RAW:
+		switch (event->attr.config) {
+		case 0x0076:
+			*config = 0;
+			return 0;
+		case 0x00C1:
+			*config = IBS_OP_CNT_CTL;
+			return 0;
+		}
+		break;
+	default:
+		return -ENOENT;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static const struct perf_event_attr ibs_notsupp = {
+	.exclude_user	= 1,
+	.exclude_kernel	= 1,
+	.exclude_hv	= 1,
+	.exclude_idle	= 1,
+	.exclude_host	= 1,
+	.exclude_guest	= 1,
+};
+
+static int perf_ibs_init(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_ibs *perf_ibs;
+	u64 max_cnt, config;
+	int ret;
+
+	perf_ibs = get_ibs_pmu(event->attr.type);
+	if (perf_ibs) {
+		config = event->attr.config;
+	} else {
+		perf_ibs = &perf_ibs_op;
+		ret = perf_ibs_precise_event(event, &config);
+		if (ret)
+			return ret;
+	}
+
+	if (event->pmu != &perf_ibs->pmu)
+		return -ENOENT;
+
+	if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
+		return -EINVAL;
+
+	if (config & ~perf_ibs->config_mask)
+		return -EINVAL;
+
+	if (hwc->sample_period) {
+		if (config & perf_ibs->cnt_mask)
+			/* raw max_cnt may not be set */
+			return -EINVAL;
+		if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+			/*
+			 * lower 4 bits can not be set in ibs max cnt,
+			 * but allowing it in case we adjust the
+			 * sample period to set a frequency.
+			 */
+			return -EINVAL;
+		hwc->sample_period &= ~0x0FULL;
+		if (!hwc->sample_period)
+			hwc->sample_period = 0x10;
+	} else {
+		max_cnt = config & perf_ibs->cnt_mask;
+		config &= ~perf_ibs->cnt_mask;
+		event->attr.sample_period = max_cnt << 4;
+		hwc->sample_period = event->attr.sample_period;
+	}
+
+	if (!hwc->sample_period)
+		return -EINVAL;
+
+	/*
+	 * If we modify hwc->sample_period, we also need to update
+	 * hwc->last_period and hwc->period_left.
+	 */
+	hwc->last_period = hwc->sample_period;
+	local64_set(&hwc->period_left, hwc->sample_period);
+
+	hwc->config_base = perf_ibs->msr;
+	hwc->config = config;
+
+	return 0;
+}
+
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+			       struct hw_perf_event *hwc, u64 *period)
+{
+	int overflow;
+
+	/* ignore lower 4 bits in min count: */
+	overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+	local64_set(&hwc->prev_count, 0);
+
+	return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+	return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+	u64 count = 0;
+
+	if (config & IBS_OP_VAL)
+		count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+	if (ibs_caps & IBS_CAPS_RDWROPCNT)
+		count += (config & IBS_OP_CUR_CNT) >> 32;
+
+	return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+		      u64 *config)
+{
+	u64 count = perf_ibs->get_count(*config);
+
+	/*
+	 * Set width to 64 since we do not overflow on max width but
+	 * instead on max count. In perf_ibs_set_period() we clear
+	 * prev count manually on overflow.
+	 */
+	while (!perf_event_try_update(event, count, 64)) {
+		rdmsrl(event->hw.config_base, *config);
+		count = perf_ibs->get_count(*config);
+	}
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+					 struct hw_perf_event *hwc, u64 config)
+{
+	wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+					  struct hw_perf_event *hwc, u64 config)
+{
+	config &= ~perf_ibs->cnt_mask;
+	wrmsrl(hwc->config_base, config);
+	config &= ~perf_ibs->enable_mask;
+	wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always needs to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
+ * perf_ibs_start()/perf_ibs_stop() and instead always do it.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+	u64 period;
+
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+		return;
+
+	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+	hwc->state = 0;
+
+	perf_ibs_set_period(perf_ibs, hwc, &period);
+	/*
+	 * Set STARTED before enabling the hardware, such that a subsequent NMI
+	 * must observe it.
+	 */
+	set_bit(IBS_STARTED,    pcpu->state);
+	clear_bit(IBS_STOPPING, pcpu->state);
+	perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+	perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+	u64 config;
+	int stopping;
+
+	if (test_and_set_bit(IBS_STOPPING, pcpu->state))
+		return;
+
+	stopping = test_bit(IBS_STARTED, pcpu->state);
+
+	if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+		return;
+
+	rdmsrl(hwc->config_base, config);
+
+	if (stopping) {
+		/*
+		 * Set STOPPED before disabling the hardware, such that it
+		 * must be visible to NMIs the moment we clear the EN bit,
+		 * at which point we can generate an !VALID sample which
+		 * we need to consume.
+		 */
+		set_bit(IBS_STOPPED, pcpu->state);
+		perf_ibs_disable_event(perf_ibs, hwc, config);
+		/*
+		 * Clear STARTED after disabling the hardware; if it were
+		 * cleared before an NMI hitting after the clear but before
+		 * clearing the EN bit might think it a spurious NMI and not
+		 * handle it.
+		 *
+		 * Clearing it after, however, creates the problem of the NMI
+		 * handler seeing STARTED but not having a valid sample.
+		 */
+		clear_bit(IBS_STARTED, pcpu->state);
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+	}
+
+	if (hwc->state & PERF_HES_UPTODATE)
+		return;
+
+	/*
+	 * Clear valid bit to not count rollovers on update, rollovers
+	 * are only updated in the irq handler.
+	 */
+	config &= ~perf_ibs->valid_mask;
+
+	perf_ibs_event_update(perf_ibs, event, &config);
+	hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_ibs_add(struct perf_event *event, int flags)
+{
+	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+	if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+		return -ENOSPC;
+
+	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+	pcpu->event = event;
+
+	if (flags & PERF_EF_START)
+		perf_ibs_start(event, PERF_EF_RELOAD);
+
+	return 0;
+}
+
+static void perf_ibs_del(struct perf_event *event, int flags)
+{
+	struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+	if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+		return;
+
+	perf_ibs_stop(event, PERF_EF_UPDATE);
+
+	pcpu->event = NULL;
+
+	perf_event_update_userpage(event);
+}
+
+static void perf_ibs_read(struct perf_event *event) { }
+
+PMU_FORMAT_ATTR(rand_en,	"config:57");
+PMU_FORMAT_ATTR(cnt_ctl,	"config:19");
+
+static struct attribute *ibs_fetch_format_attrs[] = {
+	&format_attr_rand_en.attr,
+	NULL,
+};
+
+static struct attribute *ibs_op_format_attrs[] = {
+	NULL,	/* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
+	NULL,
+};
+
+static struct perf_ibs perf_ibs_fetch = {
+	.pmu = {
+		.task_ctx_nr	= perf_invalid_context,
+
+		.event_init	= perf_ibs_init,
+		.add		= perf_ibs_add,
+		.del		= perf_ibs_del,
+		.start		= perf_ibs_start,
+		.stop		= perf_ibs_stop,
+		.read		= perf_ibs_read,
+	},
+	.msr			= MSR_AMD64_IBSFETCHCTL,
+	.config_mask		= IBS_FETCH_CONFIG_MASK,
+	.cnt_mask		= IBS_FETCH_MAX_CNT,
+	.enable_mask		= IBS_FETCH_ENABLE,
+	.valid_mask		= IBS_FETCH_VAL,
+	.max_period		= IBS_FETCH_MAX_CNT << 4,
+	.offset_mask		= { MSR_AMD64_IBSFETCH_REG_MASK },
+	.offset_max		= MSR_AMD64_IBSFETCH_REG_COUNT,
+	.format_attrs		= ibs_fetch_format_attrs,
+
+	.get_count		= get_ibs_fetch_count,
+};
+
+static struct perf_ibs perf_ibs_op = {
+	.pmu = {
+		.task_ctx_nr	= perf_invalid_context,
+
+		.event_init	= perf_ibs_init,
+		.add		= perf_ibs_add,
+		.del		= perf_ibs_del,
+		.start		= perf_ibs_start,
+		.stop		= perf_ibs_stop,
+		.read		= perf_ibs_read,
+	},
+	.msr			= MSR_AMD64_IBSOPCTL,
+	.config_mask		= IBS_OP_CONFIG_MASK,
+	.cnt_mask		= IBS_OP_MAX_CNT,
+	.enable_mask		= IBS_OP_ENABLE,
+	.valid_mask		= IBS_OP_VAL,
+	.max_period		= IBS_OP_MAX_CNT << 4,
+	.offset_mask		= { MSR_AMD64_IBSOP_REG_MASK },
+	.offset_max		= MSR_AMD64_IBSOP_REG_COUNT,
+	.format_attrs		= ibs_op_format_attrs,
+
+	.get_count		= get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+	struct perf_event *event = pcpu->event;
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+	struct pt_regs regs;
+	struct perf_ibs_data ibs_data;
+	int offset, size, check_rip, offset_max, throttle = 0;
+	unsigned int msr;
+	u64 *buf, *config, period;
+
+	if (!test_bit(IBS_STARTED, pcpu->state)) {
+fail:
+		/*
+		 * Catch spurious interrupts after stopping IBS: After
+		 * disabling IBS there could be still incoming NMIs
+		 * with samples that even have the valid bit cleared.
+		 * Mark all this NMIs as handled.
+		 */
+		if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
+			return 1;
+
+		return 0;
+	}
+
+	msr = hwc->config_base;
+	buf = ibs_data.regs;
+	rdmsrl(msr, *buf);
+	if (!(*buf++ & perf_ibs->valid_mask))
+		goto fail;
+
+	config = &ibs_data.regs[0];
+	perf_ibs_event_update(perf_ibs, event, config);
+	perf_sample_data_init(&data, 0, hwc->last_period);
+	if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+		goto out;	/* no sw counter overflow */
+
+	ibs_data.caps = ibs_caps;
+	size = 1;
+	offset = 1;
+	check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+	if (event->attr.sample_type & PERF_SAMPLE_RAW)
+		offset_max = perf_ibs->offset_max;
+	else if (check_rip)
+		offset_max = 2;
+	else
+		offset_max = 1;
+	do {
+		rdmsrl(msr + offset, *buf++);
+		size++;
+		offset = find_next_bit(perf_ibs->offset_mask,
+				       perf_ibs->offset_max,
+				       offset + 1);
+	} while (offset < offset_max);
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		/*
+		 * Read IbsBrTarget and IbsOpData4 separately
+		 * depending on their availability.
+		 * Can't add to offset_max as they are staggered
+		 */
+		if (ibs_caps & IBS_CAPS_BRNTRGT) {
+			rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
+			size++;
+		}
+		if (ibs_caps & IBS_CAPS_OPDATA4) {
+			rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
+			size++;
+		}
+	}
+	ibs_data.size = sizeof(u64) * size;
+
+	regs = *iregs;
+	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+		regs.flags &= ~PERF_EFLAGS_EXACT;
+	} else {
+		set_linear_ip(&regs, ibs_data.regs[1]);
+		regs.flags |= PERF_EFLAGS_EXACT;
+	}
+
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.size = sizeof(u32) + ibs_data.size;
+		raw.data = ibs_data.data;
+		data.raw = &raw;
+	}
+
+	throttle = perf_event_overflow(event, &data, &regs);
+out:
+	if (throttle)
+		perf_ibs_stop(event, 0);
+	else
+		perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+	perf_event_update_userpage(event);
+
+	return 1;
+}
+
+static int
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+	u64 stamp = sched_clock();
+	int handled = 0;
+
+	handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+	handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
+	perf_sample_event_took(sched_clock() - stamp);
+
+	return handled;
+}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+	struct cpu_perf_ibs __percpu *pcpu;
+	int ret;
+
+	pcpu = alloc_percpu(struct cpu_perf_ibs);
+	if (!pcpu)
+		return -ENOMEM;
+
+	perf_ibs->pcpu = pcpu;
+
+	/* register attributes */
+	if (perf_ibs->format_attrs[0]) {
+		memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
+		perf_ibs->format_group.name	= "format";
+		perf_ibs->format_group.attrs	= perf_ibs->format_attrs;
+
+		memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
+		perf_ibs->attr_groups[0]	= &perf_ibs->format_group;
+		perf_ibs->pmu.attr_groups	= perf_ibs->attr_groups;
+	}
+
+	ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+	if (ret) {
+		perf_ibs->pcpu = NULL;
+		free_percpu(pcpu);
+	}
+
+	return ret;
+}
+
+static __init int perf_event_ibs_init(void)
+{
+	struct attribute **attr = ibs_op_format_attrs;
+
+	if (!ibs_caps)
+		return -ENODEV;	/* ibs not supported by the cpu */
+
+	perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+
+	if (ibs_caps & IBS_CAPS_OPCNT) {
+		perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+		*attr++ = &format_attr_cnt_ctl.attr;
+	}
+	perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+
+	register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
+	pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+
+	return 0;
+}
+
+#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
+
+static __init int perf_event_ibs_init(void) { return 0; }
+
+#endif
+
+/* IBS - apic initialization, for perf and oprofile */
+
+static __init u32 __get_ibs_caps(void)
+{
+	u32 caps;
+	unsigned int max_level;
+
+	if (!boot_cpu_has(X86_FEATURE_IBS))
+		return 0;
+
+	/* check IBS cpuid feature flags */
+	max_level = cpuid_eax(0x80000000);
+	if (max_level < IBS_CPUID_FEATURES)
+		return IBS_CAPS_DEFAULT;
+
+	caps = cpuid_eax(IBS_CPUID_FEATURES);
+	if (!(caps & IBS_CAPS_AVAIL))
+		/* cpuid flags not valid */
+		return IBS_CAPS_DEFAULT;
+
+	return caps;
+}
+
+u32 get_ibs_caps(void)
+{
+	return ibs_caps;
+}
+
+EXPORT_SYMBOL(get_ibs_caps);
+
+static inline int get_eilvt(int offset)
+{
+	return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int put_eilvt(int offset)
+{
+	return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
+/*
+ * Check and reserve APIC extended interrupt LVT offset for IBS if available.
+ */
+static inline int ibs_eilvt_valid(void)
+{
+	int offset;
+	u64 val;
+	int valid = 0;
+
+	preempt_disable();
+
+	rdmsrl(MSR_AMD64_IBSCTL, val);
+	offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+		pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
+		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+		goto out;
+	}
+
+	if (!get_eilvt(offset)) {
+		pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
+		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+		goto out;
+	}
+
+	valid = 1;
+out:
+	preempt_enable();
+
+	return valid;
+}
+
+static int setup_ibs_ctl(int ibs_eilvt_off)
+{
+	struct pci_dev *cpu_cfg;
+	int nodes;
+	u32 value = 0;
+
+	nodes = 0;
+	cpu_cfg = NULL;
+	do {
+		cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+					 PCI_DEVICE_ID_AMD_10H_NB_MISC,
+					 cpu_cfg);
+		if (!cpu_cfg)
+			break;
+		++nodes;
+		pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+				       | IBSCTL_LVT_OFFSET_VALID);
+		pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+		if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
+			pci_dev_put(cpu_cfg);
+			pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
+				 value);
+			return -EINVAL;
+		}
+	} while (1);
+
+	if (!nodes) {
+		pr_debug("No CPU node configured for IBS\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * setup the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset. This updates then
+ * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
+ * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
+ * is using the new offset.
+ */
+static void force_ibs_eilvt_setup(void)
+{
+	int offset;
+	int ret;
+
+	preempt_disable();
+	/* find the next free available EILVT entry, skip offset 0 */
+	for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+		if (get_eilvt(offset))
+			break;
+	}
+	preempt_enable();
+
+	if (offset == APIC_EILVT_NR_MAX) {
+		pr_debug("No EILVT entry available\n");
+		return;
+	}
+
+	ret = setup_ibs_ctl(offset);
+	if (ret)
+		goto out;
+
+	if (!ibs_eilvt_valid())
+		goto out;
+
+	pr_info("IBS: LVT offset %d assigned\n", offset);
+
+	return;
+out:
+	preempt_disable();
+	put_eilvt(offset);
+	preempt_enable();
+	return;
+}
+
+static void ibs_eilvt_setup(void)
+{
+	/*
+	 * Force LVT offset assignment for family 10h: The offsets are
+	 * not assigned by the BIOS for this family, so the OS is
+	 * responsible for doing it. If the OS assignment fails, fall
+	 * back to BIOS settings and try to setup this.
+	 */
+	if (boot_cpu_data.x86 == 0x10)
+		force_ibs_eilvt_setup();
+}
+
+static inline int get_ibs_lvt_offset(void)
+{
+	u64 val;
+
+	rdmsrl(MSR_AMD64_IBSCTL, val);
+	if (!(val & IBSCTL_LVT_OFFSET_VALID))
+		return -EINVAL;
+
+	return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+static void setup_APIC_ibs(void *dummy)
+{
+	int offset;
+
+	offset = get_ibs_lvt_offset();
+	if (offset < 0)
+		goto failed;
+
+	if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+		return;
+failed:
+	pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
+		smp_processor_id());
+}
+
+static void clear_APIC_ibs(void *dummy)
+{
+	int offset;
+
+	offset = get_ibs_lvt_offset();
+	if (offset >= 0)
+		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+	clear_APIC_ibs(NULL);
+	return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+	ibs_eilvt_setup();
+	setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+	.resume		= perf_ibs_resume,
+	.suspend	= perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+	register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
+static int
+perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_STARTING:
+		setup_APIC_ibs(NULL);
+		break;
+	case CPU_DYING:
+		clear_APIC_ibs(NULL);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static __init int amd_ibs_init(void)
+{
+	u32 caps;
+	int ret = -EINVAL;
+
+	caps = __get_ibs_caps();
+	if (!caps)
+		return -ENODEV;	/* ibs not supported by the cpu */
+
+	ibs_eilvt_setup();
+
+	if (!ibs_eilvt_valid())
+		goto out;
+
+	perf_ibs_pm_init();
+	cpu_notifier_register_begin();
+	ibs_caps = caps;
+	/* make ibs_caps visible to other cpus: */
+	smp_mb();
+	smp_call_function(setup_APIC_ibs, NULL, 1);
+	__perf_cpu_notifier(perf_ibs_cpu_notifier);
+	cpu_notifier_register_done();
+
+	ret = perf_event_ibs_init();
+out:
+	if (ret)
+		pr_err("Failed to setup IBS, %d\n", ret);
+	return ret;
+}
+
+/* Since we need the pci subsystem to init ibs we can't do this earlier: */
+device_initcall(amd_ibs_init);
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
new file mode 100644
index 000000000..6011a573d
--- /dev/null
+++ b/arch/x86/events/amd/iommu.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/slab.h>
+
+#include "../perf_event.h"
+#include "iommu.h"
+
+#define COUNTER_SHIFT		16
+
+#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
+#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
+
+/* iommu pmu config masks */
+#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
+#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
+#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
+#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
+#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
+#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
+#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
+
+static struct perf_amd_iommu __perf_iommu;
+
+struct perf_amd_iommu {
+	struct pmu pmu;
+	u8 max_banks;
+	u8 max_counters;
+	u64 cntr_assign_mask;
+	raw_spinlock_t lock;
+	const struct attribute_group *attr_groups[4];
+};
+
+#define format_group	attr_groups[0]
+#define cpumask_group	attr_groups[1]
+#define events_group	attr_groups[2]
+#define null_group	attr_groups[3]
+
+/*---------------------------------------------
+ * sysfs format attributes
+ *---------------------------------------------*/
+PMU_FORMAT_ATTR(csource,    "config:0-7");
+PMU_FORMAT_ATTR(devid,      "config:8-23");
+PMU_FORMAT_ATTR(pasid,      "config:24-39");
+PMU_FORMAT_ATTR(domid,      "config:40-55");
+PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
+PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+
+static struct attribute *iommu_format_attrs[] = {
+	&format_attr_csource.attr,
+	&format_attr_devid.attr,
+	&format_attr_pasid.attr,
+	&format_attr_domid.attr,
+	&format_attr_devid_mask.attr,
+	&format_attr_pasid_mask.attr,
+	&format_attr_domid_mask.attr,
+	NULL,
+};
+
+static struct attribute_group amd_iommu_format_group = {
+	.name = "format",
+	.attrs = iommu_format_attrs,
+};
+
+/*---------------------------------------------
+ * sysfs events attributes
+ *---------------------------------------------*/
+struct amd_iommu_event_desc {
+	struct kobj_attribute attr;
+	const char *event;
+};
+
+static ssize_t _iommu_event_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	struct amd_iommu_event_desc *event =
+		container_of(attr, struct amd_iommu_event_desc, attr);
+	return sprintf(buf, "%s\n", event->event);
+}
+
+#define AMD_IOMMU_EVENT_DESC(_name, _event)			\
+{								\
+	.attr  = __ATTR(_name, 0444, _iommu_event_show, NULL),	\
+	.event = _event,					\
+}
+
+static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
+	AMD_IOMMU_EVENT_DESC(mem_pass_untrans,        "csource=0x01"),
+	AMD_IOMMU_EVENT_DESC(mem_pass_pretrans,       "csource=0x02"),
+	AMD_IOMMU_EVENT_DESC(mem_pass_excl,           "csource=0x03"),
+	AMD_IOMMU_EVENT_DESC(mem_target_abort,        "csource=0x04"),
+	AMD_IOMMU_EVENT_DESC(mem_trans_total,         "csource=0x05"),
+	AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit,   "csource=0x06"),
+	AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis,   "csource=0x07"),
+	AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit,   "csource=0x08"),
+	AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis,   "csource=0x09"),
+	AMD_IOMMU_EVENT_DESC(mem_dte_hit,             "csource=0x0a"),
+	AMD_IOMMU_EVENT_DESC(mem_dte_mis,             "csource=0x0b"),
+	AMD_IOMMU_EVENT_DESC(page_tbl_read_tot,       "csource=0x0c"),
+	AMD_IOMMU_EVENT_DESC(page_tbl_read_nst,       "csource=0x0d"),
+	AMD_IOMMU_EVENT_DESC(page_tbl_read_gst,       "csource=0x0e"),
+	AMD_IOMMU_EVENT_DESC(int_dte_hit,             "csource=0x0f"),
+	AMD_IOMMU_EVENT_DESC(int_dte_mis,             "csource=0x10"),
+	AMD_IOMMU_EVENT_DESC(cmd_processed,           "csource=0x11"),
+	AMD_IOMMU_EVENT_DESC(cmd_processed_inv,       "csource=0x12"),
+	AMD_IOMMU_EVENT_DESC(tlb_inv,                 "csource=0x13"),
+	AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h,    "csource=0x14"),
+	AMD_IOMMU_EVENT_DESC(vapic_int_non_guest,     "csource=0x15"),
+	AMD_IOMMU_EVENT_DESC(vapic_int_guest,         "csource=0x16"),
+	AMD_IOMMU_EVENT_DESC(smi_recv,                "csource=0x17"),
+	AMD_IOMMU_EVENT_DESC(smi_blk,                 "csource=0x18"),
+	{ /* end: all zeroes */ },
+};
+
+/*---------------------------------------------
+ * sysfs cpumask attributes
+ *---------------------------------------------*/
+static cpumask_t iommu_cpumask;
+
+static ssize_t _iommu_cpumask_show(struct device *dev,
+				   struct device_attribute *attr,
+				   char *buf)
+{
+	return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
+
+static struct attribute *iommu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group amd_iommu_cpumask_group = {
+	.attrs = iommu_cpumask_attrs,
+};
+
+/*---------------------------------------------*/
+
+static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+{
+	unsigned long flags;
+	int shift, bank, cntr, retval;
+	int max_banks = perf_iommu->max_banks;
+	int max_cntrs = perf_iommu->max_counters;
+
+	raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+
+	for (bank = 0, shift = 0; bank < max_banks; bank++) {
+		for (cntr = 0; cntr < max_cntrs; cntr++) {
+			shift = bank + (bank*3) + cntr;
+			if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
+				continue;
+			} else {
+				perf_iommu->cntr_assign_mask |= (1ULL<<shift);
+				retval = ((u16)((u16)bank<<8) | (u8)(cntr));
+				goto out;
+			}
+		}
+	}
+	retval = -ENOSPC;
+out:
+	raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+	return retval;
+}
+
+static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
+					u8 bank, u8 cntr)
+{
+	unsigned long flags;
+	int max_banks, max_cntrs;
+	int shift = 0;
+
+	max_banks = perf_iommu->max_banks;
+	max_cntrs = perf_iommu->max_counters;
+
+	if ((bank > max_banks) || (cntr > max_cntrs))
+		return -EINVAL;
+
+	shift = bank + cntr + (bank*3);
+
+	raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+	perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
+	raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+
+	return 0;
+}
+
+static int perf_iommu_event_init(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_amd_iommu *perf_iommu;
+	u64 config, config1;
+
+	/* test the event attr type check for PMU enumeration */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/*
+	 * IOMMU counters are shared across all cores.
+	 * Therefore, it does not support per-process mode.
+	 * Also, it does not support event sampling mode.
+	 */
+	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+		return -EINVAL;
+
+	/* IOMMU counters do not have usr/os/guest/host bits */
+	if (event->attr.exclude_user || event->attr.exclude_kernel ||
+	    event->attr.exclude_host || event->attr.exclude_guest)
+		return -EINVAL;
+
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	perf_iommu = &__perf_iommu;
+
+	if (event->pmu != &perf_iommu->pmu)
+		return -ENOENT;
+
+	if (perf_iommu) {
+		config = event->attr.config;
+		config1 = event->attr.config1;
+	} else {
+		return -EINVAL;
+	}
+
+	/* integrate with iommu base devid (0000), assume one iommu */
+	perf_iommu->max_banks =
+		amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
+	perf_iommu->max_counters =
+		amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
+	if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
+		return -EINVAL;
+
+	/* update the hw_perf_event struct with the iommu config data */
+	hwc->config = config;
+	hwc->extra_reg.config = config1;
+
+	return 0;
+}
+
+static void perf_iommu_enable_event(struct perf_event *ev)
+{
+	u8 csource = _GET_CSOURCE(ev);
+	u16 devid = _GET_DEVID(ev);
+	u64 reg = 0ULL;
+
+	reg = csource;
+	amd_iommu_pc_get_set_reg_val(devid,
+			_GET_BANK(ev), _GET_CNTR(ev) ,
+			 IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+
+	reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+	if (reg)
+		reg |= (1UL << 31);
+	amd_iommu_pc_get_set_reg_val(devid,
+			_GET_BANK(ev), _GET_CNTR(ev) ,
+			 IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+
+	reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+	if (reg)
+		reg |= (1UL << 31);
+	amd_iommu_pc_get_set_reg_val(devid,
+			_GET_BANK(ev), _GET_CNTR(ev) ,
+			 IOMMU_PC_PASID_MATCH_REG, &reg, true);
+
+	reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+	if (reg)
+		reg |= (1UL << 31);
+	amd_iommu_pc_get_set_reg_val(devid,
+			_GET_BANK(ev), _GET_CNTR(ev) ,
+			 IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+}
+
+static void perf_iommu_disable_event(struct perf_event *event)
+{
+	u64 reg = 0ULL;
+
+	amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+			_GET_BANK(event), _GET_CNTR(event),
+			IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+}
+
+static void perf_iommu_start(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	pr_debug("perf: amd_iommu:perf_iommu_start\n");
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+		return;
+
+	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+	hwc->state = 0;
+
+	if (flags & PERF_EF_RELOAD) {
+		u64 prev_raw_count =  local64_read(&hwc->prev_count);
+		amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+				_GET_BANK(event), _GET_CNTR(event),
+				IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+	}
+
+	perf_iommu_enable_event(event);
+	perf_event_update_userpage(event);
+
+}
+
+static void perf_iommu_read(struct perf_event *event)
+{
+	u64 count = 0ULL;
+	u64 prev_raw_count = 0ULL;
+	u64 delta = 0ULL;
+	struct hw_perf_event *hwc = &event->hw;
+	pr_debug("perf: amd_iommu:perf_iommu_read\n");
+
+	amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+				_GET_BANK(event), _GET_CNTR(event),
+				IOMMU_PC_COUNTER_REG, &count, false);
+
+	/* IOMMU pc counter register is only 48 bits */
+	count &= 0xFFFFFFFFFFFFULL;
+
+	prev_raw_count =  local64_read(&hwc->prev_count);
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					count) != prev_raw_count)
+		return;
+
+	/* Handling 48-bit counter overflowing */
+	delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+	delta >>= COUNTER_SHIFT;
+	local64_add(delta, &event->count);
+
+}
+
+static void perf_iommu_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 config;
+
+	pr_debug("perf: amd_iommu:perf_iommu_stop\n");
+
+	if (hwc->state & PERF_HES_UPTODATE)
+		return;
+
+	perf_iommu_disable_event(event);
+	WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+	hwc->state |= PERF_HES_STOPPED;
+
+	if (hwc->state & PERF_HES_UPTODATE)
+		return;
+
+	config = hwc->config;
+	perf_iommu_read(event);
+	hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_iommu_add(struct perf_event *event, int flags)
+{
+	int retval;
+	struct perf_amd_iommu *perf_iommu =
+			container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+	pr_debug("perf: amd_iommu:perf_iommu_add\n");
+	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+	/* request an iommu bank/counter */
+	retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
+	if (retval != -ENOSPC)
+		event->hw.extra_reg.reg = (u16)retval;
+	else
+		return retval;
+
+	if (flags & PERF_EF_START)
+		perf_iommu_start(event, PERF_EF_RELOAD);
+
+	return 0;
+}
+
+static void perf_iommu_del(struct perf_event *event, int flags)
+{
+	struct perf_amd_iommu *perf_iommu =
+			container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+	pr_debug("perf: amd_iommu:perf_iommu_del\n");
+	perf_iommu_stop(event, PERF_EF_UPDATE);
+
+	/* clear the assigned iommu bank/counter */
+	clear_avail_iommu_bnk_cntr(perf_iommu,
+				     _GET_BANK(event),
+				     _GET_CNTR(event));
+
+	perf_event_update_userpage(event);
+}
+
+static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+{
+	struct attribute **attrs;
+	struct attribute_group *attr_group;
+	int i = 0, j;
+
+	while (amd_iommu_v2_event_descs[i].attr.attr.name)
+		i++;
+
+	attr_group = kzalloc(sizeof(struct attribute *)
+		* (i + 1) + sizeof(*attr_group), GFP_KERNEL);
+	if (!attr_group)
+		return -ENOMEM;
+
+	attrs = (struct attribute **)(attr_group + 1);
+	for (j = 0; j < i; j++)
+		attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
+
+	attr_group->name = "events";
+	attr_group->attrs = attrs;
+	perf_iommu->events_group = attr_group;
+
+	return 0;
+}
+
+static __init void amd_iommu_pc_exit(void)
+{
+	if (__perf_iommu.events_group != NULL) {
+		kfree(__perf_iommu.events_group);
+		__perf_iommu.events_group = NULL;
+	}
+}
+
+static __init int _init_perf_amd_iommu(
+	struct perf_amd_iommu *perf_iommu, char *name)
+{
+	int ret;
+
+	raw_spin_lock_init(&perf_iommu->lock);
+
+	/* Init format attributes */
+	perf_iommu->format_group = &amd_iommu_format_group;
+
+	/* Init cpumask attributes to only core 0 */
+	cpumask_set_cpu(0, &iommu_cpumask);
+	perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
+
+	/* Init events attributes */
+	if (_init_events_attrs(perf_iommu) != 0)
+		pr_err("perf: amd_iommu: Only support raw events.\n");
+
+	/* Init null attributes */
+	perf_iommu->null_group = NULL;
+	perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+
+	ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
+	if (ret) {
+		pr_err("perf: amd_iommu: Failed to initialized.\n");
+		amd_iommu_pc_exit();
+	} else {
+		pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
+			amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
+			amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+	}
+
+	return ret;
+}
+
+static struct perf_amd_iommu __perf_iommu = {
+	.pmu = {
+		.task_ctx_nr    = perf_invalid_context,
+		.event_init	= perf_iommu_event_init,
+		.add		= perf_iommu_add,
+		.del		= perf_iommu_del,
+		.start		= perf_iommu_start,
+		.stop		= perf_iommu_stop,
+		.read		= perf_iommu_read,
+	},
+	.max_banks		= 0x00,
+	.max_counters		= 0x00,
+	.cntr_assign_mask	= 0ULL,
+	.format_group		= NULL,
+	.cpumask_group		= NULL,
+	.events_group		= NULL,
+	.null_group		= NULL,
+};
+
+static __init int amd_iommu_pc_init(void)
+{
+	/* Make sure the IOMMU PC resource is available */
+	if (!amd_iommu_pc_supported())
+		return -ENODEV;
+
+	_init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+
+	return 0;
+}
+
+device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
new file mode 100644
index 000000000..845d17327
--- /dev/null
+++ b/arch/x86/events/amd/iommu.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _PERF_EVENT_AMD_IOMMU_H_
+#define _PERF_EVENT_AMD_IOMMU_H_
+
+/* iommu pc mmio region register indexes */
+#define IOMMU_PC_COUNTER_REG			0x00
+#define IOMMU_PC_COUNTER_SRC_REG		0x08
+#define IOMMU_PC_PASID_MATCH_REG		0x10
+#define IOMMU_PC_DOMID_MATCH_REG		0x18
+#define IOMMU_PC_DEVID_MATCH_REG		0x20
+#define IOMMU_PC_COUNTER_REPORT_REG		0x28
+
+/* maximun specified bank/counters */
+#define PC_MAX_SPEC_BNKS			64
+#define PC_MAX_SPEC_CNTRS			16
+
+/* iommu pc reg masks*/
+#define IOMMU_BASE_DEVID			0x0000
+
+/* amd_iommu_init.c external support functions */
+extern bool amd_iommu_pc_supported(void);
+
+extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+
+extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+
+extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
+			u8 fxn, u64 *value, bool is_write);
+
+#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c
new file mode 100644
index 000000000..55a3529db
--- /dev/null
+++ b/arch/x86/events/amd/power.c
@@ -0,0 +1,353 @@
+/*
+ * Performance events - AMD Processor Power Reporting Mechanism
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Huang Rui <ray.huang@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define MSR_F15H_CU_PWR_ACCUMULATOR     0xc001007a
+#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b
+#define MSR_F15H_PTSC			0xc0010280
+
+/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */
+#define AMD_POWER_EVENT_MASK		0xFFULL
+
+/*
+ * Accumulated power status counters.
+ */
+#define AMD_POWER_EVENTSEL_PKG		1
+
+/*
+ * The ratio of compute unit power accumulator sample period to the
+ * PTSC period.
+ */
+static unsigned int cpu_pwr_sample_ratio;
+
+/* Maximum accumulated power of a compute unit. */
+static u64 max_cu_acc_power;
+
+static struct pmu pmu_class;
+
+/*
+ * Accumulated power represents the sum of each compute unit's (CU) power
+ * consumption. On any core of each CU we read the total accumulated power from
+ * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores
+ * which are picked to measure the power for the CUs they belong to.
+ */
+static cpumask_t cpu_mask;
+
+static void event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc;
+	u64 delta, tdelta;
+
+	prev_pwr_acc = hwc->pwr_acc;
+	prev_ptsc = hwc->ptsc;
+	rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc);
+	rdmsrl(MSR_F15H_PTSC, new_ptsc);
+
+	/*
+	 * Calculate the CU power consumption over a time period, the unit of
+	 * final value (delta) is micro-Watts. Then add it to the event count.
+	 */
+	if (new_pwr_acc < prev_pwr_acc) {
+		delta = max_cu_acc_power + new_pwr_acc;
+		delta -= prev_pwr_acc;
+	} else
+		delta = new_pwr_acc - prev_pwr_acc;
+
+	delta *= cpu_pwr_sample_ratio * 1000;
+	tdelta = new_ptsc - prev_ptsc;
+
+	do_div(delta, tdelta);
+	local64_add(delta, &event->count);
+}
+
+static void __pmu_event_start(struct perf_event *event)
+{
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	event->hw.state = 0;
+
+	rdmsrl(MSR_F15H_PTSC, event->hw.ptsc);
+	rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc);
+}
+
+static void pmu_event_start(struct perf_event *event, int mode)
+{
+	__pmu_event_start(event);
+}
+
+static void pmu_event_stop(struct perf_event *event, int mode)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	/* Mark event as deactivated and stopped. */
+	if (!(hwc->state & PERF_HES_STOPPED))
+		hwc->state |= PERF_HES_STOPPED;
+
+	/* Check if software counter update is necessary. */
+	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		/*
+		 * Drain the remaining delta count out of an event
+		 * that we are disabling:
+		 */
+		event_update(event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+}
+
+static int pmu_event_add(struct perf_event *event, int mode)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+	if (mode & PERF_EF_START)
+		__pmu_event_start(event);
+
+	return 0;
+}
+
+static void pmu_event_del(struct perf_event *event, int flags)
+{
+	pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int pmu_event_init(struct perf_event *event)
+{
+	u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK;
+
+	/* Only look at AMD power events. */
+	if (event->attr.type != pmu_class.type)
+		return -ENOENT;
+
+	/* Unsupported modes and filters. */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    /* no sampling */
+	    event->attr.sample_period)
+		return -EINVAL;
+
+	if (cfg != AMD_POWER_EVENTSEL_PKG)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void pmu_event_read(struct perf_event *event)
+{
+	event_update(event);
+}
+
+static ssize_t
+get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpumap_print_to_pagebuf(true, buf, &cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL);
+
+static struct attribute *pmu_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group pmu_attr_group = {
+	.attrs = pmu_attrs,
+};
+
+/*
+ * Currently it only supports to report the power of each
+ * processor/package.
+ */
+EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01");
+
+EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts");
+
+/* Convert the count from micro-Watts to milli-Watts. */
+EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3");
+
+static struct attribute *events_attr[] = {
+	EVENT_PTR(power_pkg),
+	EVENT_PTR(power_pkg_unit),
+	EVENT_PTR(power_pkg_scale),
+	NULL,
+};
+
+static struct attribute_group pmu_events_group = {
+	.name	= "events",
+	.attrs	= events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group pmu_format_group = {
+	.name	= "format",
+	.attrs	= formats_attr,
+};
+
+static const struct attribute_group *attr_groups[] = {
+	&pmu_attr_group,
+	&pmu_format_group,
+	&pmu_events_group,
+	NULL,
+};
+
+static struct pmu pmu_class = {
+	.attr_groups	= attr_groups,
+	/* system-wide only */
+	.task_ctx_nr	= perf_invalid_context,
+	.event_init	= pmu_event_init,
+	.add		= pmu_event_add,
+	.del		= pmu_event_del,
+	.start		= pmu_event_start,
+	.stop		= pmu_event_stop,
+	.read		= pmu_event_read,
+};
+
+static void power_cpu_exit(int cpu)
+{
+	int target;
+
+	if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask))
+		return;
+
+	/*
+	 * Find a new CPU on the same compute unit, if was set in cpumask
+	 * and still some CPUs on compute unit. Then migrate event and
+	 * context to new CPU.
+	 */
+	target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+	if (target < nr_cpumask_bits) {
+		cpumask_set_cpu(target, &cpu_mask);
+		perf_pmu_migrate_context(&pmu_class, cpu, target);
+	}
+}
+
+static void power_cpu_init(int cpu)
+{
+	int target;
+
+	/*
+	 * 1) If any CPU is set at cpu_mask in the same compute unit, do
+	 * nothing.
+	 * 2) If no CPU is set at cpu_mask in the same compute unit,
+	 * set current STARTING CPU.
+	 *
+	 * Note: if there is a CPU aside of the new one already in the
+	 * sibling mask, then it is also in cpu_mask.
+	 */
+	target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+	if (target >= nr_cpumask_bits)
+		cpumask_set_cpu(cpu, &cpu_mask);
+}
+
+static int
+power_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_FAILED:
+	case CPU_STARTING:
+		power_cpu_init(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+		power_cpu_exit(cpu);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block power_cpu_notifier_nb = {
+	.notifier_call = power_cpu_notifier,
+	.priority = CPU_PRI_PERF,
+};
+
+static const struct x86_cpu_id cpu_match[] = {
+	{ .vendor = X86_VENDOR_AMD, .family = 0x15 },
+	{},
+};
+
+static int __init amd_power_pmu_init(void)
+{
+	int cpu, target, ret;
+
+	if (!x86_match_cpu(cpu_match))
+		return 0;
+
+	if (!boot_cpu_has(X86_FEATURE_ACC_POWER))
+		return -ENODEV;
+
+	cpu_pwr_sample_ratio = cpuid_ecx(0x80000007);
+
+	if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) {
+		pr_err("Failed to read max compute unit power accumulator MSR\n");
+		return -ENODEV;
+	}
+
+	cpu_notifier_register_begin();
+
+	/* Choose one online core of each compute unit. */
+	for_each_online_cpu(cpu) {
+		target = cpumask_first(topology_sibling_cpumask(cpu));
+		if (!cpumask_test_cpu(target, &cpu_mask))
+			cpumask_set_cpu(target, &cpu_mask);
+	}
+
+	ret = perf_pmu_register(&pmu_class, "power", -1);
+	if (WARN_ON(ret)) {
+		pr_warn("AMD Power PMU registration failed\n");
+		goto out;
+	}
+
+	__register_cpu_notifier(&power_cpu_notifier_nb);
+
+	pr_info("AMD Power PMU detected\n");
+
+out:
+	cpu_notifier_register_done();
+
+	return ret;
+}
+module_init(amd_power_pmu_init);
+
+static void __exit amd_power_pmu_exit(void)
+{
+	cpu_notifier_register_begin();
+	__unregister_cpu_notifier(&power_cpu_notifier_nb);
+	cpu_notifier_register_done();
+
+	perf_pmu_unregister(&pmu_class);
+}
+module_exit(amd_power_pmu_exit);
+
+MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>");
+MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
new file mode 100644
index 000000000..3db9569e6
--- /dev/null
+++ b/arch/x86/events/amd/uncore.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+
+#include <asm/cpufeature.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#define NUM_COUNTERS_NB		4
+#define NUM_COUNTERS_L2		4
+#define MAX_COUNTERS		NUM_COUNTERS_NB
+
+#define RDPMC_BASE_NB		6
+#define RDPMC_BASE_L2		10
+
+#define COUNTER_SHIFT		16
+
+struct amd_uncore {
+	int id;
+	int refcnt;
+	int cpu;
+	int num_counters;
+	int rdpmc_base;
+	u32 msr_base;
+	cpumask_t *active_mask;
+	struct pmu *pmu;
+	struct perf_event *events[MAX_COUNTERS];
+	struct amd_uncore *free_when_cpu_online;
+};
+
+static struct amd_uncore * __percpu *amd_uncore_nb;
+static struct amd_uncore * __percpu *amd_uncore_l2;
+
+static struct pmu amd_nb_pmu;
+static struct pmu amd_l2_pmu;
+
+static cpumask_t amd_nb_active_mask;
+static cpumask_t amd_l2_active_mask;
+
+static bool is_nb_event(struct perf_event *event)
+{
+	return event->pmu->type == amd_nb_pmu.type;
+}
+
+static bool is_l2_event(struct perf_event *event)
+{
+	return event->pmu->type == amd_l2_pmu.type;
+}
+
+static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
+{
+	if (is_nb_event(event) && amd_uncore_nb)
+		return *per_cpu_ptr(amd_uncore_nb, event->cpu);
+	else if (is_l2_event(event) && amd_uncore_l2)
+		return *per_cpu_ptr(amd_uncore_l2, event->cpu);
+
+	return NULL;
+}
+
+static void amd_uncore_read(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 prev, new;
+	s64 delta;
+
+	/*
+	 * since we do not enable counter overflow interrupts,
+	 * we do not have to worry about prev_count changing on us
+	 */
+
+	prev = local64_read(&hwc->prev_count);
+	rdpmcl(hwc->event_base_rdpmc, new);
+	local64_set(&hwc->prev_count, new);
+	delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
+	delta >>= COUNTER_SHIFT;
+	local64_add(delta, &event->count);
+}
+
+static void amd_uncore_start(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (flags & PERF_EF_RELOAD)
+		wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+	hwc->state = 0;
+	wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
+	perf_event_update_userpage(event);
+}
+
+static void amd_uncore_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	wrmsrl(hwc->config_base, hwc->config);
+	hwc->state |= PERF_HES_STOPPED;
+
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		amd_uncore_read(event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+}
+
+static int amd_uncore_add(struct perf_event *event, int flags)
+{
+	int i;
+	struct amd_uncore *uncore = event_to_amd_uncore(event);
+	struct hw_perf_event *hwc = &event->hw;
+
+	/* are we already assigned? */
+	if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
+		goto out;
+
+	for (i = 0; i < uncore->num_counters; i++) {
+		if (uncore->events[i] == event) {
+			hwc->idx = i;
+			goto out;
+		}
+	}
+
+	/* if not, take the first available counter */
+	hwc->idx = -1;
+	for (i = 0; i < uncore->num_counters; i++) {
+		if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
+			hwc->idx = i;
+			break;
+		}
+	}
+
+out:
+	if (hwc->idx == -1)
+		return -EBUSY;
+
+	hwc->config_base = uncore->msr_base + (2 * hwc->idx);
+	hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
+	hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+	if (flags & PERF_EF_START)
+		amd_uncore_start(event, PERF_EF_RELOAD);
+
+	return 0;
+}
+
+static void amd_uncore_del(struct perf_event *event, int flags)
+{
+	int i;
+	struct amd_uncore *uncore = event_to_amd_uncore(event);
+	struct hw_perf_event *hwc = &event->hw;
+
+	amd_uncore_stop(event, PERF_EF_UPDATE);
+
+	for (i = 0; i < uncore->num_counters; i++) {
+		if (cmpxchg(&uncore->events[i], event, NULL) == event)
+			break;
+	}
+
+	hwc->idx = -1;
+}
+
+static int amd_uncore_event_init(struct perf_event *event)
+{
+	struct amd_uncore *uncore;
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/*
+	 * NB and L2 counters (MSRs) are shared across all cores that share the
+	 * same NB / L2 cache. Interrupts can be directed to a single target
+	 * core, however, event counts generated by processes running on other
+	 * cores cannot be masked out. So we do not support sampling and
+	 * per-thread events.
+	 */
+	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+		return -EINVAL;
+
+	/* NB and L2 counters do not have usr/os/guest/host bits */
+	if (event->attr.exclude_user || event->attr.exclude_kernel ||
+	    event->attr.exclude_host || event->attr.exclude_guest)
+		return -EINVAL;
+
+	/* and we do not enable counter overflow interrupts */
+	hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
+	hwc->idx = -1;
+
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	uncore = event_to_amd_uncore(event);
+	if (!uncore)
+		return -ENODEV;
+
+	/*
+	 * since request can come in to any of the shared cores, we will remap
+	 * to a single common cpu.
+	 */
+	event->cpu = uncore->cpu;
+
+	return 0;
+}
+
+static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
+					    struct device_attribute *attr,
+					    char *buf)
+{
+	cpumask_t *active_mask;
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	if (pmu->type == amd_nb_pmu.type)
+		active_mask = &amd_nb_active_mask;
+	else if (pmu->type == amd_l2_pmu.type)
+		active_mask = &amd_l2_active_mask;
+	else
+		return 0;
+
+	return cpumap_print_to_pagebuf(true, buf, active_mask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
+
+static struct attribute *amd_uncore_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group amd_uncore_attr_group = {
+	.attrs = amd_uncore_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15");
+
+static struct attribute *amd_uncore_format_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	NULL,
+};
+
+static struct attribute_group amd_uncore_format_group = {
+	.name = "format",
+	.attrs = amd_uncore_format_attr,
+};
+
+static const struct attribute_group *amd_uncore_attr_groups[] = {
+	&amd_uncore_attr_group,
+	&amd_uncore_format_group,
+	NULL,
+};
+
+static struct pmu amd_nb_pmu = {
+	.attr_groups	= amd_uncore_attr_groups,
+	.name		= "amd_nb",
+	.event_init	= amd_uncore_event_init,
+	.add		= amd_uncore_add,
+	.del		= amd_uncore_del,
+	.start		= amd_uncore_start,
+	.stop		= amd_uncore_stop,
+	.read		= amd_uncore_read,
+};
+
+static struct pmu amd_l2_pmu = {
+	.attr_groups	= amd_uncore_attr_groups,
+	.name		= "amd_l2",
+	.event_init	= amd_uncore_event_init,
+	.add		= amd_uncore_add,
+	.del		= amd_uncore_del,
+	.start		= amd_uncore_start,
+	.stop		= amd_uncore_stop,
+	.read		= amd_uncore_read,
+};
+
+static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
+{
+	return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
+			cpu_to_node(cpu));
+}
+
+static int amd_uncore_cpu_up_prepare(unsigned int cpu)
+{
+	struct amd_uncore *uncore_nb = NULL, *uncore_l2;
+
+	if (amd_uncore_nb) {
+		uncore_nb = amd_uncore_alloc(cpu);
+		if (!uncore_nb)
+			goto fail;
+		uncore_nb->cpu = cpu;
+		uncore_nb->num_counters = NUM_COUNTERS_NB;
+		uncore_nb->rdpmc_base = RDPMC_BASE_NB;
+		uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
+		uncore_nb->active_mask = &amd_nb_active_mask;
+		uncore_nb->pmu = &amd_nb_pmu;
+		*per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
+	}
+
+	if (amd_uncore_l2) {
+		uncore_l2 = amd_uncore_alloc(cpu);
+		if (!uncore_l2)
+			goto fail;
+		uncore_l2->cpu = cpu;
+		uncore_l2->num_counters = NUM_COUNTERS_L2;
+		uncore_l2->rdpmc_base = RDPMC_BASE_L2;
+		uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
+		uncore_l2->active_mask = &amd_l2_active_mask;
+		uncore_l2->pmu = &amd_l2_pmu;
+		*per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
+	}
+
+	return 0;
+
+fail:
+	if (amd_uncore_nb)
+		*per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+	kfree(uncore_nb);
+	return -ENOMEM;
+}
+
+static struct amd_uncore *
+amd_uncore_find_online_sibling(struct amd_uncore *this,
+			       struct amd_uncore * __percpu *uncores)
+{
+	unsigned int cpu;
+	struct amd_uncore *that;
+
+	for_each_online_cpu(cpu) {
+		that = *per_cpu_ptr(uncores, cpu);
+
+		if (!that)
+			continue;
+
+		if (this == that)
+			continue;
+
+		if (this->id == that->id) {
+			that->free_when_cpu_online = this;
+			this = that;
+			break;
+		}
+	}
+
+	this->refcnt++;
+	return this;
+}
+
+static void amd_uncore_cpu_starting(unsigned int cpu)
+{
+	unsigned int eax, ebx, ecx, edx;
+	struct amd_uncore *uncore;
+
+	if (amd_uncore_nb) {
+		uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
+		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+		uncore->id = ecx & 0xff;
+
+		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
+		*per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+	}
+
+	if (amd_uncore_l2) {
+		unsigned int apicid = cpu_data(cpu).apicid;
+		unsigned int nshared;
+
+		uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
+		cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
+		nshared = ((eax >> 14) & 0xfff) + 1;
+		uncore->id = apicid - (apicid % nshared);
+
+		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
+		*per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+	}
+}
+
+static void uncore_online(unsigned int cpu,
+			  struct amd_uncore * __percpu *uncores)
+{
+	struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+	kfree(uncore->free_when_cpu_online);
+	uncore->free_when_cpu_online = NULL;
+
+	if (cpu == uncore->cpu)
+		cpumask_set_cpu(cpu, uncore->active_mask);
+}
+
+static void amd_uncore_cpu_online(unsigned int cpu)
+{
+	if (amd_uncore_nb)
+		uncore_online(cpu, amd_uncore_nb);
+
+	if (amd_uncore_l2)
+		uncore_online(cpu, amd_uncore_l2);
+}
+
+static void uncore_down_prepare(unsigned int cpu,
+				struct amd_uncore * __percpu *uncores)
+{
+	unsigned int i;
+	struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
+
+	if (this->cpu != cpu)
+		return;
+
+	/* this cpu is going down, migrate to a shared sibling if possible */
+	for_each_online_cpu(i) {
+		struct amd_uncore *that = *per_cpu_ptr(uncores, i);
+
+		if (cpu == i)
+			continue;
+
+		if (this == that) {
+			perf_pmu_migrate_context(this->pmu, cpu, i);
+			cpumask_clear_cpu(cpu, that->active_mask);
+			cpumask_set_cpu(i, that->active_mask);
+			that->cpu = i;
+			break;
+		}
+	}
+}
+
+static void amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+	if (amd_uncore_nb)
+		uncore_down_prepare(cpu, amd_uncore_nb);
+
+	if (amd_uncore_l2)
+		uncore_down_prepare(cpu, amd_uncore_l2);
+}
+
+static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
+{
+	struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+	if (cpu == uncore->cpu)
+		cpumask_clear_cpu(cpu, uncore->active_mask);
+
+	if (!--uncore->refcnt)
+		kfree(uncore);
+	*per_cpu_ptr(uncores, cpu) = NULL;
+}
+
+static void amd_uncore_cpu_dead(unsigned int cpu)
+{
+	if (amd_uncore_nb)
+		uncore_dead(cpu, amd_uncore_nb);
+
+	if (amd_uncore_l2)
+		uncore_dead(cpu, amd_uncore_l2);
+}
+
+static int
+amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
+			void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		if (amd_uncore_cpu_up_prepare(cpu))
+			return notifier_from_errno(-ENOMEM);
+		break;
+
+	case CPU_STARTING:
+		amd_uncore_cpu_starting(cpu);
+		break;
+
+	case CPU_ONLINE:
+		amd_uncore_cpu_online(cpu);
+		break;
+
+	case CPU_DOWN_PREPARE:
+		amd_uncore_cpu_down_prepare(cpu);
+		break;
+
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		amd_uncore_cpu_dead(cpu);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block amd_uncore_cpu_notifier_block = {
+	.notifier_call	= amd_uncore_cpu_notifier,
+	.priority	= CPU_PRI_PERF + 1,
+};
+
+static void __init init_cpu_already_online(void *dummy)
+{
+	unsigned int cpu = smp_processor_id();
+
+	amd_uncore_cpu_starting(cpu);
+	amd_uncore_cpu_online(cpu);
+}
+
+static void cleanup_cpu_online(void *dummy)
+{
+	unsigned int cpu = smp_processor_id();
+
+	amd_uncore_cpu_dead(cpu);
+}
+
+static int __init amd_uncore_init(void)
+{
+	unsigned int cpu, cpu2;
+	int ret = -ENODEV;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		goto fail_nodev;
+
+	if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+		goto fail_nodev;
+
+	if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
+		amd_uncore_nb = alloc_percpu(struct amd_uncore *);
+		if (!amd_uncore_nb) {
+			ret = -ENOMEM;
+			goto fail_nb;
+		}
+		ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+		if (ret)
+			goto fail_nb;
+
+		pr_info("perf: AMD NB counters detected\n");
+		ret = 0;
+	}
+
+	if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
+		amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
+		if (!amd_uncore_l2) {
+			ret = -ENOMEM;
+			goto fail_l2;
+		}
+		ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+		if (ret)
+			goto fail_l2;
+
+		pr_info("perf: AMD L2I counters detected\n");
+		ret = 0;
+	}
+
+	if (ret)
+		goto fail_nodev;
+
+	cpu_notifier_register_begin();
+
+	/* init cpus already online before registering for hotplug notifier */
+	for_each_online_cpu(cpu) {
+		ret = amd_uncore_cpu_up_prepare(cpu);
+		if (ret)
+			goto fail_online;
+		smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
+	}
+
+	__register_cpu_notifier(&amd_uncore_cpu_notifier_block);
+	cpu_notifier_register_done();
+
+	return 0;
+
+
+fail_online:
+	for_each_online_cpu(cpu2) {
+		if (cpu2 == cpu)
+			break;
+		smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
+	}
+	cpu_notifier_register_done();
+
+	/* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
+	amd_uncore_nb = amd_uncore_l2 = NULL;
+
+	if (boot_cpu_has(X86_FEATURE_PERFCTR_L2))
+		perf_pmu_unregister(&amd_l2_pmu);
+fail_l2:
+	if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
+		perf_pmu_unregister(&amd_nb_pmu);
+	if (amd_uncore_l2)
+		free_percpu(amd_uncore_l2);
+fail_nb:
+	if (amd_uncore_nb)
+		free_percpu(amd_uncore_nb);
+
+fail_nodev:
+	return ret;
+}
+device_initcall(amd_uncore_init);