From 57f0f512b273f60d52568b8c6b77e17f5636edc0 Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Wed, 5 Aug 2015 17:04:01 -0300 Subject: Initial import --- arch/metag/kernel/.gitignore | 1 + arch/metag/kernel/Makefile | 39 ++ arch/metag/kernel/asm-offsets.c | 14 + arch/metag/kernel/cachepart.c | 131 +++++ arch/metag/kernel/clock.c | 110 ++++ arch/metag/kernel/core_reg.c | 117 +++++ arch/metag/kernel/da.c | 25 + arch/metag/kernel/devtree.c | 71 +++ arch/metag/kernel/dma.c | 500 ++++++++++++++++++ arch/metag/kernel/ftrace.c | 123 +++++ arch/metag/kernel/ftrace_stub.S | 62 +++ arch/metag/kernel/head.S | 65 +++ arch/metag/kernel/irq.c | 292 +++++++++++ arch/metag/kernel/kick.c | 110 ++++ arch/metag/kernel/machines.c | 20 + arch/metag/kernel/metag_ksyms.c | 54 ++ arch/metag/kernel/module.c | 284 +++++++++++ arch/metag/kernel/perf/Makefile | 3 + arch/metag/kernel/perf/perf_event.c | 888 ++++++++++++++++++++++++++++++++ arch/metag/kernel/perf/perf_event.h | 106 ++++ arch/metag/kernel/perf_callchain.c | 96 ++++ arch/metag/kernel/process.c | 444 ++++++++++++++++ arch/metag/kernel/ptrace.c | 414 +++++++++++++++ arch/metag/kernel/setup.c | 625 +++++++++++++++++++++++ arch/metag/kernel/signal.c | 334 ++++++++++++ arch/metag/kernel/smp.c | 665 ++++++++++++++++++++++++ arch/metag/kernel/stacktrace.c | 187 +++++++ arch/metag/kernel/sys_metag.c | 180 +++++++ arch/metag/kernel/tbiunexp.S | 22 + arch/metag/kernel/tcm.c | 151 ++++++ arch/metag/kernel/time.c | 25 + arch/metag/kernel/topology.c | 78 +++ arch/metag/kernel/traps.c | 989 ++++++++++++++++++++++++++++++++++++ arch/metag/kernel/user_gateway.S | 97 ++++ arch/metag/kernel/vmlinux.lds.S | 71 +++ 35 files changed, 7393 insertions(+) create mode 100644 arch/metag/kernel/.gitignore create mode 100644 arch/metag/kernel/Makefile create mode 100644 arch/metag/kernel/asm-offsets.c create mode 100644 arch/metag/kernel/cachepart.c create mode 100644 arch/metag/kernel/clock.c create mode 100644 arch/metag/kernel/core_reg.c create mode 100644 arch/metag/kernel/da.c create mode 100644 arch/metag/kernel/devtree.c create mode 100644 arch/metag/kernel/dma.c create mode 100644 arch/metag/kernel/ftrace.c create mode 100644 arch/metag/kernel/ftrace_stub.S create mode 100644 arch/metag/kernel/head.S create mode 100644 arch/metag/kernel/irq.c create mode 100644 arch/metag/kernel/kick.c create mode 100644 arch/metag/kernel/machines.c create mode 100644 arch/metag/kernel/metag_ksyms.c create mode 100644 arch/metag/kernel/module.c create mode 100644 arch/metag/kernel/perf/Makefile create mode 100644 arch/metag/kernel/perf/perf_event.c create mode 100644 arch/metag/kernel/perf/perf_event.h create mode 100644 arch/metag/kernel/perf_callchain.c create mode 100644 arch/metag/kernel/process.c create mode 100644 arch/metag/kernel/ptrace.c create mode 100644 arch/metag/kernel/setup.c create mode 100644 arch/metag/kernel/signal.c create mode 100644 arch/metag/kernel/smp.c create mode 100644 arch/metag/kernel/stacktrace.c create mode 100644 arch/metag/kernel/sys_metag.c create mode 100644 arch/metag/kernel/tbiunexp.S create mode 100644 arch/metag/kernel/tcm.c create mode 100644 arch/metag/kernel/time.c create mode 100644 arch/metag/kernel/topology.c create mode 100644 arch/metag/kernel/traps.c create mode 100644 arch/metag/kernel/user_gateway.S create mode 100644 arch/metag/kernel/vmlinux.lds.S diff --git a/arch/metag/kernel/.gitignore b/arch/metag/kernel/.gitignore new file mode 100644 index 000000000..c5f676c3c --- /dev/null +++
b/arch/metag/kernel/.gitignore @@ -0,0 +1 @@ +vmlinux.lds diff --git a/arch/metag/kernel/Makefile b/arch/metag/kernel/Makefile new file mode 100644 index 000000000..d7675f4a5 --- /dev/null +++ b/arch/metag/kernel/Makefile @@ -0,0 +1,39 @@ +# +# Makefile for the Linux/Meta kernel. +# + +extra-y += head.o +extra-y += vmlinux.lds + +obj-y += cachepart.o +obj-y += clock.o +obj-y += core_reg.o +obj-y += devtree.o +obj-y += dma.o +obj-y += irq.o +obj-y += kick.o +obj-y += machines.o +obj-y += process.o +obj-y += ptrace.o +obj-y += setup.o +obj-y += signal.o +obj-y += stacktrace.o +obj-y += sys_metag.o +obj-y += tbiunexp.o +obj-y += time.o +obj-y += topology.o +obj-y += traps.o +obj-y += user_gateway.o + +obj-$(CONFIG_PERF_EVENTS) += perf/ + +obj-$(CONFIG_METAG_COREMEM) += coremem.o +obj-$(CONFIG_METAG_DA) += da.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace_stub.o +obj-$(CONFIG_MODULES) += metag_ksyms.o +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_METAG_SUSPEND_MEM) += suspend.o +obj-$(CONFIG_METAG_USER_TCM) += tcm.o diff --git a/arch/metag/kernel/asm-offsets.c b/arch/metag/kernel/asm-offsets.c new file mode 100644 index 000000000..bfc9205f9 --- /dev/null +++ b/arch/metag/kernel/asm-offsets.c @@ -0,0 +1,14 @@ +/* + * This program is used to generate definitions needed by + * assembly language modules. + * + */ + +#include +#include + +int main(void) +{ + DEFINE(THREAD_INFO_SIZE, sizeof(struct thread_info)); + return 0; +} diff --git a/arch/metag/kernel/cachepart.c b/arch/metag/kernel/cachepart.c new file mode 100644 index 000000000..04b7d4f84 --- /dev/null +++ b/arch/metag/kernel/cachepart.c @@ -0,0 +1,131 @@ +/* + * Meta cache partition manipulation. + * + * Copyright 2010 Imagination Technologies Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define SYSC_DCPART(n) (SYSC_DCPART0 + SYSC_xCPARTn_STRIDE * (n)) +#define SYSC_ICPART(n) (SYSC_ICPART0 + SYSC_xCPARTn_STRIDE * (n)) + +#define CACHE_ASSOCIATIVITY 4 /* 4 way set-associative */ +#define ICACHE 0 +#define DCACHE 1 + +/* The CORE_CONFIG2 register is not available on Meta 1 */ +#ifdef CONFIG_METAG_META21 +unsigned int get_dcache_size(void) +{ + unsigned int config2 = metag_in32(METAC_CORE_CONFIG2); + unsigned int sz = 0x1000 << ((config2 & METAC_CORECFG2_DCSZ_BITS) + >> METAC_CORECFG2_DCSZ_S); + if (config2 & METAC_CORECFG2_DCSMALL_BIT) + sz >>= 6; + return sz; +} + +unsigned int get_icache_size(void) +{ + unsigned int config2 = metag_in32(METAC_CORE_CONFIG2); + unsigned int sz = 0x1000 << ((config2 & METAC_CORE_C2ICSZ_BITS) + >> METAC_CORE_C2ICSZ_S); + if (config2 & METAC_CORECFG2_ICSMALL_BIT) + sz >>= 6; + return sz; +} + +unsigned int get_global_dcache_size(void) +{ + unsigned int cpart = metag_in32(SYSC_DCPART(hard_processor_id())); + unsigned int temp = cpart & SYSC_xCPARTG_AND_BITS; + return (get_dcache_size() * ((temp >> SYSC_xCPARTG_AND_S) + 1)) >> 4; +} + +unsigned int get_global_icache_size(void) +{ + unsigned int cpart = metag_in32(SYSC_ICPART(hard_processor_id())); + unsigned int temp = cpart & SYSC_xCPARTG_AND_BITS; + return (get_icache_size() * ((temp >> SYSC_xCPARTG_AND_S) + 1)) >> 4; +} + +static int get_thread_cache_size(unsigned int cache, int thread_id) +{ + unsigned int cache_size; + unsigned int t_cache_part; + unsigned int isEnabled; + unsigned int offset = 0; + isEnabled = (cache == DCACHE ?
metag_in32(MMCU_DCACHE_CTRL_ADDR) & 0x1 : + metag_in32(MMCU_ICACHE_CTRL_ADDR) & 0x1); + if (!isEnabled) + return 0; +#if PAGE_OFFSET >= LINGLOBAL_BASE + /* Checking for global cache */ + cache_size = (cache == DCACHE ? get_global_dcache_size() : + get_global_icache_size()); + offset = 8; +#else + cache_size = (cache == DCACHE ? get_dcache_size() : + get_icache_size()); +#endif + t_cache_part = (cache == DCACHE ? + (metag_in32(SYSC_DCPART(thread_id)) >> offset) & 0xF : + (metag_in32(SYSC_ICPART(thread_id)) >> offset) & 0xF); + switch (t_cache_part) { + case 0xF: + return cache_size; + case 0x7: + return cache_size / 2; + case 0x3: + return cache_size / 4; + case 0x1: + return cache_size / 8; + case 0: + return cache_size / 16; + } + return -1; +} + +void check_for_cache_aliasing(int thread_id) +{ + int thread_cache_size; + unsigned int cache_type; + for (cache_type = ICACHE; cache_type <= DCACHE; cache_type++) { + thread_cache_size = + get_thread_cache_size(cache_type, thread_id); + if (thread_cache_size < 0) + pr_emerg("Can't read %s cache size\n", + cache_type ? "DCACHE" : "ICACHE"); + else if (thread_cache_size == 0) + /* Cache is off. No need to check for aliasing */ + continue; + if (thread_cache_size / CACHE_ASSOCIATIVITY > PAGE_SIZE) { + pr_emerg("Potential cache aliasing detected in %s on Thread %d\n", + cache_type ? "DCACHE" : "ICACHE", thread_id); + pr_warn("Total %s size: %u bytes\n", + cache_type ? "DCACHE" : "ICACHE", + cache_type ? get_dcache_size() + : get_icache_size()); + pr_warn("Thread %s size: %d bytes\n", + cache_type ? "DCACHE" : "ICACHE", + thread_cache_size); + pr_warn("Page Size: %lu bytes\n", PAGE_SIZE); + panic("Potential cache aliasing detected"); + } + } +} + +#else + +void check_for_cache_aliasing(int thread_id) +{ + return; +} + +#endif diff --git a/arch/metag/kernel/clock.c b/arch/metag/kernel/clock.c new file mode 100644 index 000000000..6339c9c6d --- /dev/null +++ b/arch/metag/kernel/clock.c @@ -0,0 +1,110 @@ +/* + * arch/metag/kernel/clock.c + * + * Copyright (C) 2012 Imagination Technologies Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include + +struct meta_clock_desc _meta_clock; + /* Default machine get_core_freq callback. */ +static unsigned long get_core_freq_default(void) +{ +#ifdef CONFIG_METAG_META21 + /* + * Meta 2 cores divide down the core clock for the Meta timers, so we + * can estimate the core clock from the divider. + */ + return (metag_in32(EXPAND_TIMER_DIV) + 1) * 1000000; +#else + /* + * On Meta 1 we don't know the core clock, but assuming the Meta timer + * is correct it can be estimated based on loops_per_jiffy. + */ + return (loops_per_jiffy * HZ * 5) >> 1; +#endif +} + +static struct clk *clk_core; + +/* Clk based get_core_freq callback. */ +static unsigned long get_core_freq_clk(void) +{ + return clk_get_rate(clk_core); +} + +/** + * init_metag_core_clock() - Set up core clock from devicetree. + * + * Checks to see if a "core" clock is provided in the device tree, and overrides + * the get_core_freq callback to use it. + */ +static void __init init_metag_core_clock(void) +{ + /* + * See if a core clock is provided by the devicetree (and + * registered by the init callback above).
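+ * A devicetree fragment satisfying this lookup might look like (an + * illustrative sketch only, not part of this patch): + * + * meta { compatible = "img,meta"; clocks = <&coreclk>; + * clock-names = "core"; };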
+ */ + struct device_node *node; + node = of_find_compatible_node(NULL, NULL, "img,meta"); + if (!node) { + pr_warn("%s: no compatible img,meta DT node found\n", + __func__); + return; + } + + clk_core = of_clk_get_by_name(node, "core"); + if (IS_ERR(clk_core)) { + pr_warn("%s: no core clock found in DT\n", + __func__); + return; + } + + /* + * Override the core frequency callback to use + * this clk. + */ + _meta_clock.get_core_freq = get_core_freq_clk; +} + +/** + * init_metag_clocks() - Set up clocks from devicetree. + * + * Set up important clocks from device tree. In particular any needed for clock + * sources. + */ +void __init init_metag_clocks(void) +{ + init_metag_core_clock(); + + pr_info("Core clock frequency: %lu Hz\n", get_coreclock()); +} + +/** + * setup_meta_clocks() - Early set up of the Meta clock. + * @desc: Clock descriptor usually provided by machine description + * + * Ensures all callbacks are valid. + */ +void __init setup_meta_clocks(struct meta_clock_desc *desc) +{ + /* copy callbacks */ + if (desc) + _meta_clock = *desc; + + /* set fallback functions */ + if (!_meta_clock.get_core_freq) + _meta_clock.get_core_freq = get_core_freq_default; +} + diff --git a/arch/metag/kernel/core_reg.c b/arch/metag/kernel/core_reg.c new file mode 100644 index 000000000..671cce8c3 --- /dev/null +++ b/arch/metag/kernel/core_reg.c @@ -0,0 +1,117 @@ +/* + * Support for reading and writing Meta core internal registers. + * + * Copyright (C) 2011 Imagination Technologies Ltd. + * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define UNIT_BIT_MASK TXUXXRXRQ_UXX_BITS +#define REG_BIT_MASK TXUXXRXRQ_RX_BITS +#define THREAD_BIT_MASK TXUXXRXRQ_TX_BITS + +#define UNIT_SHIFTS TXUXXRXRQ_UXX_S +#define REG_SHIFTS TXUXXRXRQ_RX_S +#define THREAD_SHIFTS TXUXXRXRQ_TX_S + +#define UNIT_VAL(x) (((x) << UNIT_SHIFTS) & UNIT_BIT_MASK) +#define REG_VAL(x) (((x) << REG_SHIFTS) & REG_BIT_MASK) +#define THREAD_VAL(x) (((x) << THREAD_SHIFTS) & THREAD_BIT_MASK) + +/* + * core_reg_write() - modify the content of a register in a core unit. + * @unit: The unit to be modified. + * @reg: Register number within the unit. + * @thread: The thread we want to access. + * @val: The new value to write. + * + * Check asm/metag_regs.h for a list/defines of supported units (ie: TXUPC_ID, + * TXUTR_ID, etc), and regnums within the units (ie: TXMASKI_REGNUM, + * TXPOLLI_REGNUM, etc). + */ +void core_reg_write(int unit, int reg, int thread, unsigned int val) +{ + unsigned long flags; + + /* TXUCT_ID has its own memory mapped registers */ + if (unit == TXUCT_ID) { + void __iomem *cu_reg = __CU_addr(thread, reg); + metag_out32(val, cu_reg); + return; + } + + __global_lock2(flags); + + /* wait for ready */ + while (!(metag_in32(TXUXXRXRQ) & TXUXXRXRQ_DREADY_BIT)) + udelay(10); + + /* set the value to write */ + metag_out32(val, TXUXXRXDT); + + /* set the register to write */ + val = UNIT_VAL(unit) | REG_VAL(reg) | THREAD_VAL(thread); + metag_out32(val, TXUXXRXRQ); + + /* wait for finish */ + while (!(metag_in32(TXUXXRXRQ) & TXUXXRXRQ_DREADY_BIT)) + udelay(10); + + __global_unlock2(flags); +} +EXPORT_SYMBOL(core_reg_write); + +/* + * core_reg_read() - read the content of a register in a core unit. + * @unit: The unit to be modified. + * @reg: Register number within the unit. + * @thread: The thread we want to access. 
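+ * + * For example (an illustrative call, not taken from this file): + * core_reg_read(TXUCT_ID, TXMASKI_REGNUM, 0) returns thread 0's + * interrupt mask via the memory-mapped control unit path.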
+ * + * Check asm/metag_regs.h for a list/defines of supported units (ie: TXUPC_ID, + * TXUTR_ID, etc), and regnums within the units (ie: TXMASKI_REGNUM, + * TXPOLLI_REGNUM, etc). + */ +unsigned int core_reg_read(int unit, int reg, int thread) +{ + unsigned long flags; + unsigned int val; + + /* TXUCT_ID has its own memory mapped registers */ + if (unit == TXUCT_ID) { + void __iomem *cu_reg = __CU_addr(thread, reg); + val = metag_in32(cu_reg); + return val; + } + + __global_lock2(flags); + + /* wait for ready */ + while (!(metag_in32(TXUXXRXRQ) & TXUXXRXRQ_DREADY_BIT)) + udelay(10); + + /* set the register to read */ + val = (UNIT_VAL(unit) | REG_VAL(reg) | THREAD_VAL(thread) | + TXUXXRXRQ_RDnWR_BIT); + metag_out32(val, TXUXXRXRQ); + + /* wait for finish */ + while (!(metag_in32(TXUXXRXRQ) & TXUXXRXRQ_DREADY_BIT)) + udelay(10); + + /* read the register value */ + val = metag_in32(TXUXXRXDT); + + __global_unlock2(flags); + + return val; +} +EXPORT_SYMBOL(core_reg_read); diff --git a/arch/metag/kernel/da.c b/arch/metag/kernel/da.c new file mode 100644 index 000000000..a35dbed6f --- /dev/null +++ b/arch/metag/kernel/da.c @@ -0,0 +1,25 @@ +/* + * Meta DA JTAG debugger control. + * + * Copyright 2012 Imagination Technologies Ltd. + */ + + +#include +#include +#include +#include +#include + +bool _metag_da_present; +EXPORT_SYMBOL_GPL(_metag_da_present); + +int __init metag_da_probe(void) +{ + _metag_da_present = (metag_in32(T0VECINT_BHALT) == 1); + if (_metag_da_present) + pr_info("DA present\n"); + else + pr_info("DA not present\n"); + return 0; +} diff --git a/arch/metag/kernel/devtree.c b/arch/metag/kernel/devtree.c new file mode 100644 index 000000000..18dd7aea9 --- /dev/null +++ b/arch/metag/kernel/devtree.c @@ -0,0 +1,71 @@ +/* + * linux/arch/metag/kernel/devtree.c + * + * Copyright (C) 2012 Imagination Technologies Ltd. + * + * Based on ARM version: + * Copyright (C) 2009 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +void __init early_init_dt_add_memory_arch(u64 base, u64 size) +{ + pr_err("%s(%llx, %llx)\n", + __func__, base, size); +} + +void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +{ + return alloc_bootmem_align(size, align); +} + +static const void * __init arch_get_next_mach(const char *const **match) +{ + static const struct machine_desc *mdesc = __arch_info_begin; + const struct machine_desc *m = mdesc; + + if (m >= __arch_info_end) + return NULL; + + mdesc++; + *match = m->dt_compat; + return m; +} + +/** + * setup_machine_fdt - Machine setup when a dtb was passed to the kernel + * @dt: virtual address pointer to dt blob + * + * If a dtb was passed to the kernel, then use it to choose the correct + * machine_desc and to set up the system.
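+ * + * Returns the matched machine_desc, or NULL if @dt does not scan as a + * valid flattened device tree. An unrecognised machine never returns + * here (dump_machine_table() does not return).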
+ */ +const struct machine_desc * __init setup_machine_fdt(void *dt) +{ + const struct machine_desc *mdesc; + + /* check device tree validity */ + if (!early_init_dt_scan(dt)) + return NULL; + + mdesc = of_flat_dt_match_machine(NULL, arch_get_next_mach); + if (!mdesc) + dump_machine_table(); /* does not return */ + pr_info("Machine name: %s\n", mdesc->name); + + return mdesc; +} diff --git a/arch/metag/kernel/dma.c b/arch/metag/kernel/dma.c new file mode 100644 index 000000000..c700d6250 --- /dev/null +++ b/arch/metag/kernel/dma.c @@ -0,0 +1,500 @@ +/* + * Meta version derived from arch/powerpc/lib/dma-noncoherent.c + * Copyright (C) 2008 Imagination Technologies Ltd. + * + * PowerPC version derived from arch/arm/mm/consistent.c + * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) + * + * Copyright (C) 2000 Russell King + * + * Consistent memory allocators. Used for DMA devices that want to + * share uncached memory with the processor core. The function return + * is the virtual address and 'dma_handle' is the physical address. + * Mostly stolen from the ARM port, with some changes for PowerPC. + * -- Dan + * + * Reorganized to get rid of the arch-specific consistent_* functions + * and provide non-coherent implementations for the DMA API. -Matt + * + * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent() + * implementation. This is pulled straight from ARM and barely + * modified. -Matt + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_START) \ + >> PAGE_SHIFT) + +static u64 get_coherent_dma_mask(struct device *dev) +{ + u64 mask = ~0ULL; + + if (dev) { + mask = dev->coherent_dma_mask; + + /* + * Sanity check the DMA mask - it must be non-zero, and + * must be able to be satisfied by a DMA allocation. + */ + if (mask == 0) { + dev_warn(dev, "coherent DMA mask is unset\n"); + return 0; + } + } + + return mask; +} +/* + * This is the page table (2MB) covering uncached, DMA consistent allocations + */ +static pte_t *consistent_pte; +static DEFINE_SPINLOCK(consistent_lock); + +/* + * VM region handling support. + * + * This should become something generic, handling VM region allocations for + * vmalloc and similar (ioremap, module space, etc). + * + * I envisage vmalloc()'s supporting vm_struct becoming: + * + * struct vm_struct { + * struct metag_vm_region region; + * unsigned long flags; + * struct page **pages; + * unsigned int nr_pages; + * unsigned long phys_addr; + * }; + * + * get_vm_area() would then call metag_vm_region_alloc with an appropriate + * struct metag_vm_region head (eg): + * + * struct metag_vm_region vmalloc_head = { + * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), + * .vm_start = VMALLOC_START, + * .vm_end = VMALLOC_END, + * }; + * + * However, vmalloc_head.vm_start is variable (typically, it is dependent on + * the amount of RAM found at boot time.) I would imagine that get_vm_area() + * would have to initialise this each time prior to calling + * metag_vm_region_alloc(). 
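+ * + * metag_vm_region_alloc() below does a first-fit walk of the + * address-sorted vm_list: each candidate start is the previous + * region's vm_end, and the search gives up once addr + size would + * wrap or pass the head's vm_end.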
+ */ +struct metag_vm_region { + struct list_head vm_list; + unsigned long vm_start; + unsigned long vm_end; + struct page *vm_pages; + int vm_active; +}; + +static struct metag_vm_region consistent_head = { + .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), + .vm_start = CONSISTENT_START, + .vm_end = CONSISTENT_END, +}; + +static struct metag_vm_region *metag_vm_region_alloc(struct metag_vm_region + *head, size_t size, + gfp_t gfp) +{ + unsigned long addr = head->vm_start, end = head->vm_end - size; + unsigned long flags; + struct metag_vm_region *c, *new; + + new = kmalloc(sizeof(struct metag_vm_region), gfp); + if (!new) + goto out; + + spin_lock_irqsave(&consistent_lock, flags); + + list_for_each_entry(c, &head->vm_list, vm_list) { + if ((addr + size) < addr) + goto nospc; + if ((addr + size) <= c->vm_start) + goto found; + addr = c->vm_end; + if (addr > end) + goto nospc; + } + +found: + /* + * Insert this entry _before_ the one we found. + */ + list_add_tail(&new->vm_list, &c->vm_list); + new->vm_start = addr; + new->vm_end = addr + size; + new->vm_active = 1; + + spin_unlock_irqrestore(&consistent_lock, flags); + return new; + +nospc: + spin_unlock_irqrestore(&consistent_lock, flags); + kfree(new); +out: + return NULL; +} + +static struct metag_vm_region *metag_vm_region_find(struct metag_vm_region + *head, unsigned long addr) +{ + struct metag_vm_region *c; + + list_for_each_entry(c, &head->vm_list, vm_list) { + if (c->vm_active && c->vm_start == addr) + goto out; + } + c = NULL; +out: + return c; +} + +/* + * Allocate DMA-coherent memory space and return both the kernel remapped + * virtual and bus address for that space. + */ +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *handle, gfp_t gfp) +{ + struct page *page; + struct metag_vm_region *c; + unsigned long order; + u64 mask = get_coherent_dma_mask(dev); + u64 limit; + + if (!consistent_pte) { + pr_err("%s: not initialised\n", __func__); + dump_stack(); + return NULL; + } + + if (!mask) + goto no_page; + size = PAGE_ALIGN(size); + limit = (mask + 1) & ~mask; + if ((limit && size >= limit) + || size >= (CONSISTENT_END - CONSISTENT_START)) { + pr_warn("coherent allocation too big (requested %#x mask %#Lx)\n", + size, mask); + return NULL; + } + + order = get_order(size); + + if (mask != 0xffffffff) + gfp |= GFP_DMA; + + page = alloc_pages(gfp, order); + if (!page) + goto no_page; + + /* + * Invalidate any data that might be lurking in the + * kernel direct-mapped region for device DMA. + */ + { + void *kaddr = page_address(page); + memset(kaddr, 0, size); + flush_dcache_region(kaddr, size); + } + + /* + * Allocate a virtual address in the consistent mapping region. + */ + c = metag_vm_region_alloc(&consistent_head, size, + gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); + if (c) { + unsigned long vaddr = c->vm_start; + pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr); + struct page *end = page + (1 << order); + + c->vm_pages = page; + split_page(page, order); + + /* + * Set the "dma handle" + */ + *handle = page_to_bus(page); + + do { + BUG_ON(!pte_none(*pte)); + + SetPageReserved(page); + set_pte_at(&init_mm, vaddr, + pte, mk_pte(page, + pgprot_writecombine + (PAGE_KERNEL))); + page++; + pte++; + vaddr += PAGE_SIZE; + } while (size -= PAGE_SIZE); + + /* + * Free the otherwise unused pages. 
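+ * split_page() above turned the high-order allocation into 1 << order + * independent pages, so the tail pages beyond the mapped size can be + * handed back individually here.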
+ */ + while (page < end) { + __free_page(page); + page++; + } + + return (void *)c->vm_start; + } + + if (page) + __free_pages(page, order); +no_page: + return NULL; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +/* + * free a page as defined by the above mapping. + */ +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + struct metag_vm_region *c; + unsigned long flags, addr; + pte_t *ptep; + + size = PAGE_ALIGN(size); + + spin_lock_irqsave(&consistent_lock, flags); + + c = metag_vm_region_find(&consistent_head, (unsigned long)vaddr); + if (!c) + goto no_area; + + c->vm_active = 0; + if ((c->vm_end - c->vm_start) != size) { + pr_err("%s: freeing wrong coherent size (%ld != %d)\n", + __func__, c->vm_end - c->vm_start, size); + dump_stack(); + size = c->vm_end - c->vm_start; + } + + ptep = consistent_pte + CONSISTENT_OFFSET(c->vm_start); + addr = c->vm_start; + do { + pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep); + unsigned long pfn; + + ptep++; + addr += PAGE_SIZE; + + if (!pte_none(pte) && pte_present(pte)) { + pfn = pte_pfn(pte); + + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + __free_reserved_page(page); + continue; + } + } + + pr_crit("%s: bad page in kernel page table\n", + __func__); + } while (size -= PAGE_SIZE); + + flush_tlb_kernel_range(c->vm_start, c->vm_end); + + list_del(&c->vm_list); + + spin_unlock_irqrestore(&consistent_lock, flags); + + kfree(c); + return; + +no_area: + spin_unlock_irqrestore(&consistent_lock, flags); + pr_err("%s: trying to free invalid coherent area: %p\n", + __func__, vaddr); + dump_stack(); +} +EXPORT_SYMBOL(dma_free_coherent); + + +static int dma_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size) +{ + int ret = -ENXIO; + + unsigned long flags, user_size, kern_size; + struct metag_vm_region *c; + + user_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + + spin_lock_irqsave(&consistent_lock, flags); + c = metag_vm_region_find(&consistent_head, (unsigned long)cpu_addr); + spin_unlock_irqrestore(&consistent_lock, flags); + + if (c) { + unsigned long off = vma->vm_pgoff; + + kern_size = (c->vm_end - c->vm_start) >> PAGE_SHIFT; + + if (off < kern_size && + user_size <= (kern_size - off)) { + ret = remap_pfn_range(vma, vma->vm_start, + page_to_pfn(c->vm_pages) + off, + user_size << PAGE_SHIFT, + vma->vm_page_prot); + } + } + + + return ret; +} + +int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size) +{ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + return dma_mmap(dev, vma, cpu_addr, dma_addr, size); +} +EXPORT_SYMBOL(dma_mmap_coherent); + +int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size) +{ + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + return dma_mmap(dev, vma, cpu_addr, dma_addr, size); +} +EXPORT_SYMBOL(dma_mmap_writecombine); + + + + +/* + * Initialise the consistent memory allocation. 
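+ * + * This allocates the single page table covering the consistent region + * (CONSISTENT_START to CONSISTENT_END) and copies its pmd entry into + * the hardware table at mmu_get_base() so both views of the mapping + * agree.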
+ */ +static int __init dma_alloc_init(void) +{ + pgd_t *pgd, *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + pte_t *pte; + int ret = 0; + + do { + int offset = pgd_index(CONSISTENT_START); + pgd = pgd_offset(&init_mm, CONSISTENT_START); + pud = pud_alloc(&init_mm, pgd, CONSISTENT_START); + pmd = pmd_alloc(&init_mm, pud, CONSISTENT_START); + WARN_ON(!pmd_none(*pmd)); + + pte = pte_alloc_kernel(pmd, CONSISTENT_START); + if (!pte) { + pr_err("%s: no pte tables\n", __func__); + ret = -ENOMEM; + break; + } + + pgd_k = ((pgd_t *) mmu_get_base()) + offset; + pud_k = pud_offset(pgd_k, CONSISTENT_START); + pmd_k = pmd_offset(pud_k, CONSISTENT_START); + set_pmd(pmd_k, *pmd); + + consistent_pte = pte; + } while (0); + + return ret; +} +early_initcall(dma_alloc_init); + +/* + * make an area consistent to devices. + */ +void dma_sync_for_device(void *vaddr, size_t size, int dma_direction) +{ + /* + * Ensure any writes get through the write combiner. This is necessary + * even with DMA_FROM_DEVICE, or the write may dirty the cache after + * we've invalidated it and get written back during the DMA. + */ + + barrier(); + + switch (dma_direction) { + case DMA_BIDIRECTIONAL: + /* + * Writeback to ensure the device can see our latest changes and + * so that we have no dirty lines, and invalidate the cache + * lines too in preparation for receiving the buffer back + * (dma_sync_for_cpu) later. + */ + flush_dcache_region(vaddr, size); + break; + case DMA_TO_DEVICE: + /* + * Writeback to ensure the device can see our latest changes. + * There's no need to invalidate as the device shouldn't write + * to the buffer. + */ + writeback_dcache_region(vaddr, size); + break; + case DMA_FROM_DEVICE: + /* + * Invalidate to ensure we have no dirty lines that could get + * written back during the DMA. It's also safe to flush + * (writeback) here if necessary. + */ + invalidate_dcache_region(vaddr, size); + break; + case DMA_NONE: + BUG(); + } + + wmb(); +} +EXPORT_SYMBOL(dma_sync_for_device); + +/* + * make an area consistent to the core. + */ +void dma_sync_for_cpu(void *vaddr, size_t size, int dma_direction) +{ + /* + * Hardware L2 cache prefetch doesn't occur across 4K physical + * boundaries, however according to Documentation/DMA-API-HOWTO.txt + * kmalloc'd memory is DMA'able, so accesses in nearby memory could + * trigger a cache fill in the DMA buffer. + * + * This should never cause dirty lines, so a flush or invalidate should + * be safe to allow us to see data from the device. + */ + if (_meta_l2c_pf_is_enabled()) { + switch (dma_direction) { + case DMA_BIDIRECTIONAL: + case DMA_FROM_DEVICE: + invalidate_dcache_region(vaddr, size); + break; + case DMA_TO_DEVICE: + /* The device shouldn't have written to the buffer */ + break; + case DMA_NONE: + BUG(); + } + } + + rmb(); +} +EXPORT_SYMBOL(dma_sync_for_cpu); diff --git a/arch/metag/kernel/ftrace.c b/arch/metag/kernel/ftrace.c new file mode 100644 index 000000000..ed1d68515 --- /dev/null +++ b/arch/metag/kernel/ftrace.c @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2008 Imagination Technologies Ltd. + * Licensed under the GPL + * + * Dynamic ftrace support. 
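+ * + * Each patchable call site is toggled between a pair of NOPs and a + * MOVT/CALL pair built from the templates below. ftrace_modify_code() + * verifies the old bytes before rewriting and then flushes the icache + * over the patched range.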
+ */ + +#include +#include +#include + +#include + +#define D04_MOVT_TEMPLATE 0x02200005 +#define D04_CALL_TEMPLATE 0xAC200005 +#define D1RTP_MOVT_TEMPLATE 0x03200005 +#define D1RTP_CALL_TEMPLATE 0xAC200006 + +static const unsigned long NOP[2] = {0xa0fffffe, 0xa0fffffe}; +static unsigned long movt_and_call_insn[2]; + +static unsigned char *ftrace_nop_replace(void) +{ + return (char *)&NOP[0]; +} + +static unsigned char *ftrace_call_replace(unsigned long pc, unsigned long addr) +{ + unsigned long hi16, low16; + + hi16 = (addr & 0xffff0000) >> 13; + low16 = (addr & 0x0000ffff) << 3; + + /* + * The compiler makes the call to mcount_wrapper() + * (Meta's wrapper around mcount()) through the register + * D0.4. So whenever we're patching one of those compiler-generated + * calls we also need to go through D0.4. Otherwise use D1RtP. + */ + if (pc == (unsigned long)&ftrace_call) { + writel(D1RTP_MOVT_TEMPLATE | hi16, &movt_and_call_insn[0]); + writel(D1RTP_CALL_TEMPLATE | low16, &movt_and_call_insn[1]); + } else { + writel(D04_MOVT_TEMPLATE | hi16, &movt_and_call_insn[0]); + writel(D04_CALL_TEMPLATE | low16, &movt_and_call_insn[1]); + } + + return (unsigned char *)&movt_and_call_insn[0]; +} + +static int ftrace_modify_code(unsigned long pc, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. + */ + + /* read the text we want to modify */ + if (probe_kernel_read(replaced, (void *)pc, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + /* replace the text with the new text */ + if (probe_kernel_write((void *)pc, new_code, MCOUNT_INSN_SIZE)) + return -EPERM; + + flush_icache_range(pc, pc + MCOUNT_INSN_SIZE); + + return 0; +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + int ret; + unsigned long pc; + unsigned char old[MCOUNT_INSN_SIZE], *new; + + pc = (unsigned long)&ftrace_call; + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(pc, (unsigned long)func); + ret = ftrace_modify_code(pc, old, new); + + return ret; +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + return ftrace_modify_code(ip, old, new); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + return ftrace_modify_code(ip, old, new); +} + +/* run from kstop_machine */ +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} diff --git a/arch/metag/kernel/ftrace_stub.S b/arch/metag/kernel/ftrace_stub.S new file mode 100644 index 000000000..3acc28821 --- /dev/null +++ b/arch/metag/kernel/ftrace_stub.S @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2008 Imagination Technologies Ltd. 
+ * Licensed under the GPL + * + */ + +#include + + .text +#ifdef CONFIG_DYNAMIC_FTRACE + .global _mcount_wrapper + .type _mcount_wrapper,function +_mcount_wrapper: + MOV PC,D0.4 + + .global _ftrace_caller + .type _ftrace_caller,function +_ftrace_caller: + MSETL [A0StP], D0Ar6, D0Ar4, D0Ar2, D0.4 + MOV D1Ar1, D0.4 + MOV D0Ar2, D1RtP + SUB D1Ar1,D1Ar1,#MCOUNT_INSN_SIZE + + .global _ftrace_call +_ftrace_call: + MOVT D1RtP,#HI(_ftrace_stub) + CALL D1RtP,#LO(_ftrace_stub) + GETL D0.4, D1RtP, [A0StP++#(-8)] + GETL D0Ar2, D1Ar1, [A0StP++#(-8)] + GETL D0Ar4, D1Ar3, [A0StP++#(-8)] + GETL D0Ar6, D1Ar5, [A0StP++#(-8)] + MOV PC, D0.4 +#else + + .global _mcount_wrapper + .type _mcount_wrapper,function +_mcount_wrapper: + MSETL [A0StP], D0Ar6, D0Ar4, D0Ar2, D0.4 + MOV D1Ar1, D0.4 + MOV D0Ar2, D1RtP + MOVT D0Re0,#HI(_ftrace_trace_function) + ADD D0Re0,D0Re0,#LO(_ftrace_trace_function) + GET D1Ar3,[D0Re0] + MOVT D1Re0,#HI(_ftrace_stub) + ADD D1Re0,D1Re0,#LO(_ftrace_stub) + CMP D1Ar3,D1Re0 + BEQ $Ltrace_exit + MOV D1RtP,D1Ar3 + SUB D1Ar1,D1Ar1,#MCOUNT_INSN_SIZE + SWAP PC,D1RtP +$Ltrace_exit: + GETL D0.4, D1RtP, [A0StP++#(-8)] + GETL D0Ar2, D1Ar1, [A0StP++#(-8)] + GETL D0Ar4, D1Ar3, [A0StP++#(-8)] + GETL D0Ar6, D1Ar5, [A0StP++#(-8)] + MOV PC, D0.4 + +#endif /* CONFIG_DYNAMIC_FTRACE */ + + .global _ftrace_stub +_ftrace_stub: + MOV PC,D1RtP diff --git a/arch/metag/kernel/head.S b/arch/metag/kernel/head.S new file mode 100644 index 000000000..713f71d1b --- /dev/null +++ b/arch/metag/kernel/head.S @@ -0,0 +1,65 @@ + ! Copyright 2005,2006,2007,2009 Imagination Technologies + +#include +#include +#include +#undef __exit + + __HEAD + ! Setup the stack and get going into _metag_start_kernel + .global __start + .type __start,function +__start: + ! D1Ar1 contains pTBI (ISTAT) + ! D0Ar2 contains pTBI + ! D1Ar3 contains __pTBISegs + ! D0Ar4 contains kernel arglist pointer + + MOVT D0Re0,#HI(___pTBIs) + ADD D0Re0,D0Re0,#LO(___pTBIs) + SETL [D0Re0],D0Ar2,D1Ar1 + MOVT D0Re0,#HI(___pTBISegs) + ADD D0Re0,D0Re0,#LO(___pTBISegs) + SETD [D0Re0],D1Ar3 + MOV A0FrP,#0 + MOV D0Re0,#0 + MOV D1Re0,#0 + MOV D1Ar3,#0 + MOV D1Ar1,D0Ar4 !Store kernel boot params + MOV D1Ar5,#0 + MOV D0Ar6,#0 +#ifdef CONFIG_METAG_DSP + MOV D0.8,#0 +#endif + MOVT A0StP,#HI(_init_thread_union) + ADD A0StP,A0StP,#LO(_init_thread_union) + ADD A0StP,A0StP,#THREAD_INFO_SIZE + MOVT D1RtP,#HI(_metag_start_kernel) + CALL D1RtP,#LO(_metag_start_kernel) + .size __start,.-__start + + !! Needed by TBX + .global __exit + .type __exit,function +__exit: + XOR TXENABLE,D0Re0,D0Re0 + .size __exit,.-__exit + +#ifdef CONFIG_SMP + .global _secondary_startup + .type _secondary_startup,function +_secondary_startup: +#if CONFIG_PAGE_OFFSET < LINGLOBAL_BASE + ! In case GCOn has just been turned on we need to fence any writes that + ! the boot thread might have performed prior to coherency taking effect. 
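+ ! (The SETD to LINSYSEVENT_WR_ATOMIC_UNLOCK below is assumed to act + ! purely as that write fence; the value stored is ignored.)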
+ MOVT D0Re0,#HI(LINSYSEVENT_WR_ATOMIC_UNLOCK) + MOV D1Re0,#0 + SETD [D0Re0], D1Re0 +#endif + MOVT A0StP,#HI(_secondary_data_stack) + ADD A0StP,A0StP,#LO(_secondary_data_stack) + GETD A0StP,[A0StP] + ADD A0StP,A0StP,#THREAD_INFO_SIZE + B _secondary_start_kernel + .size _secondary_startup,.-_secondary_startup +#endif diff --git a/arch/metag/kernel/irq.c b/arch/metag/kernel/irq.c new file mode 100644 index 000000000..4f8f1f87e --- /dev/null +++ b/arch/metag/kernel/irq.c @@ -0,0 +1,292 @@ +/* + * Linux/Meta general interrupt handling code + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_4KSTACKS +union irq_ctx { + struct thread_info tinfo; + u32 stack[THREAD_SIZE/sizeof(u32)]; +}; + +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +#endif + +static struct irq_domain *root_domain; + +static unsigned int startup_meta_irq(struct irq_data *data) +{ + tbi_startup_interrupt(data->hwirq); + return 0; +} + +static void shutdown_meta_irq(struct irq_data *data) +{ + tbi_shutdown_interrupt(data->hwirq); +} + +void do_IRQ(int irq, struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); +#ifdef CONFIG_4KSTACKS + struct irq_desc *desc; + union irq_ctx *curctx, *irqctx; + u32 *isp; +#endif + + irq_enter(); + + irq = irq_linear_revmap(root_domain, irq); + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 1KB free? */ + { + unsigned long sp; + + sp = __core_reg_get(A0StP); + sp &= THREAD_SIZE - 1; + + if (unlikely(sp > (THREAD_SIZE - 1024))) + pr_err("Stack overflow in do_IRQ: %ld\n", sp); + } +#endif + + +#ifdef CONFIG_4KSTACKS + curctx = (union irq_ctx *) current_thread_info(); + irqctx = hardirq_ctx[smp_processor_id()]; + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (curctx != irqctx) { + /* build the stack frame on the IRQ stack */ + isp = (u32 *) ((char *)irqctx + sizeof(struct thread_info)); + irqctx->tinfo.task = curctx->tinfo.task; + + /* + * Copy the softirq bits in preempt_count so that the + * softirq checks work in the hardirq context. 
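+ * (i.e. keep the IRQ stack's own hardirq count and inherit only the + * interrupted task's SOFTIRQ_MASK bits, as computed just below)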
+ */ + irqctx->tinfo.preempt_count = + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | + (curctx->tinfo.preempt_count & SOFTIRQ_MASK); + + desc = irq_to_desc(irq); + + asm volatile ( + "MOV D0.5,%0\n" + "MOV D1Ar1,%1\n" + "MOV D1RtP,%2\n" + "MOV D0Ar2,%3\n" + "SWAP A0StP,D0.5\n" + "SWAP PC,D1RtP\n" + "MOV A0StP,D0.5\n" + : + : "r" (isp), "r" (irq), "r" (desc->handle_irq), + "r" (desc) + : "memory", "cc", "D1Ar1", "D0Ar2", "D1Ar3", "D0Ar4", + "D1Ar5", "D0Ar6", "D0Re0", "D1Re0", "D0.4", "D1RtP", + "D0.5" + ); + } else +#endif + generic_handle_irq(irq); + + irq_exit(); + + set_irq_regs(old_regs); +} + +#ifdef CONFIG_4KSTACKS + +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; + +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; + +/* + * allocate per-cpu stacks for hardirq and for softirq processing + */ +void irq_ctx_init(int cpu) +{ + union irq_ctx *irqctx; + + if (hardirq_ctx[cpu]) + return; + + irqctx = (union irq_ctx *) &hardirq_stack[cpu * THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + hardirq_ctx[cpu] = irqctx; + + irqctx = (union irq_ctx *) &softirq_stack[cpu * THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = 0; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + softirq_ctx[cpu] = irqctx; + + pr_info("CPU %u irqstacks, hard=%p soft=%p\n", + cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); +} + +void irq_ctx_exit(int cpu) +{ + hardirq_ctx[smp_processor_id()] = NULL; +} + +extern asmlinkage void __do_softirq(void); + +void do_softirq_own_stack(void) +{ + struct thread_info *curctx; + union irq_ctx *irqctx; + u32 *isp; + + curctx = current_thread_info(); + irqctx = softirq_ctx[smp_processor_id()]; + irqctx->tinfo.task = curctx->task; + + /* build the stack frame on the softirq stack */ + isp = (u32 *) ((char *)irqctx + sizeof(struct thread_info)); + + asm volatile ( + "MOV D0.5,%0\n" + "SWAP A0StP,D0.5\n" + "CALLR D1RtP,___do_softirq\n" + "MOV A0StP,D0.5\n" + : + : "r" (isp) + : "memory", "cc", "D1Ar1", "D0Ar2", "D1Ar3", "D0Ar4", + "D1Ar5", "D0Ar6", "D0Re0", "D1Re0", "D0.4", "D1RtP", + "D0.5" + ); +} +#endif + +static struct irq_chip meta_irq_type = { + .name = "META-IRQ", + .irq_startup = startup_meta_irq, + .irq_shutdown = shutdown_meta_irq, +}; + +/** + * tbisig_map() - Map a TBI signal number to a virtual IRQ number. + * @hw: Number of the TBI signal. Must be in range. + * + * Returns: The virtual IRQ number of the TBI signal number IRQ specified by + * @hw. + */ +int tbisig_map(unsigned int hw) +{ + return irq_create_mapping(root_domain, hw); +} + +/** + * metag_tbisig_map() - map a tbi signal to a Linux virtual IRQ number + * @d: root irq domain + * @irq: virtual irq number + * @hw: hardware irq number (TBI signal number) + * + * This sets up a virtual irq for a specified TBI signal number. + */ +static int metag_tbisig_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hw) +{ +#ifdef CONFIG_SMP + irq_set_chip_and_handler(irq, &meta_irq_type, handle_percpu_irq); +#else + irq_set_chip_and_handler(irq, &meta_irq_type, handle_simple_irq); +#endif + return 0; +} + +static const struct irq_domain_ops metag_tbisig_domain_ops = { + .map = metag_tbisig_map, +}; + +/* + * void init_IRQ(void) + * + * Parameters: None + * + * Returns: Nothing + * + * This function should be called during kernel startup to initialize + * the IRQ handling routines. 
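+ * + * It creates a linear IRQ domain covering the 32 TBI signal numbers, + * sets up the boot CPU's IRQ stacks where configured, then brings up + * the internal and external IRQ controllers and any machine-specific + * init_irq hook.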
+ */ +void __init init_IRQ(void) +{ + root_domain = irq_domain_add_linear(NULL, 32, + &metag_tbisig_domain_ops, NULL); + if (unlikely(!root_domain)) + panic("init_IRQ: cannot add root IRQ domain"); + + irq_ctx_init(smp_processor_id()); + + init_internal_IRQ(); + init_external_IRQ(); + + if (machine_desc->init_irq) + machine_desc->init_irq(); +} + +int __init arch_probe_nr_irqs(void) +{ + if (machine_desc->nr_irqs) + nr_irqs = machine_desc->nr_irqs; + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * The CPU has been marked offline. Migrate IRQs off this CPU. If + * the affinity settings do not allow other CPUs, force them onto any + * available CPU. + */ +void migrate_irqs(void) +{ + unsigned int i, cpu = smp_processor_id(); + + for_each_active_irq(i) { + struct irq_data *data = irq_get_irq_data(i); + unsigned int newcpu; + + if (irqd_is_per_cpu(data)) + continue; + + if (!cpumask_test_cpu(cpu, data->affinity)) + continue; + + newcpu = cpumask_any_and(data->affinity, cpu_online_mask); + + if (newcpu >= nr_cpu_ids) { + pr_info_ratelimited("IRQ%u no longer affine to CPU%u\n", + i, cpu); + + cpumask_setall(data->affinity); + } + irq_set_affinity(i, data->affinity); + } +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/metag/kernel/kick.c b/arch/metag/kernel/kick.c new file mode 100644 index 000000000..beb377621 --- /dev/null +++ b/arch/metag/kernel/kick.c @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2009 Imagination Technologies + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + * + * The Meta KICK interrupt mechanism is generally a useful feature, so + * we provide an interface for registering multiple interrupt + * handlers. All the registered interrupt handlers are "chained". When + * a KICK interrupt is received the first function in the list is + * called. If that interrupt handler cannot handle the KICK the next + * one is called, then the next until someone handles it (or we run + * out of functions). As soon as one function handles the interrupt no + * other handlers are called. + * + * The only downside of chaining interrupt handlers is that each + * handler must be able to detect whether the KICK was intended for it + * or not. For example, when the IPI handler runs and it sees that + * there are no IPI messages it must not signal that the KICK was + * handled, thereby giving the other handlers a chance to run. + * + * The reason that we provide our own interface for calling KICK + * handlers instead of using the generic kernel infrastructure is that + * the KICK handlers require access to a CPU's pTBI structure. So we + * pass it as an argument. + */ +#include +#include +#include +#include +#include +#include + +#include + +/* + * All accesses/manipulations of kick_handlers_list should be + * performed while holding kick_handlers_lock. 
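+ * + * A handler claims a KICK by setting the *handled flag passed to it + * (see kick_handler() below); leaving the flag clear passes the + * interrupt on to the next handler in the chain.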
+ */ +static DEFINE_SPINLOCK(kick_handlers_lock); +static LIST_HEAD(kick_handlers_list); + +void kick_register_func(struct kick_irq_handler *kh) +{ + unsigned long flags; + + spin_lock_irqsave(&kick_handlers_lock, flags); + + list_add_tail(&kh->list, &kick_handlers_list); + + spin_unlock_irqrestore(&kick_handlers_lock, flags); +} +EXPORT_SYMBOL(kick_register_func); + +void kick_unregister_func(struct kick_irq_handler *kh) +{ + unsigned long flags; + + spin_lock_irqsave(&kick_handlers_lock, flags); + + list_del(&kh->list); + + spin_unlock_irqrestore(&kick_handlers_lock, flags); +} +EXPORT_SYMBOL(kick_unregister_func); + +TBIRES +kick_handler(TBIRES State, int SigNum, int Triggers, int Inst, PTBI pTBI) +{ + struct pt_regs *old_regs; + struct kick_irq_handler *kh; + struct list_head *lh; + int handled = 0; + TBIRES ret; + + head_end(State, ~INTS_OFF_MASK); + + /* If we interrupted user code handle any critical sections. */ + if (State.Sig.SaveMask & TBICTX_PRIV_BIT) + restart_critical_section(State); + + trace_hardirqs_off(); + + old_regs = set_irq_regs((struct pt_regs *)State.Sig.pCtx); + irq_enter(); + + /* + * There is no need to disable interrupts here because we + * can't nest KICK interrupts in a KICK interrupt handler. + */ + spin_lock(&kick_handlers_lock); + + list_for_each(lh, &kick_handlers_list) { + kh = list_entry(lh, struct kick_irq_handler, list); + + ret = kh->func(State, SigNum, Triggers, Inst, pTBI, &handled); + if (handled) + break; + } + + spin_unlock(&kick_handlers_lock); + + WARN_ON(!handled); + + irq_exit(); + set_irq_regs(old_regs); + + return tail_end(ret); +} diff --git a/arch/metag/kernel/machines.c b/arch/metag/kernel/machines.c new file mode 100644 index 000000000..1edf6ba19 --- /dev/null +++ b/arch/metag/kernel/machines.c @@ -0,0 +1,20 @@ +/* + * arch/metag/kernel/machines.c + * + * Copyright (C) 2012 Imagination Technologies Ltd. + * + * Generic Meta Boards. 
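+ * + * This is a catch-all machine description: it matches any board whose + * devicetree root is compatible with "img,meta" (meta_boards_compat + * below) and relies on the generic setup paths for everything else.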
+ */ + +#include +#include +#include + +static const char *meta_boards_compat[] __initdata = { + "img,meta", + NULL, +}; + +MACHINE_START(META, "Generic Meta") + .dt_compat = meta_boards_compat, +MACHINE_END diff --git a/arch/metag/kernel/metag_ksyms.c b/arch/metag/kernel/metag_ksyms.c new file mode 100644 index 000000000..215c94ad6 --- /dev/null +++ b/arch/metag/kernel/metag_ksyms.c @@ -0,0 +1,54 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(copy_page); + +#ifdef CONFIG_FLATMEM +/* needed for the pfn_valid macro */ +EXPORT_SYMBOL(max_pfn); +EXPORT_SYMBOL(min_low_pfn); +#endif + +/* Network checksum functions */ +EXPORT_SYMBOL(csum_partial); + +/* TBI symbols */ +EXPORT_SYMBOL(__TBI); +EXPORT_SYMBOL(__TBIFindSeg); +EXPORT_SYMBOL(__TBIPoll); +EXPORT_SYMBOL(__TBITimeStamp); + +#define DECLARE_EXPORT(name) extern void name(void); EXPORT_SYMBOL(name) + +/* libgcc functions */ +DECLARE_EXPORT(__ashldi3); +DECLARE_EXPORT(__ashrdi3); +DECLARE_EXPORT(__lshrdi3); +DECLARE_EXPORT(__udivsi3); +DECLARE_EXPORT(__divsi3); +DECLARE_EXPORT(__umodsi3); +DECLARE_EXPORT(__modsi3); +DECLARE_EXPORT(__muldi3); +DECLARE_EXPORT(__cmpdi2); +DECLARE_EXPORT(__ucmpdi2); + +/* Maths functions */ +EXPORT_SYMBOL(div_u64); +EXPORT_SYMBOL(div_s64); + +/* String functions */ +EXPORT_SYMBOL(memcpy); +EXPORT_SYMBOL(memset); +EXPORT_SYMBOL(memmove); + +#ifdef CONFIG_FUNCTION_TRACER +EXPORT_SYMBOL(mcount_wrapper); +#endif diff --git a/arch/metag/kernel/module.c b/arch/metag/kernel/module.c new file mode 100644 index 000000000..986331cd0 --- /dev/null +++ b/arch/metag/kernel/module.c @@ -0,0 +1,284 @@ +/* Kernel module help for Meta. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. +*/ +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Count how many different relocations (different symbol, different + addend) */ +static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num) +{ + unsigned int i, r_info, r_addend, _count_relocs; + + _count_relocs = 0; + r_info = 0; + r_addend = 0; + for (i = 0; i < num; i++) + /* Only count relbranch relocs, others don't need stubs */ + if (ELF32_R_TYPE(rela[i].r_info) == R_METAG_RELBRANCH && + (r_info != ELF32_R_SYM(rela[i].r_info) || + r_addend != rela[i].r_addend)) { + _count_relocs++; + r_info = ELF32_R_SYM(rela[i].r_info); + r_addend = rela[i].r_addend; + } + + return _count_relocs; +} + +static int relacmp(const void *_x, const void *_y) +{ + const Elf32_Rela *x, *y; + + y = (Elf32_Rela *)_x; + x = (Elf32_Rela *)_y; + + /* Compare the entire r_info (as opposed to ELF32_R_SYM(r_info) only) to + * make the comparison cheaper/faster. 
It won't affect the sorting or + * the counting algorithms' performance + */ + if (x->r_info < y->r_info) + return -1; + else if (x->r_info > y->r_info) + return 1; + else if (x->r_addend < y->r_addend) + return -1; + else if (x->r_addend > y->r_addend) + return 1; + else + return 0; +} + +static void relaswap(void *_x, void *_y, int size) +{ + uint32_t *x, *y, tmp; + int i; + + y = (uint32_t *)_x; + x = (uint32_t *)_y; + + for (i = 0; i < sizeof(Elf32_Rela) / sizeof(uint32_t); i++) { + tmp = x[i]; + x[i] = y[i]; + y[i] = tmp; + } +} + +/* Get the potential trampolines size required of the init and + non-init sections */ +static unsigned long get_plt_size(const Elf32_Ehdr *hdr, + const Elf32_Shdr *sechdrs, + const char *secstrings, + int is_init) +{ + unsigned long ret = 0; + unsigned i; + + /* Everything marked ALLOC (this includes the exported + symbols) */ + for (i = 1; i < hdr->e_shnum; i++) { + /* If it's called *.init*, and we're not init, we're + not interested */ + if ((strstr(secstrings + sechdrs[i].sh_name, ".init") != NULL) + != is_init) + continue; + + /* We don't want to look at debug sections. */ + if (strstr(secstrings + sechdrs[i].sh_name, ".debug") != NULL) + continue; + + if (sechdrs[i].sh_type == SHT_RELA) { + pr_debug("Found relocations in section %u\n", i); + pr_debug("Ptr: %p. Number: %u\n", + (void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size / sizeof(Elf32_Rela)); + + /* Sort the relocation information based on a symbol and + * addend key. This is a stable O(n*log n) complexity + * algorithm but it will reduce the complexity of + * count_relocs() to linear complexity O(n) + */ + sort((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size / sizeof(Elf32_Rela), + sizeof(Elf32_Rela), relacmp, relaswap); + + ret += count_relocs((void *)hdr + + sechdrs[i].sh_offset, + sechdrs[i].sh_size + / sizeof(Elf32_Rela)) + * sizeof(struct metag_plt_entry); + } + } + + return ret; +} + +int module_frob_arch_sections(Elf32_Ehdr *hdr, + Elf32_Shdr *sechdrs, + char *secstrings, + struct module *me) +{ + unsigned int i; + + /* Find .plt and .init.plt sections */ + for (i = 0; i < hdr->e_shnum; i++) { + if (strcmp(secstrings + sechdrs[i].sh_name, ".init.plt") == 0) + me->arch.init_plt_section = i; + else if (strcmp(secstrings + sechdrs[i].sh_name, ".plt") == 0) + me->arch.core_plt_section = i; + } + if (!me->arch.core_plt_section || !me->arch.init_plt_section) { + pr_err("Module doesn't contain .plt or .init.plt sections.\n"); + return -ENOEXEC; + } + + /* Override their sizes */ + sechdrs[me->arch.core_plt_section].sh_size + = get_plt_size(hdr, sechdrs, secstrings, 0); + sechdrs[me->arch.core_plt_section].sh_type = SHT_NOBITS; + sechdrs[me->arch.init_plt_section].sh_size + = get_plt_size(hdr, sechdrs, secstrings, 1); + sechdrs[me->arch.init_plt_section].sh_type = SHT_NOBITS; + return 0; +} + +/* Set up a trampoline in the PLT to bounce us to the distant function */ +static uint32_t do_plt_call(void *location, Elf32_Addr val, + Elf32_Shdr *sechdrs, struct module *mod) +{ + struct metag_plt_entry *entry; + /* Instructions used to do the indirect jump. */ + uint32_t tramp[2]; + + /* We have to trash a register, so we assume that any control + transfer more than 21-bits away must be a function call + (so we can use a call-clobbered register). */ + + /* MOVT D0Re0,#HI(v) */ + tramp[0] = 0x02000005 | (((val & 0xffff0000) >> 16) << 3); + /* JUMP D0Re0,#LO(v) */ + tramp[1] = 0xac000001 | ((val & 0x0000ffff) << 3); + + /* Init, or core PLT?
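+ Sites inside the module's core text get core PLT entries; anything + else (init text) uses the init PLT, which can be discarded once the + module has finished loading.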
*/ + if (location >= mod->module_core + && location < mod->module_core + mod->core_size) + entry = (void *)sechdrs[mod->arch.core_plt_section].sh_addr; + else + entry = (void *)sechdrs[mod->arch.init_plt_section].sh_addr; + + /* Find this entry, or if that fails, the next avail. entry */ + while (entry->tramp[0]) + if (entry->tramp[0] == tramp[0] && entry->tramp[1] == tramp[1]) + return (uint32_t)entry; + else + entry++; + + entry->tramp[0] = tramp[0]; + entry->tramp[1] = tramp[1]; + + return (uint32_t)entry; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + Elf32_Addr relocation; + uint32_t *location; + int32_t value; + + pr_debug("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + relocation = sym->st_value + rel[i].r_addend; + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_METAG_NONE: + break; + case R_METAG_HIADDR16: + relocation >>= 16; + case R_METAG_LOADDR16: + *location = (*location & 0xfff80007) | + ((relocation & 0xffff) << 3); + break; + case R_METAG_ADDR32: + /* + * Packed data structures may cause a misaligned + * R_METAG_ADDR32 to be emitted. + */ + put_unaligned(relocation, location); + break; + case R_METAG_GETSETOFF: + *location += ((relocation & 0xfff) << 7); + break; + case R_METAG_RELBRANCH: + if (*location & (0x7ffff << 5)) { + pr_err("bad relbranch relocation\n"); + break; + } + + /* This jump is too big for the offset slot. Build + * a PLT to jump through to get to where we want to go. + * NB: 21bit check - not scaled to 19bit yet + */ + if (((int32_t)(relocation - + (uint32_t)location) > 0xfffff) || + ((int32_t)(relocation - + (uint32_t)location) < -0xfffff)) { + relocation = do_plt_call(location, relocation, + sechdrs, me); + } + + value = relocation - (uint32_t)location; + + /* branch instruction aligned */ + value /= 4; + + if ((value > 0x7ffff) || (value < -0x7ffff)) { + /* + * this should have been caught by the code + * above! + */ + pr_err("overflow of relbranch reloc\n"); + } + + *location = (*location & (~(0x7ffff << 5))) | + ((value & 0x7ffff) << 5); + break; + + default: + pr_err("module %s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} diff --git a/arch/metag/kernel/perf/Makefile b/arch/metag/kernel/perf/Makefile new file mode 100644 index 000000000..b158cb272 --- /dev/null +++ b/arch/metag/kernel/perf/Makefile @@ -0,0 +1,3 @@ +# Makefile for performance event core + +obj-y += perf_event.o diff --git a/arch/metag/kernel/perf/perf_event.c b/arch/metag/kernel/perf/perf_event.c new file mode 100644 index 000000000..2478ec6d2 --- /dev/null +++ b/arch/metag/kernel/perf/perf_event.c @@ -0,0 +1,888 @@ +/* + * Meta performance counter support. 
+ * Copyright (C) 2012 Imagination Technologies Ltd + * + * This code is based on the sh pmu code: + * Copyright (C) 2009 Paul Mundt + * + * and on the arm pmu code: + * Copyright (C) 2009 picoChip Designs, Ltd., James Iles + * Copyright (C) 2010 ARM Ltd., Will Deacon + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "perf_event.h" + +static int _hw_perf_event_init(struct perf_event *); +static void _hw_perf_event_destroy(struct perf_event *); + +/* Determines which core type we are */ +static struct metag_pmu *metag_pmu __read_mostly; + +/* Processor specific data */ +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +/* PMU admin */ +const char *perf_pmu_name(void) +{ + if (!metag_pmu) + return NULL; + + return metag_pmu->name; +} +EXPORT_SYMBOL_GPL(perf_pmu_name); + +int perf_num_counters(void) +{ + if (metag_pmu) + return metag_pmu->max_events; + + return 0; +} +EXPORT_SYMBOL_GPL(perf_num_counters); + +static inline int metag_pmu_initialised(void) +{ + return !!metag_pmu; +} + +static void release_pmu_hardware(void) +{ + int irq; + unsigned int version = (metag_pmu->version & + (METAC_ID_MINOR_BITS | METAC_ID_REV_BITS)) >> + METAC_ID_REV_S; + + /* Early cores don't have overflow interrupts */ + if (version < 0x0104) + return; + + irq = internal_irq_map(17); + if (irq >= 0) + free_irq(irq, (void *)1); + + irq = internal_irq_map(16); + if (irq >= 0) + free_irq(irq, (void *)0); +} + +static int reserve_pmu_hardware(void) +{ + int err = 0, irq[2]; + unsigned int version = (metag_pmu->version & + (METAC_ID_MINOR_BITS | METAC_ID_REV_BITS)) >> + METAC_ID_REV_S; + + /* Early cores don't have overflow interrupts */ + if (version < 0x0104) + goto out; + + /* + * Bit 16 on HWSTATMETA is the interrupt for performance counter 0; + * similarly, 17 is the interrupt for performance counter 1. + * We can't (yet) interrupt on the cycle counter, because it's a + * register, however it holds a 32-bit value as opposed to 24-bit. 
+ */ + irq[0] = internal_irq_map(16); + if (irq[0] < 0) { + pr_err("unable to map internal IRQ %d\n", 16); + goto out; + } + err = request_irq(irq[0], metag_pmu->handle_irq, IRQF_NOBALANCING, + "metagpmu0", (void *)0); + if (err) { + pr_err("unable to request IRQ%d for metag PMU counters\n", + irq[0]); + goto out; + } + + irq[1] = internal_irq_map(17); + if (irq[1] < 0) { + pr_err("unable to map internal IRQ %d\n", 17); + goto out_irq1; + } + err = request_irq(irq[1], metag_pmu->handle_irq, IRQF_NOBALANCING, + "metagpmu1", (void *)1); + if (err) { + pr_err("unable to request IRQ%d for metag PMU counters\n", + irq[1]); + goto out_irq1; + } + + return 0; + +out_irq1: + free_irq(irq[0], (void *)0); +out: + return err; +} + +/* PMU operations */ +static void metag_pmu_enable(struct pmu *pmu) +{ +} + +static void metag_pmu_disable(struct pmu *pmu) +{ +} + +static int metag_pmu_event_init(struct perf_event *event) +{ + int err = 0; + atomic_t *active_events = &metag_pmu->active_events; + + if (!metag_pmu_initialised()) { + err = -ENODEV; + goto out; + } + + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + event->destroy = _hw_perf_event_destroy; + + if (!atomic_inc_not_zero(active_events)) { + mutex_lock(&metag_pmu->reserve_mutex); + if (atomic_read(active_events) == 0) + err = reserve_pmu_hardware(); + + if (!err) + atomic_inc(active_events); + + mutex_unlock(&metag_pmu->reserve_mutex); + } + + /* Hardware and caches counters */ + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + case PERF_TYPE_RAW: + err = _hw_perf_event_init(event); + break; + + default: + return -ENOENT; + } + + if (err) + event->destroy(event); + +out: + return err; +} + +void metag_pmu_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + u64 prev_raw_count, new_raw_count; + s64 delta; + + /* + * If this counter is chained, it may be that the previous counter + * value has been changed beneath us. + * + * To get around this, we read and exchange the new raw count, then + * add the delta (new - prev) to the generic counter atomically. + * + * Without interrupts, this is the simplest approach. + */ +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = metag_pmu->read(idx); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Calculate the delta and add it to the counter. 
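+	 * For example (hypothetical values): if prev_raw_count is 0xfffffe
+	 * and the 24-bit counter has since wrapped to a new_raw_count of
+	 * 0x000003, the subtraction underflows, but masking with MAX_PERIOD
+	 * (0xffffff) below still recovers the true delta of 5.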
+ */ + delta = (new_raw_count - prev_raw_count) & MAX_PERIOD; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); +} + +int metag_pmu_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int ret = 0; + + /* The period may have been changed */ + if (unlikely(period != hwc->last_period)) + left += period - hwc->last_period; + + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (left > (s64)metag_pmu->max_period) + left = metag_pmu->max_period; + + if (metag_pmu->write) { + local64_set(&hwc->prev_count, -(s32)left); + metag_pmu->write(idx, -left & MAX_PERIOD); + } + + perf_event_update_userpage(event); + + return ret; +} + +static void metag_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + if (WARN_ON_ONCE(idx == -1)) + return; + + /* + * We always have to reprogram the period, so ignore PERF_EF_RELOAD. + */ + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; + + /* + * Reset the period. + * Some counters can't be stopped (i.e. are core global), so when the + * counter was 'stopped' we merely disabled the IRQ. If we don't reset + * the period, then we'll either: a) get an overflow too soon; + * or b) too late if the overflow happened since disabling. + * Obviously, this has little bearing on cores without the overflow + * interrupt, as the performance counter resets to zero on write + * anyway. + */ + if (metag_pmu->max_period) + metag_pmu_event_set_period(event, hwc, hwc->idx); + cpuc->events[idx] = event; + metag_pmu->enable(hwc, idx); +} + +static void metag_pmu_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * We should always update the counter on stop; see comment above + * why. 
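+	 * (Otherwise the events counted since the last update would be lost
+	 * when the period is reprogrammed on the next start.)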
+ */ + if (!(hwc->state & PERF_HES_STOPPED)) { + metag_pmu_event_update(event, hwc, hwc->idx); + metag_pmu->disable(hwc, hwc->idx); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + } +} + +static int metag_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = 0, ret = 0; + + perf_pmu_disable(event->pmu); + + /* check whether we're counting instructions */ + if (hwc->config == 0x100) { + if (__test_and_set_bit(METAG_INST_COUNTER, + cpuc->used_mask)) { + ret = -EAGAIN; + goto out; + } + idx = METAG_INST_COUNTER; + } else { + /* Check whether we have a spare counter */ + idx = find_first_zero_bit(cpuc->used_mask, + atomic_read(&metag_pmu->active_events)); + if (idx >= METAG_INST_COUNTER) { + ret = -EAGAIN; + goto out; + } + + __set_bit(idx, cpuc->used_mask); + } + hwc->idx = idx; + + /* Make sure the counter is disabled */ + metag_pmu->disable(hwc, idx); + + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + if (flags & PERF_EF_START) + metag_pmu_start(event, PERF_EF_RELOAD); + + perf_event_update_userpage(event); +out: + perf_pmu_enable(event->pmu); + return ret; +} + +static void metag_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + WARN_ON(idx < 0); + metag_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[idx] = NULL; + __clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); +} + +static void metag_pmu_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* Don't read disabled counters! */ + if (hwc->idx < 0) + return; + + metag_pmu_event_update(event, hwc, hwc->idx); +} + +static struct pmu pmu = { + .pmu_enable = metag_pmu_enable, + .pmu_disable = metag_pmu_disable, + + .event_init = metag_pmu_event_init, + + .add = metag_pmu_add, + .del = metag_pmu_del, + .start = metag_pmu_start, + .stop = metag_pmu_stop, + .read = metag_pmu_read, +}; + +/* Core counter specific functions */ +static const int metag_general_events[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 0x03, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x100, + [PERF_COUNT_HW_CACHE_REFERENCES] = -1, + [PERF_COUNT_HW_CACHE_MISSES] = -1, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, + [PERF_COUNT_HW_BRANCH_MISSES] = -1, + [PERF_COUNT_HW_BUS_CYCLES] = -1, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = -1, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = -1, + [PERF_COUNT_HW_REF_CPU_CYCLES] = -1, +}; + +static const int metag_pmu_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { + [C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x08, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x09, + [C(RESULT_MISS)] = 0x0a, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + 
[C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0xd0, + [C(RESULT_MISS)] = 0xd2, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0xd4, + [C(RESULT_MISS)] = 0xd5, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0xd1, + [C(RESULT_MISS)] = 0xd3, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + [C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, +}; + + +static void _hw_perf_event_destroy(struct perf_event *event) +{ + atomic_t *active_events = &metag_pmu->active_events; + struct mutex *pmu_mutex = &metag_pmu->reserve_mutex; + + if (atomic_dec_and_mutex_lock(active_events, pmu_mutex)) { + release_pmu_hardware(); + mutex_unlock(pmu_mutex); + } +} + +static int _hw_perf_cache_event(int config, int *evp) +{ + unsigned long type, op, result; + int ev; + + if (!metag_pmu->cache_events) + return -EINVAL; + + /* Unpack config */ + type = config & 0xff; + op = (config >> 8) & 0xff; + result = (config >> 16) & 0xff; + + if (type >= PERF_COUNT_HW_CACHE_MAX || + op >= PERF_COUNT_HW_CACHE_OP_MAX || + result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + ev = (*metag_pmu->cache_events)[type][op][result]; + if (ev == 0) + return -EOPNOTSUPP; + if (ev == -1) + return -EINVAL; + *evp = ev; + return 0; +} + +static int _hw_perf_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + int mapping = 0, err; + + switch (attr->type) { + case PERF_TYPE_HARDWARE: + if (attr->config >= PERF_COUNT_HW_MAX) + return -EINVAL; + + mapping = metag_pmu->event_map(attr->config); + break; + + case PERF_TYPE_HW_CACHE: + err = _hw_perf_cache_event(attr->config, &mapping); + if (err) + return err; + break; + + case PERF_TYPE_RAW: + mapping = attr->config; + break; + } + + /* Return early if the event is unsupported */ + if (mapping == -1) + return -EINVAL; + + /* + * Don't assign an index until the event is placed into the hardware. + * -1 signifies that we're still deciding where to put it. On SMP + * systems each core has its own set of counters, so we can't do any + * constraint checking yet. + */ + hwc->idx = -1; + + /* Store the event encoding */ + hwc->config |= (unsigned long)mapping; + + /* + * For non-sampling runs, limit the sample_period to half of the + * counter width. This way, the new counter value should be less + * likely to overtake the previous one (unless there are IRQ latency + * issues...) 
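+	 * For example, with the 24-bit counters used here max_period is
+	 * 0xffffff, so the default sample_period below becomes 0x7fffff.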
 */
+	if (metag_pmu->max_period) {
+		if (!hwc->sample_period) {
+			hwc->sample_period = metag_pmu->max_period >> 1;
+			hwc->last_period = hwc->sample_period;
+			local64_set(&hwc->period_left, hwc->sample_period);
+		}
+	}
+
+	return 0;
+}
+
+static void metag_pmu_enable_counter(struct hw_perf_event *event, int idx)
+{
+	struct cpu_hw_events *events = this_cpu_ptr(&cpu_hw_events);
+	unsigned int config = event->config;
+	unsigned int tmp = config & 0xf0;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&events->pmu_lock, flags);
+
+	/*
+	 * Check if we're enabling the instruction counter (index of
+	 * MAX_HWEVENTS - 1)
+	 */
+	if (METAG_INST_COUNTER == idx) {
+		WARN_ONCE((config != 0x100),
+			"invalid configuration (%d) for counter (%d)\n",
+			config, idx);
+		local64_set(&event->prev_count, __core_reg_get(TXTACTCYC));
+		goto unlock;
+	}
+
+	/* Check for a core internal or performance channel event. */
+	if (tmp) {
+		void *perf_addr;
+
+		/*
+		 * Anything other than a cycle count will write the low-
+		 * nibble to the correct counter register.
+		 */
+		switch (tmp) {
+		case 0xd0:
+			perf_addr = (void *)PERF_ICORE(idx);
+			break;
+
+		case 0xf0:
+			perf_addr = (void *)PERF_CHAN(idx);
+			break;
+
+		default:
+			perf_addr = NULL;
+			break;
+		}
+
+		if (perf_addr)
+			metag_out32((config & 0x0f), perf_addr);
+
+		/*
+		 * Now we use the high nibble as the performance event to
+		 * count.
+		 */
+		config = tmp >> 4;
+	}
+
+	tmp = ((config & 0xf) << 28) |
+			((1 << 24) << hard_processor_id());
+	if (metag_pmu->max_period)
+		/*
+		 * Cores supporting overflow interrupts may have had the
+		 * counter set to a specific value that needs preserving.
+		 */
+		tmp |= metag_in32(PERF_COUNT(idx)) & 0x00ffffff;
+	else
+		/*
+		 * Older cores reset the counter on write, so prev_count needs
+		 * resetting too so we can calculate a correct delta.
+		 */
+		local64_set(&event->prev_count, 0);
+
+	metag_out32(tmp, PERF_COUNT(idx));
+unlock:
+	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
+}
+
+static void metag_pmu_disable_counter(struct hw_perf_event *event, int idx)
+{
+	struct cpu_hw_events *events = this_cpu_ptr(&cpu_hw_events);
+	unsigned int tmp = 0;
+	unsigned long flags;
+
+	/*
+	 * The cycle counter can't be disabled per se, as it's a hardware
+	 * thread register which is always counting. We merely return if this
+	 * is the counter we're attempting to disable.
+	 */
+	if (METAG_INST_COUNTER == idx)
+		return;
+
+	/*
+	 * The counter value _should_ have been read prior to disabling,
+	 * as if we're running on an early core then the value gets reset to
+	 * 0, and any read after that would be useless. On the newer cores,
+	 * however, it's better to read-modify-write this for purposes of
+	 * the overflow interrupt.
+	 * Here we remove the thread id AND the event nibble (there are at
+	 * least two events that count core-global events and ignore the
+	 * thread id mask). This only works because we don't mix thread
+	 * performance counts, and event 0x00 requires a thread id mask!
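+	 * The PERF_COUNT register layout assumed by these masks is: event id
+	 * in bits 31-28, per-thread enable mask in bits 27-24, and the
+	 * 24-bit count in bits 23-0, so clearing the top byte stops counting
+	 * without losing the count itself.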
 */
+	raw_spin_lock_irqsave(&events->pmu_lock, flags);
+
+	tmp = metag_in32(PERF_COUNT(idx));
+	tmp &= 0x00ffffff;
+	metag_out32(tmp, PERF_COUNT(idx));
+
+	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
+}
+
+static u64 metag_pmu_read_counter(int idx)
+{
+	u32 tmp = 0;
+
+	if (METAG_INST_COUNTER == idx) {
+		tmp = __core_reg_get(TXTACTCYC);
+		goto out;
+	}
+
+	tmp = metag_in32(PERF_COUNT(idx)) & 0x00ffffff;
+out:
+	return tmp;
+}
+
+static void metag_pmu_write_counter(int idx, u32 val)
+{
+	struct cpu_hw_events *events = this_cpu_ptr(&cpu_hw_events);
+	u32 tmp = 0;
+	unsigned long flags;
+
+	/*
+	 * This _shouldn't_ happen, but if it does, then we can just
+	 * ignore the write, as the register is read-only and clear-on-write.
+	 */
+	if (METAG_INST_COUNTER == idx)
+		return;
+
+	/*
+	 * We'll keep the thread mask and event id, and just update the
+	 * counter itself. Also, we should bound the value to 24 bits.
+	 */
+	raw_spin_lock_irqsave(&events->pmu_lock, flags);
+
+	val &= 0x00ffffff;
+	tmp = metag_in32(PERF_COUNT(idx)) & 0xff000000;
+	val |= tmp;
+	metag_out32(val, PERF_COUNT(idx));
+
+	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
+}
+
+static int metag_pmu_event_map(int idx)
+{
+	return metag_general_events[idx];
+}
+
+static irqreturn_t metag_pmu_counter_overflow(int irq, void *dev)
+{
+	int idx = (int)dev;
+	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
+	struct perf_event *event = cpuhw->events[idx];
+	struct hw_perf_event *hwc = &event->hw;
+	struct pt_regs *regs = get_irq_regs();
+	struct perf_sample_data sampledata;
+	unsigned long flags;
+	u32 counter = 0;
+
+	/*
+	 * We need to stop the core temporarily from generating another
+	 * interrupt while we disable this counter. However, we don't want
+	 * to flag the counter as free.
+	 */
+	__global_lock2(flags);
+	counter = metag_in32(PERF_COUNT(idx));
+	metag_out32((counter & 0x00ffffff), PERF_COUNT(idx));
+	__global_unlock2(flags);
+
+	/* Update the counts and reset the sample period */
+	metag_pmu_event_update(event, hwc, idx);
+	perf_sample_data_init(&sampledata, 0, hwc->last_period);
+	metag_pmu_event_set_period(event, hwc, idx);
+
+	/*
+	 * Enable the counter again once core overflow processing has
+	 * completed. Note the counter value may have been modified while
+	 * it was inactive to set it up ready for the next interrupt.
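+	 * Re-enabling below merges the preserved event/thread byte (the top
+	 * eight bits saved in 'counter') with whatever 24-bit count the
+	 * period logic has just programmed.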
+ */ + if (!perf_event_overflow(event, &sampledata, regs)) { + __global_lock2(flags); + counter = (counter & 0xff000000) | + (metag_in32(PERF_COUNT(idx)) & 0x00ffffff); + metag_out32(counter, PERF_COUNT(idx)); + __global_unlock2(flags); + } + + return IRQ_HANDLED; +} + +static struct metag_pmu _metag_pmu = { + .handle_irq = metag_pmu_counter_overflow, + .enable = metag_pmu_enable_counter, + .disable = metag_pmu_disable_counter, + .read = metag_pmu_read_counter, + .write = metag_pmu_write_counter, + .event_map = metag_pmu_event_map, + .cache_events = &metag_pmu_cache_events, + .max_period = MAX_PERIOD, + .max_events = MAX_HWEVENTS, +}; + +/* PMU CPU hotplug notifier */ +static int metag_pmu_cpu_notify(struct notifier_block *b, unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned int)hcpu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING) + return NOTIFY_DONE; + + memset(cpuc, 0, sizeof(struct cpu_hw_events)); + raw_spin_lock_init(&cpuc->pmu_lock); + + return NOTIFY_OK; +} + +static struct notifier_block metag_pmu_notifier = { + .notifier_call = metag_pmu_cpu_notify, +}; + +/* PMU Initialisation */ +static int __init init_hw_perf_events(void) +{ + int ret = 0, cpu; + u32 version = *(u32 *)METAC_ID; + int major = (version & METAC_ID_MAJOR_BITS) >> METAC_ID_MAJOR_S; + int min_rev = (version & (METAC_ID_MINOR_BITS | METAC_ID_REV_BITS)) + >> METAC_ID_REV_S; + + /* Not a Meta 2 core, then not supported */ + if (0x02 > major) { + pr_info("no hardware counter support available\n"); + goto out; + } else if (0x02 == major) { + metag_pmu = &_metag_pmu; + + if (min_rev < 0x0104) { + /* + * A core without overflow interrupts, and clear-on- + * write counters. + */ + metag_pmu->handle_irq = NULL; + metag_pmu->write = NULL; + metag_pmu->max_period = 0; + } + + metag_pmu->name = "meta2"; + metag_pmu->version = version; + metag_pmu->pmu = pmu; + } + + pr_info("enabled with %s PMU driver, %d counters available\n", + metag_pmu->name, metag_pmu->max_events); + + /* + * Early cores have "limited" counters - they have no overflow + * interrupts - and so are unable to do sampling without extra work + * and timer assistance. + */ + if (metag_pmu->max_period == 0) { + metag_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + } + + /* Initialise the active events and reservation mutex */ + atomic_set(&metag_pmu->active_events, 0); + mutex_init(&metag_pmu->reserve_mutex); + + /* Clear the counters */ + metag_out32(0, PERF_COUNT(0)); + metag_out32(0, PERF_COUNT(1)); + + for_each_possible_cpu(cpu) { + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + memset(cpuc, 0, sizeof(struct cpu_hw_events)); + raw_spin_lock_init(&cpuc->pmu_lock); + } + + register_cpu_notifier(&metag_pmu_notifier); + ret = perf_pmu_register(&pmu, metag_pmu->name, PERF_TYPE_RAW); +out: + return ret; +} +early_initcall(init_hw_perf_events); diff --git a/arch/metag/kernel/perf/perf_event.h b/arch/metag/kernel/perf/perf_event.h new file mode 100644 index 000000000..fd10a1345 --- /dev/null +++ b/arch/metag/kernel/perf/perf_event.h @@ -0,0 +1,106 @@ +/* + * Meta performance counter support. + * Copyright (C) 2012 Imagination Technologies Ltd + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. 
 */
+
+#ifndef METAG_PERF_EVENT_H_
+#define METAG_PERF_EVENT_H_
+
+#include
+#include
+#include
+
+/* For performance counter definitions */
+#include
+
+/*
+ * The Meta core has two performance counters, with 24-bit resolution. Newer
+ * cores generate an overflow interrupt on transition from 0xffffff to 0.
+ *
+ * Each counter consists of the counter id, hardware thread id, and the count
+ * itself; each counter can be assigned to multiple hardware threads at any
+ * one time, with the returned count being an aggregate of events. A small
+ * number of events are thread global, i.e. they count the aggregate of all
+ * threads' events, regardless of the thread selected.
+ *
+ * Newer cores can store an arbitrary 24-bit number in the counter, whereas
+ * older cores will clear the counter bits on write.
+ *
+ * We also have a pseudo-counter in the form of the thread active cycles
+ * counter (which, incidentally, is also bound to the TXTACTCYC register).
+ */
+
+#define MAX_HWEVENTS		3
+#define MAX_PERIOD		((1UL << 24) - 1)
+#define METAG_INST_COUNTER	(MAX_HWEVENTS - 1)
+
+/**
+ * struct cpu_hw_events - a processor core's performance events
+ * @events:	an array of perf_events active for a given index.
+ * @used_mask:	a bitmap of in-use counters.
+ * @pmu_lock:	a perf counter lock
+ *
+ * This is a per-cpu/core structure that maintains a record of its
+ * performance counters' state.
+ */
+struct cpu_hw_events {
+	struct perf_event	*events[MAX_HWEVENTS];
+	unsigned long		used_mask[BITS_TO_LONGS(MAX_HWEVENTS)];
+	raw_spinlock_t		pmu_lock;
+};
+
+/**
+ * struct metag_pmu - the Meta PMU structure
+ * @pmu:		core pmu structure
+ * @name:		pmu name
+ * @version:		core version
+ * @handle_irq:		overflow interrupt handler
+ * @enable:		enable a counter
+ * @disable:		disable a counter
+ * @read:		read the value of a counter
+ * @write:		write a value to a counter
+ * @event_map:		kernel event to counter event id map
+ * @cache_events:	kernel cache counter to core cache counter map
+ * @max_period:		maximum value of the counter before overflow
+ * @max_events:		maximum number of counters available at any one time
+ * @active_events:	number of active counters
+ * @reserve_mutex:	counter reservation mutex
+ *
+ * This describes the main functionality and data used by the performance
+ * event core.
+ */
+struct metag_pmu {
+	struct pmu	pmu;
+	const char	*name;
+	u32		version;
+	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
+	void		(*enable)(struct hw_perf_event *evt, int idx);
+	void		(*disable)(struct hw_perf_event *evt, int idx);
+	u64		(*read)(int idx);
+	void		(*write)(int idx, u32 val);
+	int		(*event_map)(int idx);
+	const int	(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX];
+	u32		max_period;
+	int		max_events;
+	atomic_t	active_events;
+	struct mutex	reserve_mutex;
+};
+
+/* Convenience macros for accessing the perf counter registers */
+#define PERF_COUNT(x)	(PERF_COUNT0 + (sizeof(u64) * (x)))
+#define PERF_ICORE(x)	(PERF_ICORE0 + (sizeof(u64) * (x)))
+#define PERF_CHAN(x)	(PERF_CHAN0 + (sizeof(u64) * (x)))
+
+/* Cache index macros */
+#define C(x) PERF_COUNT_HW_CACHE_##x
+#define CACHE_OP_UNSUPPORTED	0xfffe
+#define CACHE_OP_NONSENSE	0xffff
+
+#endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
new file mode 100644
index 000000000..315633461
--- /dev/null
+++ b/arch/metag/kernel/perf_callchain.c
@@ -0,0 +1,96 @@
+/*
+ * Perf callchain handling code.
+ *
+ * Based on the ARM perf implementation.
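+ *
+ * The kernel side unwinds real stack frames via walk_stackframe(); the
+ * user side cannot rely on frame pointers, so it scans the user stack for
+ * values that decode as a call site (see is_valid_call()).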
 */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+static bool is_valid_call(unsigned long calladdr)
+{
+	unsigned int callinsn;
+
+	/* Check the possible return address is aligned. */
+	if (!(calladdr & 0x3)) {
+		if (!get_user(callinsn, (unsigned int *)calladdr)) {
+			/* Check for CALLR or SWAP PC,D1RtP. */
+			if ((callinsn & 0xff000000) == 0xab000000 ||
+			    callinsn == 0xa3200aa0)
+				return true;
+		}
+	}
+	return false;
+}
+
+static struct metag_frame __user *
+user_backtrace(struct metag_frame __user *user_frame,
+	       struct perf_callchain_entry *entry)
+{
+	struct metag_frame frame;
+	unsigned long calladdr;
+
+	/* We cannot rely on having frame pointers in user code. */
+	while (1) {
+		/* Also check accessibility of one struct frame beyond */
+		if (!access_ok(VERIFY_READ, user_frame, sizeof(frame)))
+			return 0;
+		if (__copy_from_user_inatomic(&frame, user_frame,
+					      sizeof(frame)))
+			return 0;
+
+		--user_frame;
+
+		calladdr = frame.lr - 4;
+		if (is_valid_call(calladdr)) {
+			perf_callchain_store(entry, calladdr);
+			return user_frame;
+		}
+	}
+
+	return 0;
+}
+
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+{
+	unsigned long sp = regs->ctx.AX[0].U0;
+	struct metag_frame __user *frame;
+
+	frame = (struct metag_frame __user *)sp;
+
+	--frame;
+
+	while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+		frame = user_backtrace(frame, entry);
+}
+
+/*
+ * Gets called by walk_stackframe() for every stackframe. This will be called
+ * whilst unwinding the stackframe and is like a subroutine return, so we use
+ * the PC.
+ */
+static int
+callchain_trace(struct stackframe *fr,
+		void *data)
+{
+	struct perf_callchain_entry *entry = data;
+	perf_callchain_store(entry, fr->pc);
+	return 0;
+}
+
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+{
+	struct stackframe fr;
+
+	fr.fp = regs->ctx.AX[1].U0;
+	fr.sp = regs->ctx.AX[0].U0;
+	fr.lr = regs->ctx.DX[4].U1;
+	fr.pc = regs->ctx.CurrPC;
+	walk_stackframe(&fr, callchain_trace, entry);
+}
diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c
new file mode 100644
index 000000000..7f546183a
--- /dev/null
+++ b/arch/metag/kernel/process.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright (C) 2005,2006,2007,2008,2009,2010,2011 Imagination Technologies
+ *
+ * This file contains the architecture-dependent parts of process handling.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Wait for the next interrupt and enable local interrupts
+ */
+void arch_cpu_idle(void)
+{
+	int tmp;
+
+	/*
+	 * Quickly jump straight into the interrupt entry point without
+	 * actually triggering an interrupt. When TXSTATI gets read the
+	 * processor will block until an interrupt is triggered.
+	 */
+	asm volatile (/* Switch into ISTAT mode */
+		      "RTH\n\t"
+		      /* Enable local interrupts */
+		      "MOV	TXMASKI, %1\n\t"
+		      /*
+		       * We can't directly "SWAP PC, PCX", so we swap via a
+		       * temporary.
Essentially we do: + * PCX_new = 1f (the place to continue execution) + * PC = PCX_old + */ + "ADD %0, CPC0, #(1f-.)\n\t" + "SWAP PCX, %0\n\t" + "MOV PC, %0\n" + /* Continue execution here with interrupts enabled */ + "1:" + : "=a" (tmp) + : "r" (get_trigger_mask())); +} + +#ifdef CONFIG_HOTPLUG_CPU +void arch_cpu_idle_dead(void) +{ + cpu_die(); +} +#endif + +void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); + +void (*soc_restart)(char *cmd); +void (*soc_halt)(void); + +void machine_restart(char *cmd) +{ + if (soc_restart) + soc_restart(cmd); + hard_processor_halt(HALT_OK); +} + +void machine_halt(void) +{ + if (soc_halt) + soc_halt(); + smp_send_stop(); + hard_processor_halt(HALT_OK); +} + +void machine_power_off(void) +{ + if (pm_power_off) + pm_power_off(); + smp_send_stop(); + hard_processor_halt(HALT_OK); +} + +#define FLAG_Z 0x8 +#define FLAG_N 0x4 +#define FLAG_O 0x2 +#define FLAG_C 0x1 + +void show_regs(struct pt_regs *regs) +{ + int i; + const char *AX0_names[] = {"A0StP", "A0FrP"}; + const char *AX1_names[] = {"A1GbP", "A1LbP"}; + + const char *DX0_names[] = { + "D0Re0", + "D0Ar6", + "D0Ar4", + "D0Ar2", + "D0FrT", + "D0.5 ", + "D0.6 ", + "D0.7 " + }; + + const char *DX1_names[] = { + "D1Re0", + "D1Ar5", + "D1Ar3", + "D1Ar1", + "D1RtP", + "D1.5 ", + "D1.6 ", + "D1.7 " + }; + + show_regs_print_info(KERN_INFO); + + pr_info(" pt_regs @ %p\n", regs); + pr_info(" SaveMask = 0x%04hx\n", regs->ctx.SaveMask); + pr_info(" Flags = 0x%04hx (%c%c%c%c)\n", regs->ctx.Flags, + regs->ctx.Flags & FLAG_Z ? 'Z' : 'z', + regs->ctx.Flags & FLAG_N ? 'N' : 'n', + regs->ctx.Flags & FLAG_O ? 'O' : 'o', + regs->ctx.Flags & FLAG_C ? 'C' : 'c'); + pr_info(" TXRPT = 0x%08x\n", regs->ctx.CurrRPT); + pr_info(" PC = 0x%08x\n", regs->ctx.CurrPC); + + /* AX regs */ + for (i = 0; i < 2; i++) { + pr_info(" %s = 0x%08x ", + AX0_names[i], + regs->ctx.AX[i].U0); + printk(" %s = 0x%08x\n", + AX1_names[i], + regs->ctx.AX[i].U1); + } + + if (regs->ctx.SaveMask & TBICTX_XEXT_BIT) + pr_warn(" Extended state present - AX2.[01] will be WRONG\n"); + + /* Special place with AXx.2 */ + pr_info(" A0.2 = 0x%08x ", + regs->ctx.Ext.AX2.U0); + printk(" A1.2 = 0x%08x\n", + regs->ctx.Ext.AX2.U1); + + /* 'extended' AX regs (nominally, just AXx.3) */ + for (i = 0; i < (TBICTX_AX_REGS - 3); i++) { + pr_info(" A0.%d = 0x%08x ", i + 3, regs->ctx.AX3[i].U0); + printk(" A1.%d = 0x%08x\n", i + 3, regs->ctx.AX3[i].U1); + } + + for (i = 0; i < 8; i++) { + pr_info(" %s = 0x%08x ", DX0_names[i], regs->ctx.DX[i].U0); + printk(" %s = 0x%08x\n", DX1_names[i], regs->ctx.DX[i].U1); + } + + show_trace(NULL, (unsigned long *)regs->ctx.AX[0].U0, regs); +} + +/* + * Copy architecture-specific thread state + */ +int copy_thread(unsigned long clone_flags, unsigned long usp, + unsigned long kthread_arg, struct task_struct *tsk) +{ + struct pt_regs *childregs = task_pt_regs(tsk); + void *kernel_context = ((void *) childregs + + sizeof(struct pt_regs)); + unsigned long global_base; + + BUG_ON(((unsigned long)childregs) & 0x7); + BUG_ON(((unsigned long)kernel_context) & 0x7); + + memset(&tsk->thread.kernel_context, 0, + sizeof(tsk->thread.kernel_context)); + + tsk->thread.kernel_context = __TBISwitchInit(kernel_context, + ret_from_fork, + 0, 0); + + if (unlikely(tsk->flags & PF_KTHREAD)) { + /* + * Make sure we don't leak any kernel data to child's regs + * if kernel thread becomes a userspace thread in the future + */ + memset(childregs, 0 , sizeof(struct pt_regs)); + + global_base = __core_reg_get(A1GbP); + childregs->ctx.AX[0].U1 = (unsigned long) 
global_base;
+		childregs->ctx.AX[0].U0 = (unsigned long) kernel_context;
+		/* Set D1Ar1=kthread_arg and D1RtP=usp (fn) */
+		childregs->ctx.DX[4].U1 = usp;
+		childregs->ctx.DX[3].U1 = kthread_arg;
+		tsk->thread.int_depth = 2;
+		return 0;
+	}
+
+	/*
+	 * Get a pointer to where the new child's register block should have
+	 * been pushed.
+	 * The Meta's stack grows upwards, and the context is the first
+	 * thing to be pushed by TBX (phew)
+	 */
+	*childregs = *current_pt_regs();
+	/* Set the correct stack for the clone mode */
+	if (usp)
+		childregs->ctx.AX[0].U0 = ALIGN(usp, 8);
+	tsk->thread.int_depth = 1;
+
+	/* set return value for child process */
+	childregs->ctx.DX[0].U0 = 0;
+
+	/* The TLS pointer is passed as an argument to sys_clone. */
+	if (clone_flags & CLONE_SETTLS)
+		tsk->thread.tls_ptr =
+				(__force void __user *)childregs->ctx.DX[1].U1;
+
+#ifdef CONFIG_METAG_FPU
+	if (tsk->thread.fpu_context) {
+		struct meta_fpu_context *ctx;
+
+		ctx = kmemdup(tsk->thread.fpu_context,
+			      sizeof(struct meta_fpu_context), GFP_ATOMIC);
+		tsk->thread.fpu_context = ctx;
+	}
+#endif
+
+#ifdef CONFIG_METAG_DSP
+	if (tsk->thread.dsp_context) {
+		struct meta_ext_context *ctx;
+		int i;
+
+		ctx = kmemdup(tsk->thread.dsp_context,
+			      sizeof(struct meta_ext_context), GFP_ATOMIC);
+		for (i = 0; i < 2; i++)
+			ctx->ram[i] = kmemdup(ctx->ram[i], ctx->ram_sz[i],
+					      GFP_ATOMIC);
+		tsk->thread.dsp_context = ctx;
+	}
+#endif
+
+	return 0;
+}
+
+#ifdef CONFIG_METAG_FPU
+static void alloc_fpu_context(struct thread_struct *thread)
+{
+	thread->fpu_context = kzalloc(sizeof(struct meta_fpu_context),
+				      GFP_ATOMIC);
+}
+
+static void clear_fpu(struct thread_struct *thread)
+{
+	thread->user_flags &= ~TBICTX_FPAC_BIT;
+	kfree(thread->fpu_context);
+	thread->fpu_context = NULL;
+}
+#else
+static void clear_fpu(struct thread_struct *thread)
+{
+}
+#endif
+
+#ifdef CONFIG_METAG_DSP
+static void clear_dsp(struct thread_struct *thread)
+{
+	if (thread->dsp_context) {
+		kfree(thread->dsp_context->ram[0]);
+		kfree(thread->dsp_context->ram[1]);
+
+		kfree(thread->dsp_context);
+
+		thread->dsp_context = NULL;
+	}
+
+	__core_reg_set(D0.8, 0);
+}
+#else
+static void clear_dsp(struct thread_struct *thread)
+{
+}
+#endif
+
+struct task_struct *__sched __switch_to(struct task_struct *prev,
+					struct task_struct *next)
+{
+	TBIRES to, from;
+
+	to.Switch.pCtx = next->thread.kernel_context;
+	to.Switch.pPara = prev;
+
+#ifdef CONFIG_METAG_FPU
+	if (prev->thread.user_flags & TBICTX_FPAC_BIT) {
+		struct pt_regs *regs = task_pt_regs(prev);
+		TBIRES state;
+
+		state.Sig.SaveMask = prev->thread.user_flags;
+		state.Sig.pCtx = &regs->ctx;
+
+		if (!prev->thread.fpu_context)
+			alloc_fpu_context(&prev->thread);
+		if (prev->thread.fpu_context)
+			__TBICtxFPUSave(state, prev->thread.fpu_context);
+	}
+	/*
+	 * Force a restore of the FPU context next time this process is
+	 * scheduled.
+	 */
+	if (prev->thread.fpu_context)
+		prev->thread.fpu_context->needs_restore = true;
+#endif
+
+
+	from = __TBISwitch(to, &prev->thread.kernel_context);
+
+	/* Restore TLS pointer for this process. */
+	set_gateway_tls(current->thread.tls_ptr);
+
+	return (struct task_struct *) from.Switch.pPara;
+}
+
+void flush_thread(void)
+{
+	clear_fpu(&current->thread);
+	clear_dsp(&current->thread);
+}
+
+/*
+ * Free current thread data structures etc.
+ */
+void exit_thread(void)
+{
+	clear_fpu(&current->thread);
+	clear_dsp(&current->thread);
+}
+
+/* TODO: figure out how to unwind the kernel stack here to determine
+ * where we went to sleep.
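+ * Until then get_wchan() simply returns 0, which is reported as an
+ * unknown wait channel.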
 */
+unsigned long get_wchan(struct task_struct *p)
+{
+	return 0;
+}
+
+int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
+{
+	/* Returning 0 indicates that the FPU state was not stored (as it was
+	 * not in use) */
+	return 0;
+}
+
+#ifdef CONFIG_METAG_USER_TCM
+
+#define ELF_MIN_ALIGN	PAGE_SIZE
+
+#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1))
+#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
+#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
+
+#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+
+unsigned long __metag_elf_map(struct file *filep, unsigned long addr,
+			      struct elf_phdr *eppnt, int prot, int type,
+			      unsigned long total_size)
+{
+	unsigned long map_addr, size;
+	unsigned long page_off = ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long raw_size = eppnt->p_filesz + page_off;
+	unsigned long off = eppnt->p_offset - page_off;
+	unsigned int tcm_tag;
+	addr = ELF_PAGESTART(addr);
+	size = ELF_PAGEALIGN(raw_size);
+
+	/* mmap() will return -EINVAL if given a zero size, but a
+	 * segment with zero filesize is perfectly valid */
+	if (!size)
+		return addr;
+
+	tcm_tag = tcm_lookup_tag(addr);
+
+	if (tcm_tag != TCM_INVALID_TAG)
+		type &= ~MAP_FIXED;
+
+	/*
+	 * total_size is the size of the ELF (interpreter) image.
+	 * The _first_ mmap needs to know the full size, otherwise
+	 * randomization might put this image into an overlapping
+	 * position with the ELF binary image. (since size < total_size)
+	 * So we first map the 'big' image - and unmap the remainder at
+	 * the end. (which unmap is needed for ELF images with holes.)
+	 */
+	if (total_size) {
+		total_size = ELF_PAGEALIGN(total_size);
+		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+		if (!BAD_ADDR(map_addr))
+			vm_munmap(map_addr+size, total_size-size);
+	} else
+		map_addr = vm_mmap(filep, addr, size, prot, type, off);
+
+	if (!BAD_ADDR(map_addr) && tcm_tag != TCM_INVALID_TAG) {
+		struct tcm_allocation *tcm;
+		unsigned long tcm_addr;
+
+		tcm = kmalloc(sizeof(*tcm), GFP_KERNEL);
+		if (!tcm)
+			return -ENOMEM;
+
+		tcm_addr = tcm_alloc(tcm_tag, raw_size);
+		if (tcm_addr != addr) {
+			kfree(tcm);
+			return -ENOMEM;
+		}
+
+		tcm->tag = tcm_tag;
+		tcm->addr = tcm_addr;
+		tcm->size = raw_size;
+
+		list_add(&tcm->list, &current->mm->context.tcm);
+
+		eppnt->p_vaddr = map_addr;
+		if (copy_from_user((void *) addr, (void __user *) map_addr,
+				   raw_size))
+			return -EFAULT;
+	}
+
+	return map_addr;
+}
+#endif
diff --git a/arch/metag/kernel/ptrace.c b/arch/metag/kernel/ptrace.c
new file mode 100644
index 000000000..756362882
--- /dev/null
+++ b/arch/metag/kernel/ptrace.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (C) 2005-2012 Imagination Technologies Ltd.
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file COPYING in the main directory of
+ * this archive for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define CREATE_TRACE_POINTS
+#include
+
+/*
+ * user_regset definitions.
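+ *
+ * Layout of the general-purpose regset, as implied by the copyout/copyin
+ * helpers below (byte offsets):
+ *   0x00-0x3f  D{0-1}.{0-7}
+ *   0x40-0x4f  A{0-1}.{0-1}
+ *   0x50-0x57  A{0-1}.2
+ *   0x58-0x5f  A{0-1}.3
+ *   0x60       PC
+ *   0x64       TXSTATUS
+ *   0x68-0x73  TXRPT, TXBPOBITS, TXMODE
+ *   0x74-0x77  padding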
 */
+
+int metag_gp_regs_copyout(const struct pt_regs *regs,
+			  unsigned int pos, unsigned int count,
+			  void *kbuf, void __user *ubuf)
+{
+	const void *ptr;
+	unsigned long data;
+	int ret;
+
+	/* D{0-1}.{0-7} */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  regs->ctx.DX, 0, 4*16);
+	if (ret)
+		goto out;
+	/* A{0-1}.{0-1} */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  regs->ctx.AX, 4*16, 4*20);
+	if (ret)
+		goto out;
+	/* A{0-1}.2 */
+	if (regs->ctx.SaveMask & TBICTX_XEXT_BIT)
+		ptr = regs->ctx.Ext.Ctx.pExt;
+	else
+		ptr = &regs->ctx.Ext.AX2;
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  ptr, 4*20, 4*22);
+	if (ret)
+		goto out;
+	/* A{0-1}.3 */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &regs->ctx.AX3, 4*22, 4*24);
+	if (ret)
+		goto out;
+	/* PC */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &regs->ctx.CurrPC, 4*24, 4*25);
+	if (ret)
+		goto out;
+	/* TXSTATUS */
+	data = (unsigned long)regs->ctx.Flags;
+	if (regs->ctx.SaveMask & TBICTX_CBUF_BIT)
+		data |= USER_GP_REGS_STATUS_CATCH_BIT;
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &data, 4*25, 4*26);
+	if (ret)
+		goto out;
+	/* TXRPT, TXBPOBITS, TXMODE */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &regs->ctx.CurrRPT, 4*26, 4*29);
+	if (ret)
+		goto out;
+	/* Padding */
+	ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+				       4*29, 4*30);
+out:
+	return ret;
+}
+
+int metag_gp_regs_copyin(struct pt_regs *regs,
+			 unsigned int pos, unsigned int count,
+			 const void *kbuf, const void __user *ubuf)
+{
+	void *ptr;
+	unsigned long data;
+	int ret;
+
+	/* D{0-1}.{0-7} */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 regs->ctx.DX, 0, 4*16);
+	if (ret)
+		goto out;
+	/* A{0-1}.{0-1} */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 regs->ctx.AX, 4*16, 4*20);
+	if (ret)
+		goto out;
+	/* A{0-1}.2 */
+	if (regs->ctx.SaveMask & TBICTX_XEXT_BIT)
+		ptr = regs->ctx.Ext.Ctx.pExt;
+	else
+		ptr = &regs->ctx.Ext.AX2;
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 ptr, 4*20, 4*22);
+	if (ret)
+		goto out;
+	/* A{0-1}.3 */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &regs->ctx.AX3, 4*22, 4*24);
+	if (ret)
+		goto out;
+	/* PC */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &regs->ctx.CurrPC, 4*24, 4*25);
+	if (ret)
+		goto out;
+	/* TXSTATUS */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &data, 4*25, 4*26);
+	if (ret)
+		goto out;
+	regs->ctx.Flags = data & 0xffff;
+	if (data & USER_GP_REGS_STATUS_CATCH_BIT)
+		regs->ctx.SaveMask |= TBICTX_XCBF_BIT | TBICTX_CBUF_BIT;
+	else
+		regs->ctx.SaveMask &= ~TBICTX_CBUF_BIT;
+	/* TXRPT, TXBPOBITS, TXMODE */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &regs->ctx.CurrRPT, 4*26, 4*29);
+out:
+	return ret;
+}
+
+static int metag_gp_regs_get(struct task_struct *target,
+			     const struct user_regset *regset,
+			     unsigned int pos, unsigned int count,
+			     void *kbuf, void __user *ubuf)
+{
+	const struct pt_regs *regs = task_pt_regs(target);
+	return metag_gp_regs_copyout(regs, pos, count, kbuf, ubuf);
+}
+
+static int metag_gp_regs_set(struct task_struct *target,
+			     const struct user_regset *regset,
+			     unsigned int pos, unsigned int count,
+			     const void *kbuf, const void __user *ubuf)
+{
+	struct pt_regs *regs = task_pt_regs(target);
+	return metag_gp_regs_copyin(regs, pos, count, kbuf, ubuf);
+}
+
+int metag_cb_regs_copyout(const struct pt_regs *regs,
+			  unsigned int pos, unsigned int count,
+			  void *kbuf, void __user *ubuf)
+{
+	int ret;
+
+	/* TXCATCH{0-3} */
+	if (regs->ctx.SaveMask &
TBICTX_XCBF_BIT)
+		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+					  regs->extcb0, 0, 4*4);
+	else
+		ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					       0, 4*4);
+	return ret;
+}
+
+int metag_cb_regs_copyin(struct pt_regs *regs,
+			 unsigned int pos, unsigned int count,
+			 const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+
+	/* TXCATCH{0-3} */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 regs->extcb0, 0, 4*4);
+	return ret;
+}
+
+static int metag_cb_regs_get(struct task_struct *target,
+			     const struct user_regset *regset,
+			     unsigned int pos, unsigned int count,
+			     void *kbuf, void __user *ubuf)
+{
+	const struct pt_regs *regs = task_pt_regs(target);
+	return metag_cb_regs_copyout(regs, pos, count, kbuf, ubuf);
+}
+
+static int metag_cb_regs_set(struct task_struct *target,
+			     const struct user_regset *regset,
+			     unsigned int pos, unsigned int count,
+			     const void *kbuf, const void __user *ubuf)
+{
+	struct pt_regs *regs = task_pt_regs(target);
+	return metag_cb_regs_copyin(regs, pos, count, kbuf, ubuf);
+}
+
+int metag_rp_state_copyout(const struct pt_regs *regs,
+			   unsigned int pos, unsigned int count,
+			   void *kbuf, void __user *ubuf)
+{
+	unsigned long mask;
+	u64 *ptr;
+	int ret, i;
+
+	/* Empty read pipeline */
+	if (!(regs->ctx.SaveMask & TBICTX_CBRP_BIT)) {
+		ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					       0, 4*13);
+		goto out;
+	}
+
+	mask = (regs->ctx.CurrDIVTIME & TXDIVTIME_RPMASK_BITS) >>
+	       TXDIVTIME_RPMASK_S;
+
+	/* Read pipeline entries */
+	ptr = (void *)&regs->extcb0[1];
+	for (i = 0; i < 6; ++i, ++ptr) {
+		if (mask & (1 << i))
+			ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+						  ptr, 8*i, 8*(i + 1));
+		else
+			ret = user_regset_copyout_zero(&pos, &count, &kbuf,
+						       &ubuf, 8*i, 8*(i + 1));
+		if (ret)
+			goto out;
+	}
+	/* Mask of entries */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &mask, 4*12, 4*13);
+out:
+	return ret;
+}
+
+int metag_rp_state_copyin(struct pt_regs *regs,
+			  unsigned int pos, unsigned int count,
+			  const void *kbuf, const void __user *ubuf)
+{
+	struct user_rp_state rp;
+	unsigned long long *ptr;
+	int ret, i;
+
+	/* Read the entire pipeline before making any changes */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &rp, 0, 4*13);
+	if (ret)
+		goto out;
+
+	/* Write pipeline entries */
+	ptr = (void *)&regs->extcb0[1];
+	for (i = 0; i < 6; ++i, ++ptr)
+		if (rp.mask & (1 << i))
+			*ptr = rp.entries[i];
+
+	/* Update RPMask in TXDIVTIME */
+	regs->ctx.CurrDIVTIME &= ~TXDIVTIME_RPMASK_BITS;
+	regs->ctx.CurrDIVTIME |= (rp.mask << TXDIVTIME_RPMASK_S)
+				 & TXDIVTIME_RPMASK_BITS;
+
+	/* Set/clear flags to indicate catch/read pipeline state */
+	if (rp.mask)
+		regs->ctx.SaveMask |= TBICTX_XCBF_BIT | TBICTX_CBRP_BIT;
+	else
+		regs->ctx.SaveMask &= ~TBICTX_CBRP_BIT;
+out:
+	return ret;
+}
+
+static int metag_rp_state_get(struct task_struct *target,
+			      const struct user_regset *regset,
+			      unsigned int pos, unsigned int count,
+			      void *kbuf, void __user *ubuf)
+{
+	const struct pt_regs *regs = task_pt_regs(target);
+	return metag_rp_state_copyout(regs, pos, count, kbuf, ubuf);
+}
+
+static int metag_rp_state_set(struct task_struct *target,
+			      const struct user_regset *regset,
+			      unsigned int pos, unsigned int count,
+			      const void *kbuf, const void __user *ubuf)
+{
+	struct pt_regs *regs = task_pt_regs(target);
+	return metag_rp_state_copyin(regs, pos, count, kbuf, ubuf);
+}
+
+static int metag_tls_get(struct task_struct *target,
+			 const struct user_regset *regset,
+			 unsigned int pos, unsigned int count,
+			 void *kbuf,
void __user *ubuf)
+{
+	void __user *tls = target->thread.tls_ptr;
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &tls, 0, -1);
+}
+
+static int metag_tls_set(struct task_struct *target,
+			 const struct user_regset *regset,
+			 unsigned int pos, unsigned int count,
+			 const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+	void __user *tls;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1);
+	if (ret)
+		return ret;
+
+	target->thread.tls_ptr = tls;
+	return ret;
+}
+
+enum metag_regset {
+	REGSET_GENERAL,
+	REGSET_CBUF,
+	REGSET_READPIPE,
+	REGSET_TLS,
+};
+
+static const struct user_regset metag_regsets[] = {
+	[REGSET_GENERAL] = {
+		.core_note_type = NT_PRSTATUS,
+		.n = ELF_NGREG,
+		.size = sizeof(long),
+		.align = sizeof(long long),
+		.get = metag_gp_regs_get,
+		.set = metag_gp_regs_set,
+	},
+	[REGSET_CBUF] = {
+		.core_note_type = NT_METAG_CBUF,
+		.n = sizeof(struct user_cb_regs) / sizeof(long),
+		.size = sizeof(long),
+		.align = sizeof(long long),
+		.get = metag_cb_regs_get,
+		.set = metag_cb_regs_set,
+	},
+	[REGSET_READPIPE] = {
+		.core_note_type = NT_METAG_RPIPE,
+		.n = sizeof(struct user_rp_state) / sizeof(long),
+		.size = sizeof(long),
+		.align = sizeof(long long),
+		.get = metag_rp_state_get,
+		.set = metag_rp_state_set,
+	},
+	[REGSET_TLS] = {
+		.core_note_type = NT_METAG_TLS,
+		.n = 1,
+		.size = sizeof(void *),
+		.align = sizeof(void *),
+		.get = metag_tls_get,
+		.set = metag_tls_set,
+	},
+};
+
+static const struct user_regset_view user_metag_view = {
+	.name = "metag",
+	.e_machine = EM_METAG,
+	.regsets = metag_regsets,
+	.n = ARRAY_SIZE(metag_regsets)
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+	return &user_metag_view;
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching..
+ *
+ * Make sure single step bits etc are not set.
+ */
+void ptrace_disable(struct task_struct *child)
+{
+	/* nothing to do.. */
+}
+
+long arch_ptrace(struct task_struct *child, long request, unsigned long addr,
+		 unsigned long data)
+{
+	int ret;
+
+	switch (request) {
+	default:
+		ret = ptrace_request(child, request, addr, data);
+		break;
+	}
+
+	return ret;
+}
+
+int syscall_trace_enter(struct pt_regs *regs)
+{
+	int ret = 0;
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE))
+		ret = tracehook_report_syscall_entry(regs);
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_enter(regs, regs->ctx.DX[0].U1);
+
+	return ret ? -1 : regs->ctx.DX[0].U1;
+}
+
+void syscall_trace_leave(struct pt_regs *regs)
+{
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_exit(regs, regs->ctx.DX[0].U1);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(regs, 0);
+}
diff --git a/arch/metag/kernel/setup.c b/arch/metag/kernel/setup.c
new file mode 100644
index 000000000..31cf53d0e
--- /dev/null
+++ b/arch/metag/kernel/setup.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (C) 2005-2012 Imagination Technologies Ltd.
+ *
+ * This file contains the architecture-dependent parts of system setup.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Priv protect as many registers as possible.
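+ * Each bit set in TXPRIVEXT makes the corresponding register or feature
+ * privileged, so user code traps into the kernel if it touches it; the
+ * values built up here are written out in setup_priv() below.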
 */
+#define DEFAULT_PRIV	(TXPRIVEXT_COPRO_BITS		| \
+			 TXPRIVEXT_TXTRIGGER_BIT	| \
+			 TXPRIVEXT_TXGBLCREG_BIT	| \
+			 TXPRIVEXT_ILOCK_BIT		| \
+			 TXPRIVEXT_TXITACCYC_BIT	| \
+			 TXPRIVEXT_TXDIVTIME_BIT	| \
+			 TXPRIVEXT_TXAMAREGX_BIT	| \
+			 TXPRIVEXT_TXTIMERI_BIT		| \
+			 TXPRIVEXT_TXSTATUS_BIT		| \
+			 TXPRIVEXT_TXDISABLE_BIT)
+
+/* Meta2 specific bits. */
+#ifdef CONFIG_METAG_META12
+#define META2_PRIV	0
+#else
+#define META2_PRIV	(TXPRIVEXT_TXTIMER_BIT | \
+			 TXPRIVEXT_TRACE_BIT)
+#endif
+
+/* Unaligned access checking bits. */
+#ifdef CONFIG_METAG_UNALIGNED
+#define UNALIGNED_PRIV	TXPRIVEXT_ALIGNREW_BIT
+#else
+#define UNALIGNED_PRIV	0
+#endif
+
+#define PRIV_BITS	(DEFAULT_PRIV | \
+			 META2_PRIV | \
+			 UNALIGNED_PRIV)
+
+/*
+ * Protect access to:
+ * 0x06000000-0x07ffffff Direct mapped region
+ * 0x05000000-0x05ffffff MMU table region (Meta1)
+ * 0x04400000-0x047fffff Cache flush region
+ * 0x84000000-0x87ffffff Core cache memory region (Meta2)
+ *
+ * Allow access to:
+ * 0x80000000-0x81ffffff Core code memory region (Meta2)
+ */
+#ifdef CONFIG_METAG_META12
+#define PRIVSYSR_BITS	TXPRIVSYSR_ALL_BITS
+#else
+#define PRIVSYSR_BITS	(TXPRIVSYSR_ALL_BITS & ~TXPRIVSYSR_CORECODE_BIT)
+#endif
+
+/* Protect all 0x02xxxxxx and 0x048xxxxx. */
+#define PIOREG_BITS	0xffffffff
+
+/*
+ * Protect all 0x04000xx0 (system events)
+ * except write combiner flush and write fence (system events 4 and 5).
+ */
+#define PSYREG_BITS	0xfffffffb
+
+
+extern char _heap_start[];
+
+#ifdef CONFIG_DA_CONSOLE
+/* Our early channel based console driver */
+extern struct console dash_console;
+#endif
+
+const struct machine_desc *machine_desc __initdata;
+
+/*
+ * Map a Linux CPU number to a hardware thread ID
+ * In SMP this will be set up with the correct mapping at startup; in UP this
+ * will map to the HW thread on which we are running.
+ */
+u8 cpu_2_hwthread_id[NR_CPUS] __read_mostly = {
+	[0 ... NR_CPUS-1] = BAD_HWTHREAD_ID
+};
+EXPORT_SYMBOL_GPL(cpu_2_hwthread_id);
+
+/*
+ * Map a hardware thread ID to a Linux CPU number
+ * In SMP this will be fleshed out with the correct CPU ID for a particular
+ * hardware thread. In UP this will be initialised with the boot CPU ID.
+ */
+u8 hwthread_id_2_cpu[4] __read_mostly = {
+	[0 ... 3] = BAD_CPU_ID
+};
+
+/* The relative offset of the MMU mapped memory (from ldlk or bootloader)
+ * to the real physical memory. This is needed as we have to use the
+ * physical addresses in the MMU tables (pte entries), and not the virtual
+ * addresses.
+ * This variable is used in the __pa() and __va() macros, and should
+ * probably only be used via them.
+ */
+unsigned int meta_memoffset;
+EXPORT_SYMBOL(meta_memoffset);
+
+static char __initdata *original_cmd_line;
+
+DEFINE_PER_CPU(PTBI, pTBI);
+
+/*
+ * Mappings are specified as "CPU_ID:HWTHREAD_ID", e.g.
+ * + * "hwthread_map=0:1,1:2,2:3,3:0" + * + * Linux CPU ID HWTHREAD_ID + * --------------------------- + * 0 1 + * 1 2 + * 2 3 + * 3 0 + */ +static int __init parse_hwthread_map(char *p) +{ + int cpu; + + while (*p) { + cpu = (*p++) - '0'; + if (cpu < 0 || cpu > 9) + goto err_cpu; + + p++; /* skip semi-colon */ + cpu_2_hwthread_id[cpu] = (*p++) - '0'; + if (cpu_2_hwthread_id[cpu] >= 4) + goto err_thread; + hwthread_id_2_cpu[cpu_2_hwthread_id[cpu]] = cpu; + + if (*p == ',') + p++; /* skip comma */ + } + + return 0; +err_cpu: + pr_err("%s: hwthread_map cpu argument out of range\n", __func__); + return -EINVAL; +err_thread: + pr_err("%s: hwthread_map thread argument out of range\n", __func__); + return -EINVAL; +} +early_param("hwthread_map", parse_hwthread_map); + +void __init dump_machine_table(void) +{ + struct machine_desc *p; + const char **compat; + + pr_info("Available machine support:\n\tNAME\t\tCOMPATIBLE LIST\n"); + for_each_machine_desc(p) { + pr_info("\t%s\t[", p->name); + for (compat = p->dt_compat; compat && *compat; ++compat) + printk(" '%s'", *compat); + printk(" ]\n"); + } + + pr_info("\nPlease check your kernel config and/or bootloader.\n"); + + hard_processor_halt(HALT_PANIC); +} + +#ifdef CONFIG_METAG_HALT_ON_PANIC +static int metag_panic_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + hard_processor_halt(HALT_PANIC); + return NOTIFY_DONE; +} + +static struct notifier_block metag_panic_block = { + metag_panic_event, + NULL, + 0 +}; +#endif + +void __init setup_arch(char **cmdline_p) +{ + unsigned long start_pfn; + unsigned long text_start = (unsigned long)(&_stext); + unsigned long cpu = smp_processor_id(); + unsigned long heap_start, heap_end; + unsigned long start_pte; + PTBI _pTBI; + PTBISEG p_heap; + int heap_id, i; + + metag_cache_probe(); + + metag_da_probe(); +#ifdef CONFIG_DA_CONSOLE + if (metag_da_enabled()) { + /* An early channel based console driver */ + register_console(&dash_console); + add_preferred_console("ttyDA", 1, NULL); + } +#endif + + /* try interpreting the argument as a device tree */ + machine_desc = setup_machine_fdt(original_cmd_line); + /* if it doesn't look like a device tree it must be a command line */ + if (!machine_desc) { +#ifdef CONFIG_METAG_BUILTIN_DTB + /* try the embedded device tree */ + machine_desc = setup_machine_fdt(__dtb_start); + if (!machine_desc) + panic("Invalid embedded device tree."); +#else + /* use the default machine description */ + machine_desc = default_machine_desc(); +#endif +#ifndef CONFIG_CMDLINE_FORCE + /* append the bootloader cmdline to any builtin fdt cmdline */ + if (boot_command_line[0] && original_cmd_line[0]) + strlcat(boot_command_line, " ", COMMAND_LINE_SIZE); + strlcat(boot_command_line, original_cmd_line, + COMMAND_LINE_SIZE); +#endif + } + setup_meta_clocks(machine_desc->clocks); + + *cmdline_p = boot_command_line; + parse_early_param(); + + /* + * Make sure we don't alias in dcache or icache + */ + check_for_cache_aliasing(cpu); + + +#ifdef CONFIG_METAG_HALT_ON_PANIC + atomic_notifier_chain_register(&panic_notifier_list, + &metag_panic_block); +#endif + +#ifdef CONFIG_DUMMY_CONSOLE + conswitchp = &dummy_con; +#endif + + if (!(__core_reg_get(TXSTATUS) & TXSTATUS_PSTAT_BIT)) + panic("Privilege must be enabled for this thread."); + + _pTBI = __TBI(TBID_ISTAT_BIT); + + per_cpu(pTBI, cpu) = _pTBI; + + if (!per_cpu(pTBI, cpu)) + panic("No TBI found!"); + + /* + * Initialize all interrupt vectors to our copy of __TBIUnExpXXX, + * rather than the version from the bootloader. 
This makes call
+	 * stacks easier to understand and may allow us to unmap the
+	 * bootloader at some point.
+	 */
+	for (i = 0; i <= TBID_SIGNUM_MAX; i++)
+		_pTBI->fnSigs[i] = __TBIUnExpXXX;
+
+	/* A Meta requirement is that the kernel is loaded (virtually)
+	 * at the PAGE_OFFSET.
+	 */
+	if (PAGE_OFFSET != text_start)
+		panic("Kernel not loaded at PAGE_OFFSET (%#x) but at %#lx.",
+		      PAGE_OFFSET, text_start);
+
+	start_pte = mmu_read_second_level_page(text_start);
+
+	/*
+	 * Kernel pages should have the PRIV bit set by the bootloader.
+	 */
+	if (!(start_pte & _PAGE_KERNEL))
+		panic("kernel pte does not have PRIV set");
+
+	/*
+	 * See __pa and __va in include/asm/page.h.
+	 * This value is negative when running in local space but the
+	 * calculations work anyway.
+	 */
+	meta_memoffset = text_start - (start_pte & PAGE_MASK);
+
+	/* Now let's look at the heap space */
+	heap_id = (__TBIThreadId() & TBID_THREAD_BITS)
+		+ TBID_SEG(0, TBID_SEGSCOPE_LOCAL, TBID_SEGTYPE_HEAP);
+
+	p_heap = __TBIFindSeg(NULL, heap_id);
+
+	if (!p_heap)
+		panic("Could not find heap from TBI!");
+
+	/* The heap begins at the first full page after the kernel data. */
+	heap_start = (unsigned long) &_heap_start;
+
+	/* The heap ends at the end of the heap segment specified with
+	 * ldlk.
+	 */
+	if (is_global_space(text_start)) {
+		pr_debug("WARNING: running in global space!\n");
+		heap_end = (unsigned long)p_heap->pGAddr + p_heap->Bytes;
+	} else {
+		heap_end = (unsigned long)p_heap->pLAddr + p_heap->Bytes;
+	}
+
+	ROOT_DEV = Root_RAM0;
+
+	/* init_mm is the mm struct used for the first task. It is then
+	 * cloned for all other tasks spawned from that task.
+	 *
+	 * Note - we are using the virtual addresses here.
+	 */
+	init_mm.start_code = (unsigned long)(&_stext);
+	init_mm.end_code = (unsigned long)(&_etext);
+	init_mm.end_data = (unsigned long)(&_edata);
+	init_mm.brk = (unsigned long)heap_start;
+
+	min_low_pfn = PFN_UP(__pa(text_start));
+	max_low_pfn = PFN_DOWN(__pa(heap_end));
+
+	pfn_base = min_low_pfn;
+
+	/* Round max_pfn up to a 4Mb boundary. The free_bootmem_node()
+	 * call later makes sure to keep the rounded up pages marked reserved.
+	 */
+	max_pfn = max_low_pfn + ((1 << MAX_ORDER) - 1);
+	max_pfn &= ~((1 << MAX_ORDER) - 1);
+
+	start_pfn = PFN_UP(__pa(heap_start));
+
+	if (min_low_pfn & ((1 << MAX_ORDER) - 1)) {
+		/* Theoretically, we could expand the space that the
+		 * bootmem allocator covers - much as we do for the
+		 * 'high' address, and then tell the bootmem system
+		 * that the lowest chunk is 'not available'. Right
+		 * now it is just much easier to constrain the
+		 * user to always MAX_ORDER align their kernel space.
+		 */
+
+		panic("Kernel must be %d byte aligned, currently at %#lx.",
+		      1 << (MAX_ORDER + PAGE_SHIFT),
+		      min_low_pfn << PAGE_SHIFT);
+	}
+
+#ifdef CONFIG_HIGHMEM
+	highstart_pfn = highend_pfn = max_pfn;
+	high_memory = (void *) __va(PFN_PHYS(highstart_pfn));
+#else
+	high_memory = (void *)__va(PFN_PHYS(max_pfn));
+#endif
+
+	paging_init(heap_end);
+
+	setup_priv();
+
+	/* Set up the boot cpu's mapping. The rest will be set up below.
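+	 * (cpu_2_hwthread_id and hwthread_id_2_cpu together form the two-way
+	 * CPU <-> hardware thread mapping: from an hwthread_map= parameter if
+	 * one was given, otherwise from the thread we booted on, as here.)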
*/ + cpu_2_hwthread_id[smp_processor_id()] = hard_processor_id(); + hwthread_id_2_cpu[hard_processor_id()] = smp_processor_id(); + + unflatten_and_copy_device_tree(); + +#ifdef CONFIG_SMP + smp_init_cpus(); +#endif + + if (machine_desc->init_early) + machine_desc->init_early(); +} + +static int __init customize_machine(void) +{ + /* customizes platform devices, or adds new ones */ + if (machine_desc->init_machine) + machine_desc->init_machine(); + else + of_platform_populate(NULL, of_default_bus_match_table, NULL, + NULL); + return 0; +} +arch_initcall(customize_machine); + +static int __init init_machine_late(void) +{ + if (machine_desc->init_late) + machine_desc->init_late(); + return 0; +} +late_initcall(init_machine_late); + +#ifdef CONFIG_PROC_FS +/* + * Get CPU information for use by the procfs. + */ +static const char *get_cpu_capabilities(unsigned int txenable) +{ +#ifdef CONFIG_METAG_META21 + /* See CORE_ID in META HTP.GP TRM - Architecture Overview 2.1.238 */ + int coreid = metag_in32(METAC_CORE_ID); + unsigned int dsp_type = (coreid >> 3) & 7; + unsigned int fpu_type = (coreid >> 7) & 3; + + switch (dsp_type | fpu_type << 3) { + case (0x00): return "EDSP"; + case (0x01): return "DSP"; + case (0x08): return "EDSP+LFPU"; + case (0x09): return "DSP+LFPU"; + case (0x10): return "EDSP+FPU"; + case (0x11): return "DSP+FPU"; + } + return "UNKNOWN"; + +#else + if (!(txenable & TXENABLE_CLASS_BITS)) + return "DSP"; + else + return ""; +#endif +} + +static int show_cpuinfo(struct seq_file *m, void *v) +{ + const char *cpu; + unsigned int txenable, thread_id, major, minor; + unsigned long clockfreq = get_coreclock(); +#ifdef CONFIG_SMP + int i; + unsigned long lpj; +#endif + + cpu = "META"; + + txenable = __core_reg_get(TXENABLE); + major = (txenable & TXENABLE_MAJOR_REV_BITS) >> TXENABLE_MAJOR_REV_S; + minor = (txenable & TXENABLE_MINOR_REV_BITS) >> TXENABLE_MINOR_REV_S; + thread_id = (txenable >> 8) & 0x3; + +#ifdef CONFIG_SMP + for_each_online_cpu(i) { + lpj = per_cpu(cpu_data, i).loops_per_jiffy; + txenable = core_reg_read(TXUCT_ID, TXENABLE_REGNUM, + cpu_2_hwthread_id[i]); + + seq_printf(m, "CPU:\t\t%s %d.%d (thread %d)\n" + "Clocking:\t%lu.%1luMHz\n" + "BogoMips:\t%lu.%02lu\n" + "Calibration:\t%lu loops\n" + "Capabilities:\t%s\n\n", + cpu, major, minor, i, + clockfreq / 1000000, (clockfreq / 100000) % 10, + lpj / (500000 / HZ), (lpj / (5000 / HZ)) % 100, + lpj, + get_cpu_capabilities(txenable)); + } +#else + seq_printf(m, "CPU:\t\t%s %d.%d (thread %d)\n" + "Clocking:\t%lu.%1luMHz\n" + "BogoMips:\t%lu.%02lu\n" + "Calibration:\t%lu loops\n" + "Capabilities:\t%s\n", + cpu, major, minor, thread_id, + clockfreq / 1000000, (clockfreq / 100000) % 10, + loops_per_jiffy / (500000 / HZ), + (loops_per_jiffy / (5000 / HZ)) % 100, + loops_per_jiffy, + get_cpu_capabilities(txenable)); +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_METAG_L2C + if (meta_l2c_is_present()) { + seq_printf(m, "L2 cache:\t%s\n" + "L2 cache size:\t%d KB\n", + meta_l2c_is_enabled() ? 
"enabled" : "disabled", + meta_l2c_size() >> 10); + } +#endif + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + return (void *)(*pos == 0); +} +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + return NULL; +} +static void c_stop(struct seq_file *m, void *v) +{ +} +const struct seq_operations cpuinfo_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; +#endif /* CONFIG_PROC_FS */ + +void __init metag_start_kernel(char *args) +{ + /* Zero the timer register so timestamps are from the point at + * which the kernel started running. + */ + __core_reg_set(TXTIMER, 0); + + /* Clear the bss. */ + memset(__bss_start, 0, + (unsigned long)__bss_stop - (unsigned long)__bss_start); + + /* Remember where these are for use in setup_arch */ + original_cmd_line = args; + + current_thread_info()->cpu = hard_processor_id(); + + start_kernel(); +} + +/** + * setup_priv() - Set up privilege protection registers. + * + * Set up privilege protection registers such as TXPRIVEXT to prevent userland + * from touching our precious registers and sensitive memory areas. + */ +void setup_priv(void) +{ + unsigned int offset = hard_processor_id() << TXPRIVREG_STRIDE_S; + + __core_reg_set(TXPRIVEXT, PRIV_BITS); + + metag_out32(PRIVSYSR_BITS, T0PRIVSYSR + offset); + metag_out32(PIOREG_BITS, T0PIOREG + offset); + metag_out32(PSYREG_BITS, T0PSYREG + offset); +} + +PTBI pTBI_get(unsigned int cpu) +{ + return per_cpu(pTBI, cpu); +} +EXPORT_SYMBOL(pTBI_get); + +#if defined(CONFIG_METAG_DSP) && defined(CONFIG_METAG_FPU) +static char capabilities[] = "dsp fpu"; +#elif defined(CONFIG_METAG_DSP) +static char capabilities[] = "dsp"; +#elif defined(CONFIG_METAG_FPU) +static char capabilities[] = "fpu"; +#else +static char capabilities[] = ""; +#endif + +static struct ctl_table caps_kern_table[] = { + { + .procname = "capabilities", + .data = capabilities, + .maxlen = sizeof(capabilities), + .mode = 0444, + .proc_handler = proc_dostring, + }, + {} +}; + +static struct ctl_table caps_root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = caps_kern_table, + }, + {} +}; + +static int __init capabilities_register_sysctl(void) +{ + struct ctl_table_header *caps_table_header; + + caps_table_header = register_sysctl_table(caps_root_table); + if (!caps_table_header) { + pr_err("Unable to register CAPABILITIES sysctl\n"); + return -ENOMEM; + } + + return 0; +} + +core_initcall(capabilities_register_sysctl); diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c new file mode 100644 index 000000000..ce49d429c --- /dev/null +++ b/arch/metag/kernel/signal.c @@ -0,0 +1,334 @@ +/* + * Copyright (C) 1991,1992 Linus Torvalds + * Copyright (C) 2005-2012 Imagination Technologies Ltd. 
+ * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define REG_FLAGS ctx.SaveMask +#define REG_RETVAL ctx.DX[0].U0 +#define REG_SYSCALL ctx.DX[0].U1 +#define REG_SP ctx.AX[0].U0 +#define REG_ARG1 ctx.DX[3].U1 +#define REG_ARG2 ctx.DX[3].U0 +#define REG_ARG3 ctx.DX[2].U1 +#define REG_PC ctx.CurrPC +#define REG_RTP ctx.DX[4].U1 + +struct rt_sigframe { + struct siginfo info; + struct ucontext uc; + unsigned long retcode[2]; +}; + +static int restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *sc) +{ + int err; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + err = metag_gp_regs_copyin(regs, 0, sizeof(struct user_gp_regs), NULL, + &sc->regs); + if (!err) + err = metag_cb_regs_copyin(regs, 0, + sizeof(struct user_cb_regs), NULL, + &sc->cb); + if (!err) + err = metag_rp_state_copyin(regs, 0, + sizeof(struct user_rp_state), NULL, + &sc->rp); + + /* This is a user-mode context. */ + regs->REG_FLAGS |= TBICTX_PRIV_BIT; + + return err; +} + +long sys_rt_sigreturn(void) +{ + /* NOTE - Meta stack goes UPWARDS - so we wind the stack back */ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe __user *frame; + sigset_t set; + + frame = (__force struct rt_sigframe __user *)(regs->REG_SP - + sizeof(*frame)); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + goto badframe; + + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->REG_RETVAL; + +badframe: + force_sig(SIGSEGV, current); + + return 0; +} + +static int setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, + unsigned long mask) +{ + int err; + + err = metag_gp_regs_copyout(regs, 0, sizeof(struct user_gp_regs), NULL, + &sc->regs); + + if (!err) + err = metag_cb_regs_copyout(regs, 0, + sizeof(struct user_cb_regs), NULL, + &sc->cb); + if (!err) + err = metag_rp_state_copyout(regs, 0, + sizeof(struct user_rp_state), NULL, + &sc->rp); + + /* OK, clear that cbuf flag in the old context, or our stored + * catch buffer will be restored when we go to call the signal + * handler. Also clear out the CBRP RA/RD pipe bit incase + * that is pending as well! + * Note that as we have already stored this context, these + * flags will get restored on sigreturn to their original + * state. + */ + regs->REG_FLAGS &= ~(TBICTX_XCBF_BIT | TBICTX_CBUF_BIT | + TBICTX_CBRP_BIT); + + /* Clear out the LSM_STEP bits in case we are in the middle of + * and MSET/MGET. + */ + regs->ctx.Flags &= ~TXSTATUS_LSM_STEP_BITS; + + err |= __put_user(mask, &sc->oldmask); + + return err; +} + +/* + * Determine which stack to use.. 
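Because the Meta stack grows upwards, get_sigframe() just below rounds the stack pointer up to the next 8-byte boundary instead of masking it down as a downward-growing architecture would. Toy arithmetic only:

#include <stdio.h>

int main(void)
{
        unsigned long sp = 0x1003;

        printf("align up:   %#lx\n", (sp + 7) & ~7UL);  /* 0x1008 */
        printf("align down: %#lx\n", sp & ~7UL);        /* 0x1000 */
        return 0;
}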
+ */ +static void __user *get_sigframe(struct ksignal *ksig, unsigned long sp) +{ + sp = sigsp(sp, ksig); + sp = (sp + 7) & ~7; /* 8byte align stack */ + + return (void __user *)sp; +} + +static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + int err; + unsigned long code; + + frame = get_sigframe(ksig, regs->REG_SP); + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + err = copy_siginfo_to_user(&frame->info, &ksig->info); + + /* Create the ucontext. */ + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, (unsigned long __user *)&frame->uc.uc_link); + err |= __save_altstack(&frame->uc.uc_stack, regs->REG_SP); + err |= setup_sigcontext(&frame->uc.uc_mcontext, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (err) + return -EFAULT; + + /* Set up to return from userspace. */ + + /* MOV D1Re0 (D1.0), #__NR_rt_sigreturn */ + code = 0x03000004 | (__NR_rt_sigreturn << 3); + err |= __put_user(code, (unsigned long __user *)(&frame->retcode[0])); + + /* SWITCH #__METAG_SW_SYS */ + code = __METAG_SW_ENCODING(SYS); + err |= __put_user(code, (unsigned long __user *)(&frame->retcode[1])); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->REG_RTP = (unsigned long) frame->retcode; + regs->REG_SP = (unsigned long) frame + sizeof(*frame); + regs->REG_ARG1 = ksig->sig; + regs->REG_ARG2 = (unsigned long) &frame->info; + regs->REG_ARG3 = (unsigned long) &frame->uc; + regs->REG_PC = (unsigned long) ksig->ka.sa.sa_handler; + + pr_debug("SIG deliver (%s:%d): sp=%p pc=%08x pr=%08x\n", + current->comm, current->pid, frame, regs->REG_PC, + regs->REG_RTP); + + /* Now pass size of 'new code' into sigtramp so we can do a more + * effective cache flush - directed rather than 'full flush'. + */ + flush_cache_sigtramp(regs->REG_RTP, sizeof(frame->retcode)); + + return 0; +} + +static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset_t *oldset = sigmask_to_save(); + int ret; + + /* Set up the stack frame */ + ret = setup_rt_frame(ksig, oldset, regs); + + signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP)); +} + + /* + * Notes for Meta. + * We have moved from the old 2.4.9 SH way of using syscall_nr (in the stored + * context) to passing in the syscall flag on the stack. + * This is because having syscall_nr in our context does not fit with TBX, and + * corrupted the stack. + */ +static int do_signal(struct pt_regs *regs, int syscall) +{ + unsigned int retval = 0, continue_addr = 0, restart_addr = 0; + int restart = 0; + struct ksignal ksig; + + /* + * By the end of rt_sigreturn the context describes the point that the + * signal was taken (which may happen to be just before a syscall if + * it's already been restarted). This should *never* be mistaken for a + * system call in need of restarting. + */ + if (syscall == __NR_rt_sigreturn) + syscall = -1; + + /* Did we come from a system call? */ + if (syscall >= 0) { + continue_addr = regs->REG_PC; + restart_addr = continue_addr - 4; + retval = regs->REG_RETVAL; + + /* + * Prepare for system call restart. We do this here so that a + * debugger will see the already changed PC. + */ + switch (retval) { + case -ERESTART_RESTARTBLOCK: + restart = -2; + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + ++restart; + regs->REG_PC = restart_addr; + break; + } + } + + /* + * Get the signal to deliver. 
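The two-word retcode[] trampoline built by setup_rt_frame() above can be checked as plain arithmetic. In the sketch below both constants are placeholders (the real syscall number and SWITCH encoding come from the metag headers); only the base word 0x03000004 and the immediate's position at bit 3 are taken from the patch:

#include <stdio.h>

#define NR_RT_SIGRETURN 173             /* placeholder syscall number */
#define SW_SYS_WORD     0xaf000000u     /* placeholder SWITCH encoding */

int main(void)
{
        unsigned int retcode[2];

        /* MOV D1Re0 (D1.0), #NR: immediate field starts at bit 3 */
        retcode[0] = 0x03000004u | (NR_RT_SIGRETURN << 3);
        /* SWITCH #__METAG_SW_SYS: traps back into the kernel */
        retcode[1] = SW_SYS_WORD;

        printf("%08x %08x\n", retcode[0], retcode[1]);
        return 0;
}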
When running under ptrace, at this point + * the debugger may change all our registers ... + */ + get_signal(&ksig); + + /* + * Depending on the signal settings we may need to revert the decision + * to restart the system call. But skip this if a debugger has chosen to + * restart at a different PC. + */ + if (regs->REG_PC != restart_addr) + restart = 0; + if (ksig.sig > 0) { + if (unlikely(restart)) { + if (retval == -ERESTARTNOHAND + || retval == -ERESTART_RESTARTBLOCK + || (retval == -ERESTARTSYS + && !(ksig.ka.sa.sa_flags & SA_RESTART))) { + regs->REG_RETVAL = -EINTR; + regs->REG_PC = continue_addr; + } + } + + /* Whee! Actually deliver the signal. */ + handle_signal(&ksig, regs); + return 0; + } + + /* Handlerless -ERESTART_RESTARTBLOCK re-enters via restart_syscall */ + if (unlikely(restart < 0)) + regs->REG_SYSCALL = __NR_restart_syscall; + + /* + * If there's no signal to deliver, we just put the saved sigmask back. + */ + restore_saved_sigmask(); + + return restart; +} + +int do_work_pending(struct pt_regs *regs, unsigned int thread_flags, + int syscall) +{ + do { + if (likely(thread_flags & _TIF_NEED_RESCHED)) { + schedule(); + } else { + if (unlikely(!user_mode(regs))) + return 0; + local_irq_enable(); + if (thread_flags & _TIF_SIGPENDING) { + int restart = do_signal(regs, syscall); + if (unlikely(restart)) { + /* + * Restart without handlers. + * Deal with it without leaving + * the kernel space. + */ + return restart; + } + syscall = -1; + } else { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + } + local_irq_disable(); + thread_flags = current_thread_info()->flags; + } while (thread_flags & _TIF_WORK_MASK); + return 0; +} diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c new file mode 100644 index 000000000..ac3a199e3 --- /dev/null +++ b/arch/metag/kernel/smp.c @@ -0,0 +1,665 @@ +/* + * Copyright (C) 2009,2010,2011 Imagination Technologies Ltd. + * + * Copyright (C) 2002 ARM Limited, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SYSC_DCPART(n) (SYSC_DCPART0 + SYSC_xCPARTn_STRIDE * (n)) +#define SYSC_ICPART(n) (SYSC_ICPART0 + SYSC_xCPARTn_STRIDE * (n)) + +DECLARE_PER_CPU(PTBI, pTBI); + +void *secondary_data_stack; + +/* + * structures for inter-processor calls + * - A collection of single bit ipi messages. + */ +struct ipi_data { + spinlock_t lock; + unsigned long ipi_count; + unsigned long bits; +}; + +static DEFINE_PER_CPU(struct ipi_data, ipi_data) = { + .lock = __SPIN_LOCK_UNLOCKED(ipi_data.lock), +}; + +static DEFINE_SPINLOCK(boot_lock); + +static DECLARE_COMPLETION(cpu_running); + +/* + * "thread" is assumed to be a valid Meta hardware thread ID. 
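A note on the shape of do_work_pending() above: it services one kind of work per pass and only returns once a re-read of the flags, taken with interrupts disabled, comes back clean. The same do/while shape reduced to user space (flag values hypothetical):

#include <stdio.h>

#define TIF_NEED_RESCHED (1u << 0)
#define TIF_SIGPENDING   (1u << 1)
#define WORK_MASK        (TIF_NEED_RESCHED | TIF_SIGPENDING)

int main(void)
{
        unsigned int flags = TIF_NEED_RESCHED | TIF_SIGPENDING;

        do {
                if (flags & TIF_NEED_RESCHED) {
                        puts("schedule()");
                        flags &= ~TIF_NEED_RESCHED;
                } else {
                        puts("do_signal()");
                        flags &= ~TIF_SIGPENDING;
                }
        } while (flags & WORK_MASK);    /* re-test before returning */
        return 0;
}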
+ */ +static int boot_secondary(unsigned int thread, struct task_struct *idle) +{ + u32 val; + + /* + * set synchronisation state between this boot processor + * and the secondary one + */ + spin_lock(&boot_lock); + + core_reg_write(TXUPC_ID, 0, thread, (unsigned int)secondary_startup); + core_reg_write(TXUPC_ID, 1, thread, 0); + + /* + * Give the thread privilege (PSTAT) and clear potentially problematic + * bits in the process (namely ISTAT, CBMarker, CBMarkerI, LSM_STEP). + */ + core_reg_write(TXUCT_ID, TXSTATUS_REGNUM, thread, TXSTATUS_PSTAT_BIT); + + /* Clear the minim enable bit. */ + val = core_reg_read(TXUCT_ID, TXPRIVEXT_REGNUM, thread); + core_reg_write(TXUCT_ID, TXPRIVEXT_REGNUM, thread, val & ~0x80); + + /* + * set the ThreadEnable bit (0x1) in the TXENABLE register + * for the specified thread - off it goes! + */ + val = core_reg_read(TXUCT_ID, TXENABLE_REGNUM, thread); + core_reg_write(TXUCT_ID, TXENABLE_REGNUM, thread, val | 0x1); + + /* + * now the secondary core is starting up let it run its + * calibrations, then wait for it to finish + */ + spin_unlock(&boot_lock); + + return 0; +} + +/** + * describe_cachepart_change: describe a change to cache partitions. + * @thread: Hardware thread number. + * @label: Label of cache type, e.g. "dcache" or "icache". + * @sz: Total size of the cache. + * @old: Old cache partition configuration (*CPART* register). + * @new: New cache partition configuration (*CPART* register). + * + * If the cache partition has changed, prints a message to the log describing + * those changes. + */ +static void describe_cachepart_change(unsigned int thread, const char *label, + unsigned int sz, unsigned int old, + unsigned int new) +{ + unsigned int lor1, land1, gor1, gand1; + unsigned int lor2, land2, gor2, gand2; + unsigned int diff = old ^ new; + + if (!diff) + return; + + pr_info("Thread %d: %s partition changed:", thread, label); + if (diff & (SYSC_xCPARTL_OR_BITS | SYSC_xCPARTL_AND_BITS)) { + lor1 = (old & SYSC_xCPARTL_OR_BITS) >> SYSC_xCPARTL_OR_S; + lor2 = (new & SYSC_xCPARTL_OR_BITS) >> SYSC_xCPARTL_OR_S; + land1 = (old & SYSC_xCPARTL_AND_BITS) >> SYSC_xCPARTL_AND_S; + land2 = (new & SYSC_xCPARTL_AND_BITS) >> SYSC_xCPARTL_AND_S; + pr_cont(" L:%#x+%#x->%#x+%#x", + (lor1 * sz) >> 4, + ((land1 + 1) * sz) >> 4, + (lor2 * sz) >> 4, + ((land2 + 1) * sz) >> 4); + } + if (diff & (SYSC_xCPARTG_OR_BITS | SYSC_xCPARTG_AND_BITS)) { + gor1 = (old & SYSC_xCPARTG_OR_BITS) >> SYSC_xCPARTG_OR_S; + gor2 = (new & SYSC_xCPARTG_OR_BITS) >> SYSC_xCPARTG_OR_S; + gand1 = (old & SYSC_xCPARTG_AND_BITS) >> SYSC_xCPARTG_AND_S; + gand2 = (new & SYSC_xCPARTG_AND_BITS) >> SYSC_xCPARTG_AND_S; + pr_cont(" G:%#x+%#x->%#x+%#x", + (gor1 * sz) >> 4, + ((gand1 + 1) * sz) >> 4, + (gor2 * sz) >> 4, + ((gand2 + 1) * sz) >> 4); + } + if (diff & SYSC_CWRMODE_BIT) + pr_cont(" %sWR", + (new & SYSC_CWRMODE_BIT) ? "+" : "-"); + if (diff & SYSC_DCPART_GCON_BIT) + pr_cont(" %sGCOn", + (new & SYSC_DCPART_GCON_BIT) ? "+" : "-"); + pr_cont("\n"); +} + +/** + * setup_smp_cache: ensure cache coherency for new SMP thread. + * @thread: New hardware thread number. + * + * Ensures that coherency is enabled and that the threads share the same cache + * partitions. + */ +static void setup_smp_cache(unsigned int thread) +{ + unsigned int this_thread, lflags; + unsigned int dcsz, dcpart_this, dcpart_old, dcpart_new; + unsigned int icsz, icpart_old, icpart_new; + + /* + * Copy over the current thread's cache partition configuration to the + * new thread so that they share cache partitions. 
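The partition fields decoded by describe_cachepart_change() above count sixteenths of the cache: the AND field encodes the partition size minus one, the OR field its starting offset. A worked example with an assumed 32 KiB cache and hand-picked field values:

#include <stdio.h>

int main(void)
{
        unsigned int cache_sz = 32768;  /* assume a 32 KiB cache */
        unsigned int or_field = 8;      /* offset: 8/16 of the way in */
        unsigned int and_field = 7;     /* size: (7 + 1)/16 of the cache */

        printf("offset %u bytes, size %u bytes\n",
               (or_field * cache_sz) >> 4,              /* 16384 */
               ((and_field + 1) * cache_sz) >> 4);      /* 16384 */
        return 0;
}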
+ */ + __global_lock2(lflags); + this_thread = hard_processor_id(); + /* Share dcache partition */ + dcpart_this = metag_in32(SYSC_DCPART(this_thread)); + dcpart_old = metag_in32(SYSC_DCPART(thread)); + dcpart_new = dcpart_this; +#if PAGE_OFFSET < LINGLOBAL_BASE + /* + * For the local data cache to be coherent the threads must also have + * GCOn enabled. + */ + dcpart_new |= SYSC_DCPART_GCON_BIT; + metag_out32(dcpart_new, SYSC_DCPART(this_thread)); +#endif + metag_out32(dcpart_new, SYSC_DCPART(thread)); + /* Share icache partition too */ + icpart_new = metag_in32(SYSC_ICPART(this_thread)); + icpart_old = metag_in32(SYSC_ICPART(thread)); + metag_out32(icpart_new, SYSC_ICPART(thread)); + __global_unlock2(lflags); + + /* + * Log if the cache partitions were altered so the user is aware of any + * potential unintentional cache wastage. + */ + dcsz = get_dcache_size(); + icsz = get_dcache_size(); + describe_cachepart_change(this_thread, "dcache", dcsz, + dcpart_this, dcpart_new); + describe_cachepart_change(thread, "dcache", dcsz, + dcpart_old, dcpart_new); + describe_cachepart_change(thread, "icache", icsz, + icpart_old, icpart_new); +} + +int __cpu_up(unsigned int cpu, struct task_struct *idle) +{ + unsigned int thread = cpu_2_hwthread_id[cpu]; + int ret; + + load_pgd(swapper_pg_dir, thread); + + flush_tlb_all(); + + setup_smp_cache(thread); + + /* + * Tell the secondary CPU where to find its idle thread's stack. + */ + secondary_data_stack = task_stack_page(idle); + + wmb(); + + /* + * Now bring the CPU into our world. + */ + ret = boot_secondary(thread, idle); + if (ret == 0) { + /* + * CPU was successfully started, wait for it + * to come online or time out. + */ + wait_for_completion_timeout(&cpu_running, + msecs_to_jiffies(1000)); + + if (!cpu_online(cpu)) + ret = -EIO; + } + + secondary_data_stack = NULL; + + if (ret) { + pr_crit("CPU%u: processor failed to boot\n", cpu); + + /* + * FIXME: We need to clean up the new idle thread. --rmk + */ + } + + return ret; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * __cpu_disable runs on the processor to be shutdown. + */ +int __cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + + /* + * Take this CPU offline. Once we clear this, we can't return, + * and we must not schedule until we're ready to give up the cpu. + */ + set_cpu_online(cpu, false); + + /* + * OK - migrate IRQs away from this CPU + */ + migrate_irqs(); + + /* + * Flush user cache and TLB mappings, and then remove this CPU + * from the vm mask set of all processes. + */ + flush_cache_all(); + local_flush_tlb_all(); + + clear_tasks_mm_cpumask(cpu); + + return 0; +} + +/* + * called on the thread which is asking for a CPU to be shutdown - + * waits until shutdown has completed, or it is timed out. + */ +void __cpu_die(unsigned int cpu) +{ + if (!cpu_wait_death(cpu, 1)) + pr_err("CPU%u: unable to kill\n", cpu); +} + +/* + * Called from the idle thread for the CPU which has been shutdown. + * + * Note that we do not return from this function. If this cpu is + * brought online again it will need to run secondary_startup(). + */ +void cpu_die(void) +{ + local_irq_disable(); + idle_task_exit(); + + (void)cpu_report_death(); + + asm ("XOR TXENABLE, D0Re0,D0Re0\n"); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * Called by both boot and secondaries to move global data into + * per-processor storage. 
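The __cpu_up()/secondary_start_kernel() handshake above is the classic kick-then-wait pattern: start the secondary, then block on a completion it signals once online. A POSIX-threads miniature (compile with -lpthread); the kernel version additionally bounds the wait with a one-second timeout:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int online;

static void *secondary(void *arg)
{
        (void)arg;
        /* ...the secondary's calibration and setup happen here... */
        pthread_mutex_lock(&lock);
        online = 1;                     /* cf. set_cpu_online(cpu, true) */
        pthread_cond_signal(&cond);     /* cf. complete(&cpu_running) */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, secondary, NULL);      /* cf. boot_secondary() */
        pthread_mutex_lock(&lock);
        while (!online)
                pthread_cond_wait(&cond, &lock);        /* cf. wait_for_completion_timeout() */
        pthread_mutex_unlock(&lock);
        puts("secondary online");
        pthread_join(t, NULL);
        return 0;
}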
+ */ +void smp_store_cpu_info(unsigned int cpuid) +{ + struct cpuinfo_metag *cpu_info = &per_cpu(cpu_data, cpuid); + + cpu_info->loops_per_jiffy = loops_per_jiffy; +} + +/* + * This is the secondary CPU boot entry. We're using this CPUs + * idle thread stack and the global page tables. + */ +asmlinkage void secondary_start_kernel(void) +{ + struct mm_struct *mm = &init_mm; + unsigned int cpu = smp_processor_id(); + + /* + * All kernel threads share the same mm context; grab a + * reference and switch to it. + */ + atomic_inc(&mm->mm_users); + atomic_inc(&mm->mm_count); + current->active_mm = mm; + cpumask_set_cpu(cpu, mm_cpumask(mm)); + enter_lazy_tlb(mm, current); + local_flush_tlb_all(); + + /* + * TODO: Some day it might be useful for each Linux CPU to + * have its own TBI structure. That would allow each Linux CPU + * to run different interrupt handlers for the same IRQ + * number. + * + * For now, simply copying the pointer to the boot CPU's TBI + * structure is sufficient because we always want to run the + * same interrupt handler whatever CPU takes the interrupt. + */ + per_cpu(pTBI, cpu) = __TBI(TBID_ISTAT_BIT); + + if (!per_cpu(pTBI, cpu)) + panic("No TBI found!"); + + per_cpu_trap_init(cpu); + + preempt_disable(); + + setup_priv(); + + notify_cpu_starting(cpu); + + pr_info("CPU%u (thread %u): Booted secondary processor\n", + cpu, cpu_2_hwthread_id[cpu]); + + calibrate_delay(); + smp_store_cpu_info(cpu); + + /* + * OK, now it's safe to let the boot CPU continue + */ + set_cpu_online(cpu, true); + complete(&cpu_running); + + /* + * Enable local interrupts. + */ + tbi_startup_interrupt(TBID_SIGNUM_TRT); + local_irq_enable(); + + /* + * OK, it's off to the idle thread for us + */ + cpu_startup_entry(CPUHP_ONLINE); +} + +void __init smp_cpus_done(unsigned int max_cpus) +{ + int cpu; + unsigned long bogosum = 0; + + for_each_online_cpu(cpu) + bogosum += per_cpu(cpu_data, cpu).loops_per_jiffy; + + pr_info("SMP: Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + num_online_cpus(), + bogosum / (500000/HZ), + (bogosum / (5000/HZ)) % 100); +} + +void __init smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned int cpu = smp_processor_id(); + + init_new_context(current, &init_mm); + current_thread_info()->cpu = cpu; + + smp_store_cpu_info(cpu); + init_cpu_present(cpu_possible_mask); +} + +void __init smp_prepare_boot_cpu(void) +{ + unsigned int cpu = smp_processor_id(); + + per_cpu(pTBI, cpu) = __TBI(TBID_ISTAT_BIT); + + if (!per_cpu(pTBI, cpu)) + panic("No TBI found!"); +} + +static void smp_cross_call(cpumask_t callmap, enum ipi_msg_type msg); + +static void send_ipi_message(const struct cpumask *mask, enum ipi_msg_type msg) +{ + unsigned long flags; + unsigned int cpu; + cpumask_t map; + + cpumask_clear(&map); + local_irq_save(flags); + + for_each_cpu(cpu, mask) { + struct ipi_data *ipi = &per_cpu(ipi_data, cpu); + + spin_lock(&ipi->lock); + + /* + * KICK interrupts are queued in hardware so we'll get + * multiple interrupts if we call smp_cross_call() + * multiple times for one msg. The problem is that we + * only have one bit for each message - we can't queue + * them in software. + * + * The first time through ipi_handler() we'll clear + * the msg bit, having done all the work. But when we + * return we'll get _another_ interrupt (and another, + * and another until we've handled all the queued + * KICKs). Running ipi_handler() when there's no work + * to do is bad because that's how kick handler + * chaining detects who the KICK was intended for. 
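The single-bit queueing just described can be shown in isolation: a message is only queued, and a KICK worth raising, when its bit was previously clear. Stand-alone sketch:

#include <stdio.h>

enum ipi_msg { IPI_RESCHEDULE, IPI_CALL_FUNC };

static unsigned long ipi_bits;

/* returns 1 if the caller should raise a KICK, 0 if one is already pending */
static int queue_ipi(enum ipi_msg msg)
{
        if (ipi_bits & (1UL << msg))
                return 0;
        ipi_bits |= 1UL << msg;
        return 1;
}

int main(void)
{
        int a = queue_ipi(IPI_CALL_FUNC);
        int b = queue_ipi(IPI_CALL_FUNC);       /* duplicate: not re-queued */
        int c = queue_ipi(IPI_RESCHEDULE);

        printf("%d %d %d\n", a, b, c);          /* 1 0 1 */
        return 0;
}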
+ * See arch/metag/kernel/kick.c for more details. + * + * So only add 'cpu' to 'map' if we haven't already + * queued a KICK interrupt for 'msg'. + */ + if (!(ipi->bits & (1 << msg))) { + ipi->bits |= 1 << msg; + cpumask_set_cpu(cpu, &map); + } + + spin_unlock(&ipi->lock); + } + + /* + * Call the platform specific cross-CPU call function. + */ + smp_cross_call(map, msg); + + local_irq_restore(flags); +} + +void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + send_ipi_message(mask, IPI_CALL_FUNC); +} + +void arch_send_call_function_single_ipi(int cpu) +{ + send_ipi_message(cpumask_of(cpu), IPI_CALL_FUNC); +} + +void show_ipi_list(struct seq_file *p) +{ + unsigned int cpu; + + seq_puts(p, "IPI:"); + + for_each_present_cpu(cpu) + seq_printf(p, " %10lu", per_cpu(ipi_data, cpu).ipi_count); + + seq_putc(p, '\n'); +} + +static DEFINE_SPINLOCK(stop_lock); + +/* + * Main handler for inter-processor interrupts + * + * For Meta, the ipimask now only identifies a single + * category of IPI (Bit 1 IPIs have been replaced by a + * different mechanism): + * + * Bit 0 - Inter-processor function call + */ +static int do_IPI(void) +{ + unsigned int cpu = smp_processor_id(); + struct ipi_data *ipi = &per_cpu(ipi_data, cpu); + unsigned long msgs, nextmsg; + int handled = 0; + + ipi->ipi_count++; + + spin_lock(&ipi->lock); + msgs = ipi->bits; + nextmsg = msgs & -msgs; + ipi->bits &= ~nextmsg; + spin_unlock(&ipi->lock); + + if (nextmsg) { + handled = 1; + + nextmsg = ffz(~nextmsg); + switch (nextmsg) { + case IPI_RESCHEDULE: + scheduler_ipi(); + break; + + case IPI_CALL_FUNC: + generic_smp_call_function_interrupt(); + break; + + default: + pr_crit("CPU%u: Unknown IPI message 0x%lx\n", + cpu, nextmsg); + break; + } + } + + return handled; +} + +void smp_send_reschedule(int cpu) +{ + send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE); +} + +static void stop_this_cpu(void *data) +{ + unsigned int cpu = smp_processor_id(); + + if (system_state == SYSTEM_BOOTING || + system_state == SYSTEM_RUNNING) { + spin_lock(&stop_lock); + pr_crit("CPU%u: stopping\n", cpu); + dump_stack(); + spin_unlock(&stop_lock); + } + + set_cpu_online(cpu, false); + + local_irq_disable(); + + hard_processor_halt(HALT_OK); +} + +void smp_send_stop(void) +{ + smp_call_function(stop_this_cpu, NULL, 0); +} + +/* + * not supported here + */ +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * We use KICKs for inter-processor interrupts. + * + * For every CPU in "callmap" the IPI data must already have been + * stored in that CPU's "ipi_data" member prior to calling this + * function. 
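do_IPI() above leans on two bit tricks: msgs & -msgs isolates the lowest set bit, and ffz(~x) (find first zero of the complement) converts it to a bit index. The sketch uses GCC's __builtin_ctzl() as a stand-in for the kernel's ffz(~x) idiom:

#include <stdio.h>

int main(void)
{
        unsigned long msgs = 0xa;               /* bits 1 and 3 pending */
        unsigned long next = msgs & -msgs;      /* lowest set bit: 0x2 */

        /* ffz(~x) == index of lowest set bit of x == ctz(x) */
        printf("next=%#lx index=%d\n", next, __builtin_ctzl(next));
        return 0;
}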
+ */ +static void kick_raise_softirq(cpumask_t callmap, unsigned int irq) +{ + int cpu; + + for_each_cpu(cpu, &callmap) { + unsigned int thread; + + thread = cpu_2_hwthread_id[cpu]; + + BUG_ON(thread == BAD_HWTHREAD_ID); + + metag_out32(1, T0KICKI + (thread * TnXKICK_STRIDE)); + } +} + +static TBIRES ipi_handler(TBIRES State, int SigNum, int Triggers, + int Inst, PTBI pTBI, int *handled) +{ + *handled = do_IPI(); + + return State; +} + +static struct kick_irq_handler ipi_irq = { + .func = ipi_handler, +}; + +static void smp_cross_call(cpumask_t callmap, enum ipi_msg_type msg) +{ + kick_raise_softirq(callmap, 1); +} + +static inline unsigned int get_core_count(void) +{ + int i; + unsigned int ret = 0; + + for (i = 0; i < CONFIG_NR_CPUS; i++) { + if (core_reg_read(TXUCT_ID, TXENABLE_REGNUM, i)) + ret++; + } + + return ret; +} + +/* + * Initialise the CPU possible map early - this describes the CPUs + * which may be present or become present in the system. + */ +void __init smp_init_cpus(void) +{ + unsigned int i, ncores = get_core_count(); + + /* If no hwthread_map early param was set use default mapping */ + for (i = 0; i < NR_CPUS; i++) + if (cpu_2_hwthread_id[i] == BAD_HWTHREAD_ID) { + cpu_2_hwthread_id[i] = i; + hwthread_id_2_cpu[i] = i; + } + + for (i = 0; i < ncores; i++) + set_cpu_possible(i, true); + + kick_register_func(&ipi_irq); +} diff --git a/arch/metag/kernel/stacktrace.c b/arch/metag/kernel/stacktrace.c new file mode 100644 index 000000000..5510361d5 --- /dev/null +++ b/arch/metag/kernel/stacktrace.c @@ -0,0 +1,187 @@ +#include +#include +#include + +#include + +#if defined(CONFIG_FRAME_POINTER) + +#ifdef CONFIG_KALLSYMS +#include +#include + +static unsigned long tbi_boing_addr; +static unsigned long tbi_boing_size; + +static void tbi_boing_init(void) +{ + /* We need to know where TBIBoingVec is and it's size */ + unsigned long size; + unsigned long offset; + char modname[MODULE_NAME_LEN]; + char name[KSYM_NAME_LEN]; + tbi_boing_addr = kallsyms_lookup_name("___TBIBoingVec"); + if (!tbi_boing_addr) + tbi_boing_addr = 1; + else if (!lookup_symbol_attrs(tbi_boing_addr, &size, + &offset, modname, name)) + tbi_boing_size = size; +} +#endif + +#define ALIGN_DOWN(addr, size) ((addr)&(~((size)-1))) + +/* + * Unwind the current stack frame and store the new register values in the + * structure passed as argument. Unwinding is equivalent to a function return, + * hence the new PC value rather than LR should be used for backtrace. + */ +int notrace unwind_frame(struct stackframe *frame) +{ + struct metag_frame *fp = (struct metag_frame *)frame->fp; + unsigned long lr; + unsigned long fpnew; + + if (frame->fp & 0x7) + return -EINVAL; + + fpnew = fp->fp; + lr = fp->lr - 4; + +#ifdef CONFIG_KALLSYMS + /* If we've reached TBIBoingVec then we're at an interrupt + * entry point or a syscall entry point. The frame pointer + * points to a pt_regs which can be used to continue tracing on + * the other side of the boing. 
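unwind_frame() above subtracts 4 from each saved return address to land on the call instruction itself, and requires frame pointers to decrease because the Meta stack grows upwards. A fake three-frame walk showing both points:

#include <stdio.h>

struct metag_frame {
        unsigned long fp;       /* caller's frame pointer */
        unsigned long lr;       /* saved return address */
};

int main(void)
{
        /* older frames live at lower addresses on an upward-growing stack */
        struct metag_frame frames[3] = {
                { 0, 0 },                               /* sentinel */
                { (unsigned long)&frames[0], 0x1004 },
                { (unsigned long)&frames[1], 0x2008 },
        };
        unsigned long fp = (unsigned long)&frames[2];

        while (fp) {
                struct metag_frame *f = (struct metag_frame *)fp;

                if (f->lr)      /* lr - 4 is the call site, not its successor */
                        printf("pc %#lx\n", f->lr - 4);
                fp = f->fp;
        }
        return 0;
}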
+ */ + if (!tbi_boing_addr) + tbi_boing_init(); + if (tbi_boing_size && lr >= tbi_boing_addr && + lr < tbi_boing_addr + tbi_boing_size) { + struct pt_regs *regs = (struct pt_regs *)fpnew; + if (user_mode(regs)) + return -EINVAL; + fpnew = regs->ctx.AX[1].U0; + lr = regs->ctx.DX[4].U1; + } +#endif + + /* stack grows up, so frame pointers must decrease */ + if (fpnew < (ALIGN_DOWN((unsigned long)fp, THREAD_SIZE) + + sizeof(struct thread_info)) || fpnew >= (unsigned long)fp) + return -EINVAL; + + /* restore the registers from the stack frame */ + frame->fp = fpnew; + frame->pc = lr; + + return 0; +} +#else +int notrace unwind_frame(struct stackframe *frame) +{ + struct metag_frame *sp = (struct metag_frame *)frame->sp; + + if (frame->sp & 0x7) + return -EINVAL; + + while (!kstack_end(sp)) { + unsigned long addr = sp->lr - 4; + sp--; + + if (__kernel_text_address(addr)) { + frame->sp = (unsigned long)sp; + frame->pc = addr; + return 0; + } + } + return -EINVAL; +} +#endif + +void notrace walk_stackframe(struct stackframe *frame, + int (*fn)(struct stackframe *, void *), void *data) +{ + while (1) { + int ret; + + if (fn(frame, data)) + break; + ret = unwind_frame(frame); + if (ret < 0) + break; + } +} +EXPORT_SYMBOL(walk_stackframe); + +#ifdef CONFIG_STACKTRACE +struct stack_trace_data { + struct stack_trace *trace; + unsigned int no_sched_functions; + unsigned int skip; +}; + +static int save_trace(struct stackframe *frame, void *d) +{ + struct stack_trace_data *data = d; + struct stack_trace *trace = data->trace; + unsigned long addr = frame->pc; + + if (data->no_sched_functions && in_sched_functions(addr)) + return 0; + if (data->skip) { + data->skip--; + return 0; + } + + trace->entries[trace->nr_entries++] = addr; + + return trace->nr_entries >= trace->max_entries; +} + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + struct stack_trace_data data; + struct stackframe frame; + + data.trace = trace; + data.skip = trace->skip; + + if (tsk != current) { +#ifdef CONFIG_SMP + /* + * What guarantees do we have here that 'tsk' is not + * running on another CPU? For now, ignore it as we + * can't guarantee we won't explode. + */ + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; + return; +#else + data.no_sched_functions = 1; + frame.fp = thread_saved_fp(tsk); + frame.sp = thread_saved_sp(tsk); + frame.lr = 0; /* recovered from the stack */ + frame.pc = thread_saved_pc(tsk); +#endif + } else { + register unsigned long current_sp asm ("A0StP"); + + data.no_sched_functions = 0; + frame.fp = (unsigned long)__builtin_frame_address(0); + frame.sp = current_sp; + frame.lr = (unsigned long)__builtin_return_address(0); + frame.pc = (unsigned long)save_stack_trace_tsk; + } + + walk_stackframe(&frame, save_trace, &data); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + +void save_stack_trace(struct stack_trace *trace) +{ + save_stack_trace_tsk(current, trace); +} +EXPORT_SYMBOL_GPL(save_stack_trace); +#endif diff --git a/arch/metag/kernel/sys_metag.c b/arch/metag/kernel/sys_metag.c new file mode 100644 index 000000000..efe833a45 --- /dev/null +++ b/arch/metag/kernel/sys_metag.c @@ -0,0 +1,180 @@ +/* + * This file contains various random system calls that + * have a non-standard calling sequence on the Linux/Meta + * platform. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define merge_64(hi, lo) ((((unsigned long long)(hi)) << 32) + \ + ((lo) & 0xffffffffUL)) + +int metag_mmap_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + /* We can't have people trying to write to the bottom of the + * memory map, there are mysterious unspecified things there that + * we don't want people trampling on. + */ + if ((flags & MAP_FIXED) && (addr < TASK_UNMAPPED_BASE)) + return -EINVAL; + + return 0; +} + +asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +{ + /* The shift for mmap2 is constant, regardless of PAGE_SIZE setting. */ + if (pgoff & ((1 << (PAGE_SHIFT - 12)) - 1)) + return -EINVAL; + + pgoff >>= PAGE_SHIFT - 12; + + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); +} + +asmlinkage int sys_metag_setglobalbit(char __user *addr, int mask) +{ + char tmp; + int ret = 0; + unsigned int flags; + + if (!((__force unsigned int)addr >= LINCORE_BASE)) + return -EFAULT; + + __global_lock2(flags); + + metag_data_cache_flush((__force void *)addr, sizeof(mask)); + + ret = __get_user(tmp, addr); + if (ret) + goto out; + tmp |= mask; + ret = __put_user(tmp, addr); + + metag_data_cache_flush((__force void *)addr, sizeof(mask)); + +out: + __global_unlock2(flags); + + return ret; +} + +#define TXDEFR_FPU_MASK ((0x1f << 16) | 0x1f) + +asmlinkage void sys_metag_set_fpu_flags(unsigned int flags) +{ + unsigned int temp; + + flags &= TXDEFR_FPU_MASK; + + temp = __core_reg_get(TXDEFR); + temp &= ~TXDEFR_FPU_MASK; + temp |= flags; + __core_reg_set(TXDEFR, temp); +} + +asmlinkage int sys_metag_set_tls(void __user *ptr) +{ + current->thread.tls_ptr = ptr; + set_gateway_tls(ptr); + + return 0; +} + +asmlinkage void *sys_metag_get_tls(void) +{ + return (__force void *)current->thread.tls_ptr; +} + +asmlinkage long sys_truncate64_metag(const char __user *path, unsigned long lo, + unsigned long hi) +{ + return sys_truncate64(path, merge_64(hi, lo)); +} + +asmlinkage long sys_ftruncate64_metag(unsigned int fd, unsigned long lo, + unsigned long hi) +{ + return sys_ftruncate64(fd, merge_64(hi, lo)); +} + +asmlinkage long sys_fadvise64_64_metag(int fd, unsigned long offs_lo, + unsigned long offs_hi, + unsigned long len_lo, + unsigned long len_hi, int advice) +{ + return sys_fadvise64_64(fd, merge_64(offs_hi, offs_lo), + merge_64(len_hi, len_lo), advice); +} + +asmlinkage long sys_readahead_metag(int fd, unsigned long lo, unsigned long hi, + size_t count) +{ + return sys_readahead(fd, merge_64(hi, lo), count); +} + +asmlinkage ssize_t sys_pread64_metag(unsigned long fd, char __user *buf, + size_t count, unsigned long lo, + unsigned long hi) +{ + return sys_pread64(fd, buf, count, merge_64(hi, lo)); +} + +asmlinkage ssize_t sys_pwrite64_metag(unsigned long fd, char __user *buf, + size_t count, unsigned long lo, + unsigned long hi) +{ + return sys_pwrite64(fd, buf, count, merge_64(hi, lo)); +} + +asmlinkage long sys_sync_file_range_metag(int fd, unsigned long offs_lo, + unsigned long offs_hi, + unsigned long len_lo, + unsigned long len_hi, + unsigned int flags) +{ + return sys_sync_file_range(fd, merge_64(offs_hi, offs_lo), + merge_64(len_hi, len_lo), flags); +} + +/* Provide the actual syscall number to call mapping. 
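merge_64() above reassembles a 64-bit syscall argument from the two 32-bit registers it was split across. Trivial, but easy to get backwards, so one worked value:

#include <stdio.h>

#define merge_64(hi, lo) ((((unsigned long long)(hi)) << 32) + \
                          ((lo) & 0xffffffffUL))

int main(void)
{
        /* e.g. a 6 GiB file offset arriving as two halves */
        unsigned long hi = 0x1, lo = 0x80000000UL;

        printf("%#llx\n", merge_64(hi, lo));    /* 0x180000000 */
        return 0;
}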
*/ +#undef __SYSCALL +#define __SYSCALL(nr, call) [nr] = (call), + +/* + * We need wrappers for anything with unaligned 64bit arguments + */ +#define sys_truncate64 sys_truncate64_metag +#define sys_ftruncate64 sys_ftruncate64_metag +#define sys_fadvise64_64 sys_fadvise64_64_metag +#define sys_readahead sys_readahead_metag +#define sys_pread64 sys_pread64_metag +#define sys_pwrite64 sys_pwrite64_metag +#define sys_sync_file_range sys_sync_file_range_metag + +/* + * Note that we can't include here since the header + * guard will defeat us; checks for __SYSCALL as well. + */ +const void *sys_call_table[__NR_syscalls] = { + [0 ... __NR_syscalls-1] = sys_ni_syscall, +#include +}; diff --git a/arch/metag/kernel/tbiunexp.S b/arch/metag/kernel/tbiunexp.S new file mode 100644 index 000000000..907bbe0b2 --- /dev/null +++ b/arch/metag/kernel/tbiunexp.S @@ -0,0 +1,22 @@ +/* Pass a breakpoint through to Codescape */ + +#include + + .text + .global ___TBIUnExpXXX + .type ___TBIUnExpXXX,function +___TBIUnExpXXX: + TSTT D0Ar2,#TBICTX_CRIT_BIT ! Result of nestable int call? + BZ $LTBINormCase ! UnExpXXX at background level + MOV D0Re0,TXMASKI ! Read TXMASKI + XOR TXMASKI,D1Re0,D1Re0 ! Turn off BGNDHALT handling! + OR D0Ar2,D0Ar2,D0Re0 ! Preserve bits cleared +$LTBINormCase: + MSETL [A0StP],D0Ar6,D0Ar4,D0Ar2 ! Save args on stack + SETL [A0StP++],D0Ar2,D1Ar1 ! Init area for returned values + SWITCH #0xC20208 ! Total stack frame size 8 Dwords + ! write back size 2 Dwords + GETL D0Re0,D1Re0,[--A0StP] ! Get result + SUB A0StP,A0StP,#(8*3) ! Recover stack frame + MOV PC,D1RtP + .size ___TBIUnExpXXX,.-___TBIUnExpXXX diff --git a/arch/metag/kernel/tcm.c b/arch/metag/kernel/tcm.c new file mode 100644 index 000000000..5d102b31c --- /dev/null +++ b/arch/metag/kernel/tcm.c @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2010 Imagination Technologies Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct tcm_pool { + struct list_head list; + unsigned int tag; + unsigned long start; + unsigned long end; + struct gen_pool *pool; +}; + +static LIST_HEAD(pool_list); + +static struct tcm_pool *find_pool(unsigned int tag) +{ + struct list_head *lh; + struct tcm_pool *pool; + + list_for_each(lh, &pool_list) { + pool = list_entry(lh, struct tcm_pool, list); + if (pool->tag == tag) + return pool; + } + + return NULL; +} + +/** + * tcm_alloc - allocate memory from a TCM pool + * @tag: tag of the pool to allocate memory from + * @len: number of bytes to be allocated + * + * Allocate the requested number of bytes from the pool matching + * the specified tag. Returns the address of the allocated memory + * or zero on failure. + */ +unsigned long tcm_alloc(unsigned int tag, size_t len) +{ + unsigned long vaddr; + struct tcm_pool *pool; + + pool = find_pool(tag); + if (!pool) + return 0; + + vaddr = gen_pool_alloc(pool->pool, len); + if (!vaddr) + return 0; + + return vaddr; +} + +/** + * tcm_free - free a block of memory to a TCM pool + * @tag: tag of the pool to free memory to + * @addr: address of the memory to be freed + * @len: number of bytes to be freed + * + * Free the requested number of bytes at a specific address to the + * pool matching the specified tag. 
+ */ +void tcm_free(unsigned int tag, unsigned long addr, size_t len) +{ + struct tcm_pool *pool; + + pool = find_pool(tag); + if (!pool) + return; + gen_pool_free(pool->pool, addr, len); +} + +/** + * tcm_lookup_tag - find the tag matching an address + * @p: memory address to lookup the tag for + * + * Find the tag of the tcm memory region that contains the + * specified address. Returns %TCM_INVALID_TAG if no such + * memory region could be found. + */ +unsigned int tcm_lookup_tag(unsigned long p) +{ + struct list_head *lh; + struct tcm_pool *pool; + unsigned long addr = (unsigned long) p; + + list_for_each(lh, &pool_list) { + pool = list_entry(lh, struct tcm_pool, list); + if (addr >= pool->start && addr < pool->end) + return pool->tag; + } + + return TCM_INVALID_TAG; +} + +/** + * tcm_add_region - add a memory region to TCM pool list + * @reg: descriptor of region to be added + * + * Add a region of memory to the TCM pool list. Returns 0 on success. + */ +int __init tcm_add_region(struct tcm_region *reg) +{ + struct tcm_pool *pool; + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) { + pr_err("Failed to alloc memory for TCM pool!\n"); + return -ENOMEM; + } + + pool->tag = reg->tag; + pool->start = reg->res.start; + pool->end = reg->res.end; + + /* + * 2^3 = 8 bytes granularity to allow for 64bit access alignment. + * -1 = NUMA node specifier. + */ + pool->pool = gen_pool_create(3, -1); + + if (!pool->pool) { + pr_err("Failed to create TCM pool!\n"); + kfree(pool); + return -ENOMEM; + } + + if (gen_pool_add(pool->pool, reg->res.start, + reg->res.end - reg->res.start + 1, -1)) { + pr_err("Failed to add memory to TCM pool!\n"); + return -ENOMEM; + } + pr_info("Added %s TCM pool (%08x bytes @ %08x)\n", + reg->res.name, reg->res.end - reg->res.start + 1, + reg->res.start); + + list_add_tail(&pool->list, &pool_list); + + return 0; +} diff --git a/arch/metag/kernel/time.c b/arch/metag/kernel/time.c new file mode 100644 index 000000000..f1c8c53da --- /dev/null +++ b/arch/metag/kernel/time.c @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2005-2013 Imagination Technologies Ltd. + * + * This file contains the Meta-specific time handling details. + * + */ + +#include +#include +#include +#include + +void __init time_init(void) +{ +#ifdef CONFIG_COMMON_CLK + /* Init clocks from device tree */ + of_clk_init(NULL); +#endif + + /* Init meta clocks, particularly the core clock */ + init_metag_clocks(); + + /* Set up the timer clock sources */ + metag_generic_timer_init(); +} diff --git a/arch/metag/kernel/topology.c b/arch/metag/kernel/topology.c new file mode 100644 index 000000000..4ba595701 --- /dev/null +++ b/arch/metag/kernel/topology.c @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2007 Paul Mundt + * Copyright (C) 2010 Imagination Technolohies Ltd. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. 
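Before leaving the TCM code above: tcm_lookup_tag() is a linear scan of half-open [start, end) ranges. A stand-alone rendering with invented addresses and tags:

#include <stdio.h>

struct pool { unsigned int tag; unsigned long start, end; };

static const struct pool pools[] = {
        { 1, 0x80000000UL, 0x80008000UL },
        { 2, 0x82000000UL, 0x82004000UL },
};

static unsigned int lookup_tag(unsigned long addr)
{
        unsigned int i;

        for (i = 0; i < sizeof(pools) / sizeof(pools[0]); i++)
                if (addr >= pools[i].start && addr < pools[i].end)
                        return pools[i].tag;
        return 0;       /* cf. TCM_INVALID_TAG */
}

int main(void)
{
        printf("%u %u\n", lookup_tag(0x80000100UL), lookup_tag(0x90000000UL));
        return 0;
}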
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_PER_CPU(struct cpuinfo_metag, cpu_data); + +cpumask_t cpu_core_map[NR_CPUS]; +EXPORT_SYMBOL(cpu_core_map); + +static cpumask_t cpu_coregroup_map(unsigned int cpu) +{ + return *cpu_possible_mask; +} + +const struct cpumask *cpu_coregroup_mask(unsigned int cpu) +{ + return &cpu_core_map[cpu]; +} + +int arch_update_cpu_topology(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + cpu_core_map[cpu] = cpu_coregroup_map(cpu); + + return 0; +} + +static int __init topology_init(void) +{ + int i, ret; + +#ifdef CONFIG_NEED_MULTIPLE_NODES + for_each_online_node(i) + register_one_node(i); +#endif + + for_each_present_cpu(i) { + struct cpuinfo_metag *cpuinfo = &per_cpu(cpu_data, i); +#ifdef CONFIG_HOTPLUG_CPU + cpuinfo->cpu.hotpluggable = 1; +#endif + ret = register_cpu(&cpuinfo->cpu, i); + if (unlikely(ret)) + pr_warn("%s: register_cpu %d failed (%d)\n", + __func__, i, ret); + } + +#if defined(CONFIG_NUMA) && !defined(CONFIG_SMP) + /* + * In the UP case, make sure the CPU association is still + * registered under each node. Without this, sysfs fails + * to make the connection between nodes other than node0 + * and cpu0. + */ + for_each_online_node(i) + if (i != numa_node_id()) + register_cpu_under_node(raw_smp_processor_id(), i); +#endif + + return 0; +} +subsys_initcall(topology_init); diff --git a/arch/metag/kernel/traps.c b/arch/metag/kernel/traps.c new file mode 100644 index 000000000..17b2e2e38 --- /dev/null +++ b/arch/metag/kernel/traps.c @@ -0,0 +1,989 @@ +/* + * Meta exception handling. + * + * Copyright (C) 2005,2006,2007,2008,2009,2012 Imagination Technologies Ltd. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Passing syscall arguments as long long is quicker. */ +typedef unsigned int (*LPSYSCALL) (unsigned long long, + unsigned long long, + unsigned long long); + +/* + * Users of LNKSET should compare the bus error bits obtained from DEFR + * against TXDEFR_LNKSET_SUCCESS only as the failure code will vary between + * different cores revisions. + */ +#define TXDEFR_LNKSET_SUCCESS 0x02000000 +#define TXDEFR_LNKSET_FAILURE 0x04000000 + +/* + * Our global TBI handle. Initialised from setup.c/setup_arch. 
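The TXDEFR comment above says to test only for the success pattern, since failure encodings vary between core revisions; head_end() later in this file does exactly that. A minimal predicate in that spirit:

#include <stdio.h>

#define TXDEFR_LNKSET_SUCCESS 0x02000000        /* value from the patch */

static int lnkset_succeeded(unsigned int txdefr)
{
        /* test the success bit only; never match on a failure code */
        return !!(txdefr & TXDEFR_LNKSET_SUCCESS);
}

int main(void)
{
        printf("%d %d\n", lnkset_succeeded(0x02000000),
               lnkset_succeeded(0x04000000));   /* 1 0 */
        return 0;
}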
+ */ +DECLARE_PER_CPU(PTBI, pTBI); + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(unsigned int, trigger_mask); +#else +unsigned int global_trigger_mask; +EXPORT_SYMBOL(global_trigger_mask); +#endif + +unsigned long per_cpu__stack_save[NR_CPUS]; + +static const char * const trap_names[] = { + [TBIXXF_SIGNUM_IIF] = "Illegal instruction fault", + [TBIXXF_SIGNUM_PGF] = "Privilege violation", + [TBIXXF_SIGNUM_DHF] = "Unaligned data access fault", + [TBIXXF_SIGNUM_IGF] = "Code fetch general read failure", + [TBIXXF_SIGNUM_DGF] = "Data access general read/write fault", + [TBIXXF_SIGNUM_IPF] = "Code fetch page fault", + [TBIXXF_SIGNUM_DPF] = "Data access page fault", + [TBIXXF_SIGNUM_IHF] = "Instruction breakpoint", + [TBIXXF_SIGNUM_DWF] = "Read-only data access fault", +}; + +const char *trap_name(int trapno) +{ + if (trapno >= 0 && trapno < ARRAY_SIZE(trap_names) + && trap_names[trapno]) + return trap_names[trapno]; + return "Unknown fault"; +} + +static DEFINE_SPINLOCK(die_lock); + +void __noreturn die(const char *str, struct pt_regs *regs, + long err, unsigned long addr) +{ + static int die_counter; + + oops_enter(); + + spin_lock_irq(&die_lock); + console_verbose(); + bust_spinlocks(1); + pr_err("%s: err %04lx (%s) addr %08lx [#%d]\n", str, err & 0xffff, + trap_name(err & 0xffff), addr, ++die_counter); + + print_modules(); + show_regs(regs); + + pr_err("Process: %s (pid: %d, stack limit = %p)\n", current->comm, + task_pid_nr(current), task_stack_page(current) + THREAD_SIZE); + + bust_spinlocks(0); + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + if (kexec_should_crash(current)) + crash_kexec(regs); + + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) + panic("Fatal exception"); + + spin_unlock_irq(&die_lock); + oops_exit(); + do_exit(SIGSEGV); +} + +#ifdef CONFIG_METAG_DSP +/* + * The ECH encoding specifies the size of a DSPRAM as, + * + * "slots" / 4 + * + * A "slot" is the size of two DSPRAM bank entries; an entry from + * DSPRAM bank A and an entry from DSPRAM bank B. One DSPRAM bank + * entry is 4 bytes. + */ +#define SLOT_SZ 8 +static inline unsigned int decode_dspram_size(unsigned int size) +{ + unsigned int _sz = size & 0x7f; + + return _sz * SLOT_SZ * 4; +} + +static void dspram_save(struct meta_ext_context *dsp_ctx, + unsigned int ramA_sz, unsigned int ramB_sz) +{ + unsigned int ram_sz[2]; + int i; + + ram_sz[0] = ramA_sz; + ram_sz[1] = ramB_sz; + + for (i = 0; i < 2; i++) { + if (ram_sz[i] != 0) { + unsigned int sz; + + if (i == 0) + sz = decode_dspram_size(ram_sz[i] >> 8); + else + sz = decode_dspram_size(ram_sz[i]); + + if (dsp_ctx->ram[i] == NULL) { + dsp_ctx->ram[i] = kmalloc(sz, GFP_KERNEL); + + if (dsp_ctx->ram[i] == NULL) + panic("couldn't save DSP context"); + } else { + if (ram_sz[i] > dsp_ctx->ram_sz[i]) { + kfree(dsp_ctx->ram[i]); + + dsp_ctx->ram[i] = kmalloc(sz, + GFP_KERNEL); + + if (dsp_ctx->ram[i] == NULL) + panic("couldn't save DSP context"); + } + } + + if (i == 0) + __TBIDspramSaveA(ram_sz[i], dsp_ctx->ram[i]); + else + __TBIDspramSaveB(ram_sz[i], dsp_ctx->ram[i]); + + dsp_ctx->ram_sz[i] = ram_sz[i]; + } + } +} +#endif /* CONFIG_METAG_DSP */ + +/* + * Allow interrupts to be nested and save any "extended" register + * context state, e.g. DSP regs and RAMs. + */ +static void nest_interrupts(TBIRES State, unsigned long mask) +{ +#ifdef CONFIG_METAG_DSP + struct meta_ext_context *dsp_ctx; + unsigned int D0_8; + + /* + * D0.8 may contain an ECH encoding. The upper 16 bits + * tell us what DSP resources the current process is + * using. 
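decode_dspram_size() above converts the 7-bit ECH field to bytes: the field counts groups of four slots, and a slot is one 4-byte entry from bank A plus one from bank B, hence field * 4 * 8. One worked value:

#include <stdio.h>

#define SLOT_SZ 8       /* bytes: one bank-A entry plus one bank-B entry */

static unsigned int decode_dspram_size(unsigned int field)
{
        return (field & 0x7f) * SLOT_SZ * 4;    /* the field counts slots/4 */
}

int main(void)
{
        printf("%u bytes\n", decode_dspram_size(2));    /* 8 slots = 64 */
        return 0;
}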
OR the bits into the SaveMask so that + * __TBINestInts() knows what resources to save as + * part of this context. + * + * Don't save the context if we're nesting interrupts in the + * kernel because the kernel doesn't use DSP hardware. + */ + D0_8 = __core_reg_get(D0.8); + + if (D0_8 && (State.Sig.SaveMask & TBICTX_PRIV_BIT)) { + State.Sig.SaveMask |= (D0_8 >> 16); + + dsp_ctx = current->thread.dsp_context; + if (dsp_ctx == NULL) { + dsp_ctx = kzalloc(sizeof(*dsp_ctx), GFP_KERNEL); + if (dsp_ctx == NULL) + panic("couldn't save DSP context: ENOMEM"); + + current->thread.dsp_context = dsp_ctx; + } + + current->thread.user_flags |= (D0_8 & 0xffff0000); + __TBINestInts(State, &dsp_ctx->regs, mask); + dspram_save(dsp_ctx, D0_8 & 0x7f00, D0_8 & 0x007f); + } else + __TBINestInts(State, NULL, mask); +#else + __TBINestInts(State, NULL, mask); +#endif +} + +void head_end(TBIRES State, unsigned long mask) +{ + unsigned int savemask = (unsigned short)State.Sig.SaveMask; + unsigned int ctx_savemask = (unsigned short)State.Sig.pCtx->SaveMask; + + if (savemask & TBICTX_PRIV_BIT) { + ctx_savemask |= TBICTX_PRIV_BIT; + current->thread.user_flags = savemask; + } + + /* Always undo the sleep bit */ + ctx_savemask &= ~TBICTX_WAIT_BIT; + + /* Always save the catch buffer and RD pipe if they are dirty */ + savemask |= TBICTX_XCBF_BIT; + + /* Only save the catch and RD if we have not already done so. + * Note - the RD bits are in the pCtx only, and not in the + * State.SaveMask. + */ + if ((savemask & TBICTX_CBUF_BIT) || + (ctx_savemask & TBICTX_CBRP_BIT)) { + /* Have we already saved the buffers though? + * - See TestTrack 5071 */ + if (ctx_savemask & TBICTX_XCBF_BIT) { + /* Strip off the bits so the call to __TBINestInts + * won't save the buffers again. */ + savemask &= ~TBICTX_CBUF_BIT; + ctx_savemask &= ~TBICTX_CBRP_BIT; + } + } + +#ifdef CONFIG_METAG_META21 + { + unsigned int depth, txdefr; + + /* + * Save TXDEFR state. + * + * The process may have been interrupted after a LNKSET, but + * before it could read the DEFR state, so we mustn't lose that + * state or it could end up retrying an atomic operation that + * succeeded. + * + * All interrupts are disabled at this point so we + * don't need to perform any locking. We must do this + * dance before we use LNKGET or LNKSET. + */ + BUG_ON(current->thread.int_depth > HARDIRQ_BITS); + + depth = current->thread.int_depth++; + + txdefr = __core_reg_get(TXDEFR); + + txdefr &= TXDEFR_BUS_STATE_BITS; + if (txdefr & TXDEFR_LNKSET_SUCCESS) + current->thread.txdefr_failure &= ~(1 << depth); + else + current->thread.txdefr_failure |= (1 << depth); + } +#endif + + State.Sig.SaveMask = savemask; + State.Sig.pCtx->SaveMask = ctx_savemask; + + nest_interrupts(State, mask); + +#ifdef CONFIG_METAG_POISON_CATCH_BUFFERS + /* Poison the catch registers. This shows up any mistakes we have + * made in their handling MUCH quicker. 
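The TXDEFR bookkeeping above keeps one LNKSET success/failure bit per interrupt nesting depth in thread.txdefr_failure. The same scheme in isolation, with register access replaced by plain variables:

#include <stdio.h>

static unsigned int txdefr_failure;
static unsigned int int_depth;

static void save_lnkset_state(int success)      /* cf. head_end() */
{
        unsigned int depth = int_depth++;

        if (success)
                txdefr_failure &= ~(1u << depth);
        else
                txdefr_failure |= 1u << depth;
}

static int restore_lnkset_state(void)           /* cf. tail_end_sys() */
{
        unsigned int depth = --int_depth;

        return !(txdefr_failure & (1u << depth));
}

int main(void)
{
        int inner, outer;

        save_lnkset_state(1);   /* interrupted context had succeeded */
        save_lnkset_state(0);   /* nested interrupt saw a failure */
        inner = restore_lnkset_state();
        outer = restore_lnkset_state();
        printf("%d %d\n", inner, outer);        /* 0 1 */
        return 0;
}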
+ */ + __core_reg_set(TXCATCH0, 0x87650021); + __core_reg_set(TXCATCH1, 0x87654322); + __core_reg_set(TXCATCH2, 0x87654323); + __core_reg_set(TXCATCH3, 0x87654324); +#endif /* CONFIG_METAG_POISON_CATCH_BUFFERS */ +} + +TBIRES tail_end_sys(TBIRES State, int syscall, int *restart) +{ + struct pt_regs *regs = (struct pt_regs *)State.Sig.pCtx; + unsigned long flags; + + local_irq_disable(); + + if (user_mode(regs)) { + flags = current_thread_info()->flags; + if (flags & _TIF_WORK_MASK && + do_work_pending(regs, flags, syscall)) { + *restart = 1; + return State; + } + +#ifdef CONFIG_METAG_FPU + if (current->thread.fpu_context && + current->thread.fpu_context->needs_restore) { + __TBICtxFPURestore(State, current->thread.fpu_context); + /* + * Clearing this bit ensures the FP unit is not made + * active again unless it is used. + */ + State.Sig.SaveMask &= ~TBICTX_FPAC_BIT; + current->thread.fpu_context->needs_restore = false; + } + State.Sig.TrigMask |= TBI_TRIG_BIT(TBID_SIGNUM_DFR); +#endif + } + + /* TBI will turn interrupts back on at some point. */ + if (!irqs_disabled_flags((unsigned long)State.Sig.TrigMask)) + trace_hardirqs_on(); + +#ifdef CONFIG_METAG_DSP + /* + * If we previously saved an extended context then restore it + * now. Otherwise, clear D0.8 because this process is not + * using DSP hardware. + */ + if (State.Sig.pCtx->SaveMask & TBICTX_XEXT_BIT) { + unsigned int D0_8; + struct meta_ext_context *dsp_ctx = current->thread.dsp_context; + + /* Make sure we're going to return to userland. */ + BUG_ON(current->thread.int_depth != 1); + + if (dsp_ctx->ram_sz[0] > 0) + __TBIDspramRestoreA(dsp_ctx->ram_sz[0], + dsp_ctx->ram[0]); + if (dsp_ctx->ram_sz[1] > 0) + __TBIDspramRestoreB(dsp_ctx->ram_sz[1], + dsp_ctx->ram[1]); + + State.Sig.SaveMask |= State.Sig.pCtx->SaveMask; + __TBICtxRestore(State, current->thread.dsp_context); + D0_8 = __core_reg_get(D0.8); + D0_8 |= current->thread.user_flags & 0xffff0000; + D0_8 |= (dsp_ctx->ram_sz[1] | dsp_ctx->ram_sz[0]) & 0xffff; + __core_reg_set(D0.8, D0_8); + } else + __core_reg_set(D0.8, 0); +#endif /* CONFIG_METAG_DSP */ + +#ifdef CONFIG_METAG_META21 + { + unsigned int depth, txdefr; + + /* + * If there hasn't been a LNKSET since the last LNKGET then the + * link flag will be set, causing the next LNKSET to succeed if + * the addresses match. The two LNK operations may not be a pair + * (e.g. see atomic_read()), so the LNKSET should fail. + * We use a conditional-never LNKSET to clear the link flag + * without side effects. + */ + asm volatile("LNKSETDNV [D0Re0],D0Re0"); + + depth = --current->thread.int_depth; + + BUG_ON(user_mode(regs) && depth); + + txdefr = __core_reg_get(TXDEFR); + + txdefr &= ~TXDEFR_BUS_STATE_BITS; + + /* Do we need to restore a failure code into TXDEFR? */ + if (current->thread.txdefr_failure & (1 << depth)) + txdefr |= (TXDEFR_LNKSET_FAILURE | TXDEFR_BUS_TRIG_BIT); + else + txdefr |= (TXDEFR_LNKSET_SUCCESS | TXDEFR_BUS_TRIG_BIT); + + __core_reg_set(TXDEFR, txdefr); + } +#endif + return State; +} + +#ifdef CONFIG_SMP +/* + * If we took an interrupt in the middle of __kuser_get_tls then we need + * to rewind the PC to the start of the function in case the process + * gets migrated to another thread (SMP only) and it reads the wrong tls + * data. 
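Stripped of the gateway-page address arithmetic, the rewind described above is just a range check on the saved PC: resume at the start of the restartable sequence if the interrupt landed inside it. Addresses below are invented:

#include <stdio.h>

static unsigned long fixup_pc(unsigned long pc,
                              unsigned long start, unsigned long end)
{
        return (pc >= start && pc < end) ? start : pc;
}

int main(void)
{
        printf("%#lx %#lx\n",
               fixup_pc(0x104, 0x100, 0x110),   /* inside:  rewound to 0x100 */
               fixup_pc(0x120, 0x100, 0x110));  /* outside: untouched */
        return 0;
}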
+ */ +static inline void _restart_critical_section(TBIRES State) +{ + unsigned long get_tls_start; + unsigned long get_tls_end; + + get_tls_start = (unsigned long)__kuser_get_tls - + (unsigned long)&__user_gateway_start; + + get_tls_start += USER_GATEWAY_PAGE; + + get_tls_end = (unsigned long)__kuser_get_tls_end - + (unsigned long)&__user_gateway_start; + + get_tls_end += USER_GATEWAY_PAGE; + + if ((State.Sig.pCtx->CurrPC >= get_tls_start) && + (State.Sig.pCtx->CurrPC < get_tls_end)) + State.Sig.pCtx->CurrPC = get_tls_start; +} +#else +/* + * If we took an interrupt in the middle of + * __kuser_cmpxchg then we need to rewind the PC to the + * start of the function. + */ +static inline void _restart_critical_section(TBIRES State) +{ + unsigned long cmpxchg_start; + unsigned long cmpxchg_end; + + cmpxchg_start = (unsigned long)__kuser_cmpxchg - + (unsigned long)&__user_gateway_start; + + cmpxchg_start += USER_GATEWAY_PAGE; + + cmpxchg_end = (unsigned long)__kuser_cmpxchg_end - + (unsigned long)&__user_gateway_start; + + cmpxchg_end += USER_GATEWAY_PAGE; + + if ((State.Sig.pCtx->CurrPC >= cmpxchg_start) && + (State.Sig.pCtx->CurrPC < cmpxchg_end)) + State.Sig.pCtx->CurrPC = cmpxchg_start; +} +#endif + +/* Used by kick_handler() */ +void restart_critical_section(TBIRES State) +{ + _restart_critical_section(State); +} + +TBIRES trigger_handler(TBIRES State, int SigNum, int Triggers, int Inst, + PTBI pTBI) +{ + head_end(State, ~INTS_OFF_MASK); + + /* If we interrupted user code, handle any critical sections. */ + if (State.Sig.SaveMask & TBICTX_PRIV_BIT) + _restart_critical_section(State); + + trace_hardirqs_off(); + + do_IRQ(SigNum, (struct pt_regs *)State.Sig.pCtx); + + return tail_end(State); +} + +static unsigned int load_fault(PTBICTXEXTCB0 pbuf) +{ + return pbuf->CBFlags & TXCATCH0_READ_BIT; +} + +static unsigned long fault_address(PTBICTXEXTCB0 pbuf) +{ + return pbuf->CBAddr; +} + +static void unhandled_fault(struct pt_regs *regs, unsigned long addr, + int signo, int code, int trapno) +{ + if (user_mode(regs)) { + siginfo_t info; + + if (show_unhandled_signals && unhandled_signal(current, signo) + && printk_ratelimit()) { + + pr_info("pid %d unhandled fault: pc 0x%08x, addr 0x%08lx, trap %d (%s)\n", + current->pid, regs->ctx.CurrPC, addr, + trapno, trap_name(trapno)); + print_vma_addr(" in ", regs->ctx.CurrPC); + print_vma_addr(" rtp in ", regs->ctx.DX[4].U1); + printk("\n"); + show_regs(regs); + } + + info.si_signo = signo; + info.si_errno = 0; + info.si_code = code; + info.si_addr = (__force void __user *)addr; + info.si_trapno = trapno; + force_sig_info(signo, &info, current); + } else { + die("Oops", regs, trapno, addr); + } +} + +static int handle_data_fault(PTBICTXEXTCB0 pcbuf, struct pt_regs *regs, + unsigned int data_address, int trapno) +{ + int ret; + + ret = do_page_fault(regs, data_address, !load_fault(pcbuf), trapno); + + return ret; +} + +static unsigned long get_inst_fault_address(struct pt_regs *regs) +{ + return regs->ctx.CurrPC; +} + +TBIRES fault_handler(TBIRES State, int SigNum, int Triggers, + int Inst, PTBI pTBI) +{ + struct pt_regs *regs = (struct pt_regs *)State.Sig.pCtx; + PTBICTXEXTCB0 pcbuf = (PTBICTXEXTCB0)&regs->extcb0; + unsigned long data_address; + + head_end(State, ~INTS_OFF_MASK); + + /* Hardware breakpoint or data watch */ + if ((SigNum == TBIXXF_SIGNUM_IHF) || + ((SigNum == TBIXXF_SIGNUM_DHF) && + (pcbuf[0].CBFlags & (TXCATCH0_WATCH1_BIT | + TXCATCH0_WATCH0_BIT)))) { + State = __TBIUnExpXXX(State, SigNum, Triggers, Inst, + pTBI); + return tail_end(State); + 
} + + local_irq_enable(); + + data_address = fault_address(pcbuf); + + switch (SigNum) { + case TBIXXF_SIGNUM_IGF: + /* 1st-level entry invalid (instruction fetch) */ + case TBIXXF_SIGNUM_IPF: { + /* 2nd-level entry invalid (instruction fetch) */ + unsigned long addr = get_inst_fault_address(regs); + do_page_fault(regs, addr, 0, SigNum); + break; + } + + case TBIXXF_SIGNUM_DGF: + /* 1st-level entry invalid (data access) */ + case TBIXXF_SIGNUM_DPF: + /* 2nd-level entry invalid (data access) */ + case TBIXXF_SIGNUM_DWF: + /* Write to read only page */ + handle_data_fault(pcbuf, regs, data_address, SigNum); + break; + + case TBIXXF_SIGNUM_IIF: + /* Illegal instruction */ + unhandled_fault(regs, regs->ctx.CurrPC, SIGILL, ILL_ILLOPC, + SigNum); + break; + + case TBIXXF_SIGNUM_DHF: + /* Unaligned access */ + unhandled_fault(regs, data_address, SIGBUS, BUS_ADRALN, + SigNum); + break; + case TBIXXF_SIGNUM_PGF: + /* Privilege violation */ + unhandled_fault(regs, data_address, SIGSEGV, SEGV_ACCERR, + SigNum); + break; + default: + BUG(); + break; + } + + return tail_end(State); +} + +static bool switch_is_syscall(unsigned int inst) +{ + return inst == __METAG_SW_ENCODING(SYS); +} + +static bool switch_is_legacy_syscall(unsigned int inst) +{ + return inst == __METAG_SW_ENCODING(SYS_LEGACY); +} + +static inline void step_over_switch(struct pt_regs *regs, unsigned int inst) +{ + regs->ctx.CurrPC += 4; +} + +static inline int test_syscall_work(void) +{ + return current_thread_info()->flags & _TIF_WORK_SYSCALL_MASK; +} + +TBIRES switch1_handler(TBIRES State, int SigNum, int Triggers, + int Inst, PTBI pTBI) +{ + struct pt_regs *regs = (struct pt_regs *)State.Sig.pCtx; + unsigned int sysnumber; + unsigned long long a1_a2, a3_a4, a5_a6; + LPSYSCALL syscall_entry; + int restart; + + head_end(State, ~INTS_OFF_MASK); + + /* + * If this is not a syscall SWITCH it could be a breakpoint. + */ + if (!switch_is_syscall(Inst)) { + /* + * Alert the user if they're trying to use legacy system + * calls. This suggests they need to update their C + * library and build against up to date kernel headers. + */ + if (switch_is_legacy_syscall(Inst)) + pr_warn_once("WARNING: A legacy syscall was made. Your userland needs updating.\n"); + /* + * We don't know how to handle the SWITCH and cannot + * safely ignore it, so treat all unknown switches + * (including breakpoints) as traps. + */ + force_sig(SIGTRAP, current); + return tail_end(State); + } + + local_irq_enable(); + +restart_syscall: + restart = 0; + sysnumber = regs->ctx.DX[0].U1; + + if (test_syscall_work()) + sysnumber = syscall_trace_enter(regs); + + /* Skip over the SWITCH instruction - or you just get 'stuck' on it! */ + step_over_switch(regs, Inst); + + if (sysnumber >= __NR_syscalls) { + pr_debug("unknown syscall number: %d\n", sysnumber); + syscall_entry = (LPSYSCALL) sys_ni_syscall; + } else { + syscall_entry = (LPSYSCALL) sys_call_table[sysnumber]; + } + + /* Use 64bit loads for speed. 
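+ * Each DX[] slot holds a 32-bit register pair (U0/U1), so one 64-bit + * load fetches two adjacent syscall arguments and all six possible + * arguments arrive in just three loads.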
*/ + a5_a6 = *(unsigned long long *)&regs->ctx.DX[1]; + a3_a4 = *(unsigned long long *)&regs->ctx.DX[2]; + a1_a2 = *(unsigned long long *)&regs->ctx.DX[3]; + + /* here is the actual call to the syscall handler functions */ + regs->ctx.DX[0].U0 = syscall_entry(a1_a2, a3_a4, a5_a6); + + if (test_syscall_work()) + syscall_trace_leave(regs); + + State = tail_end_sys(State, sysnumber, &restart); + /* Handlerless restarts shouldn't go via userland */ + if (restart) + goto restart_syscall; + return State; +} + +TBIRES switchx_handler(TBIRES State, int SigNum, int Triggers, + int Inst, PTBI pTBI) +{ + struct pt_regs *regs = (struct pt_regs *)State.Sig.pCtx; + + /* + * This can be caused by any user process simply executing an unusual + * SWITCH instruction. If there's no DA, __TBIUnExpXXX will cause the + * thread to stop, so signal a SIGTRAP instead. + */ + head_end(State, ~INTS_OFF_MASK); + if (user_mode(regs)) + force_sig(SIGTRAP, current); + else + State = __TBIUnExpXXX(State, SigNum, Triggers, Inst, pTBI); + return tail_end(State); +} + +#ifdef CONFIG_METAG_META21 +TBIRES fpe_handler(TBIRES State, int SigNum, int Triggers, int Inst, PTBI pTBI) +{ + struct pt_regs *regs = (struct pt_regs *)State.Sig.pCtx; + unsigned int error_state = Triggers; + siginfo_t info; + + head_end(State, ~INTS_OFF_MASK); + + local_irq_enable(); + + info.si_signo = SIGFPE; + + if (error_state & TXSTAT_FPE_INVALID_BIT) + info.si_code = FPE_FLTINV; + else if (error_state & TXSTAT_FPE_DIVBYZERO_BIT) + info.si_code = FPE_FLTDIV; + else if (error_state & TXSTAT_FPE_OVERFLOW_BIT) + info.si_code = FPE_FLTOVF; + else if (error_state & TXSTAT_FPE_UNDERFLOW_BIT) + info.si_code = FPE_FLTUND; + else if (error_state & TXSTAT_FPE_INEXACT_BIT) + info.si_code = FPE_FLTRES; + else + info.si_code = 0; + info.si_errno = 0; + info.si_addr = (__force void __user *)regs->ctx.CurrPC; + force_sig_info(SIGFPE, &info, current); + + return tail_end(State); +} +#endif + +#ifdef CONFIG_METAG_SUSPEND_MEM +struct traps_context { + PTBIAPIFN fnSigs[TBID_SIGNUM_MAX + 1]; +}; + +static struct traps_context *metag_traps_context; + +int traps_save_context(void) +{ + unsigned long cpu = smp_processor_id(); + PTBI _pTBI = per_cpu(pTBI, cpu); + struct traps_context *context; + + context = kzalloc(sizeof(*context), GFP_ATOMIC); + if (!context) + return -ENOMEM; + + memcpy(context->fnSigs, (void *)_pTBI->fnSigs, sizeof(context->fnSigs)); + + metag_traps_context = context; + return 0; +} + +int traps_restore_context(void) +{ + unsigned long cpu = smp_processor_id(); + PTBI _pTBI = per_cpu(pTBI, cpu); + struct traps_context *context = metag_traps_context; + + metag_traps_context = NULL; + + memcpy((void *)_pTBI->fnSigs, context->fnSigs, sizeof(context->fnSigs)); + + kfree(context); + return 0; +} +#endif + +#ifdef CONFIG_SMP +static inline unsigned int _get_trigger_mask(void) +{ + unsigned long cpu = smp_processor_id(); + return per_cpu(trigger_mask, cpu); +} + +unsigned int get_trigger_mask(void) +{ + return _get_trigger_mask(); +} +EXPORT_SYMBOL(get_trigger_mask); + +static void set_trigger_mask(unsigned int mask) +{ + unsigned long cpu = smp_processor_id(); + per_cpu(trigger_mask, cpu) = mask; +} + +void arch_local_irq_enable(void) +{ + preempt_disable(); + arch_local_irq_restore(_get_trigger_mask()); + preempt_enable_no_resched(); +} +EXPORT_SYMBOL(arch_local_irq_enable); +#else +static void set_trigger_mask(unsigned int mask) +{ + global_trigger_mask = mask; +} +#endif + +void per_cpu_trap_init(unsigned long cpu) +{ + TBIRES int_context; + unsigned int thread = 
cpu_2_hwthread_id[cpu]; + + set_trigger_mask(TBI_INTS_INIT(thread) | /* interrupts */ + TBI_TRIG_BIT(TBID_SIGNUM_LWK) | /* low level kick */ + TBI_TRIG_BIT(TBID_SIGNUM_SW1)); + + /* non-priv - use current stack */ + int_context.Sig.pCtx = NULL; + /* Start with interrupts off */ + int_context.Sig.TrigMask = INTS_OFF_MASK; + int_context.Sig.SaveMask = 0; + + /* And call __TBIASyncTrigger() */ + __TBIASyncTrigger(int_context); +} + +void __init trap_init(void) +{ + unsigned long cpu = smp_processor_id(); + PTBI _pTBI = per_cpu(pTBI, cpu); + + _pTBI->fnSigs[TBID_SIGNUM_XXF] = fault_handler; + _pTBI->fnSigs[TBID_SIGNUM_SW0] = switchx_handler; + _pTBI->fnSigs[TBID_SIGNUM_SW1] = switch1_handler; + _pTBI->fnSigs[TBID_SIGNUM_SW2] = switchx_handler; + _pTBI->fnSigs[TBID_SIGNUM_SW3] = switchx_handler; + _pTBI->fnSigs[TBID_SIGNUM_LWK] = kick_handler; + +#ifdef CONFIG_METAG_META21 + _pTBI->fnSigs[TBID_SIGNUM_DFR] = __TBIHandleDFR; + _pTBI->fnSigs[TBID_SIGNUM_FPE] = fpe_handler; +#endif + + per_cpu_trap_init(cpu); +} + +void tbi_startup_interrupt(int irq) +{ + unsigned long cpu = smp_processor_id(); + PTBI _pTBI = per_cpu(pTBI, cpu); + + BUG_ON(irq > TBID_SIGNUM_MAX); + + /* For TR1 and TR2, the thread id is encoded in the irq number */ + if (irq >= TBID_SIGNUM_T10 && irq < TBID_SIGNUM_TR3) + cpu = hwthread_id_2_cpu[(irq - TBID_SIGNUM_T10) % 4]; + + set_trigger_mask(get_trigger_mask() | TBI_TRIG_BIT(irq)); + + _pTBI->fnSigs[irq] = trigger_handler; +} + +void tbi_shutdown_interrupt(int irq) +{ + unsigned long cpu = smp_processor_id(); + PTBI _pTBI = per_cpu(pTBI, cpu); + + BUG_ON(irq > TBID_SIGNUM_MAX); + + set_trigger_mask(get_trigger_mask() & ~TBI_TRIG_BIT(irq)); + + _pTBI->fnSigs[irq] = __TBIUnExpXXX; +} + +int ret_from_fork(TBIRES arg) +{ + struct task_struct *prev = arg.Switch.pPara; + struct task_struct *tsk = current; + struct pt_regs *regs = task_pt_regs(tsk); + int (*fn)(void *); + TBIRES Next; + + schedule_tail(prev); + + if (tsk->flags & PF_KTHREAD) { + fn = (void *)regs->ctx.DX[4].U1; + BUG_ON(!fn); + + fn((void *)regs->ctx.DX[3].U1); + } + + if (test_syscall_work()) + syscall_trace_leave(regs); + + preempt_disable(); + + Next.Sig.TrigMask = get_trigger_mask(); + Next.Sig.SaveMask = 0; + Next.Sig.pCtx = &regs->ctx; + + set_gateway_tls(current->thread.tls_ptr); + + preempt_enable_no_resched(); + + /* And interrupts should come back on when we resume the real usermode + * code. Call __TBIASyncResume() + */ + __TBIASyncResume(tail_end(Next)); + /* ASyncResume should NEVER return */ + BUG(); + return 0; +} + +void show_trace(struct task_struct *tsk, unsigned long *sp, + struct pt_regs *regs) +{ + unsigned long addr; +#ifdef CONFIG_FRAME_POINTER + unsigned long fp, fpnew; + unsigned long stack; +#endif + + if (regs && user_mode(regs)) + return; + + printk("\nCall trace: "); +#ifdef CONFIG_KALLSYMS + printk("\n"); +#endif + + if (!tsk) + tsk = current; + +#ifdef CONFIG_FRAME_POINTER + if (regs) { + print_ip_sym(regs->ctx.CurrPC); + fp = regs->ctx.AX[1].U0; + } else { + fp = __core_reg_get(A0FrP); + } + + /* detect when the frame pointer has been used for other purposes and + * doesn't point to the stack (it may point completely elsewhere which + * kstack_end may not detect). 
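+ * The walk below therefore bails out as soon as a candidate frame + * leaves the task's stack page or its saved return address no longer + * points at kernel text.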
+ */ + stack = (unsigned long)task_stack_page(tsk); + while (fp >= stack && fp + 8 <= stack + THREAD_SIZE) { + addr = __raw_readl((unsigned long *)(fp + 4)) - 4; + if (kernel_text_address(addr)) + print_ip_sym(addr); + else + break; + /* stack grows up, so frame pointers must decrease */ + fpnew = __raw_readl((unsigned long *)(fp + 0)); + if (fpnew >= fp) + break; + fp = fpnew; + } +#else + while (!kstack_end(sp)) { + addr = (*sp--) - 4; + if (kernel_text_address(addr)) + print_ip_sym(addr); + } +#endif + + printk("\n"); + + debug_show_held_locks(tsk); +} + +void show_stack(struct task_struct *tsk, unsigned long *sp) +{ + if (!tsk) + tsk = current; + if (tsk == current) + sp = (unsigned long *)current_stack_pointer; + else + sp = (unsigned long *)tsk->thread.kernel_context->AX[0].U0; + + show_trace(tsk, sp, NULL); +} diff --git a/arch/metag/kernel/user_gateway.S b/arch/metag/kernel/user_gateway.S new file mode 100644 index 000000000..7167f3e8d --- /dev/null +++ b/arch/metag/kernel/user_gateway.S @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2010 Imagination Technologies Ltd. + * + * This file contains code that can be accessed from userspace and can + * access certain kernel data structures without the overhead of a system + * call. + */ + +#include +#include + +/* + * User helpers. + * + * These are segments of kernel-provided user code reachable from user space + * at a fixed address in kernel memory. This is used to provide user space + * with some operations which require kernel help because of unimplemented + * native features and/or instructions in some Meta CPUs. The idea is for + * this code to be executed directly in user mode for best efficiency, but + * it is too intimate with its kernel counterpart to be left to user + * libraries. The kernel reserves the right to change this code as needed + * without warning. Only the entry points and their results are guaranteed + * to be stable. + * + * Each segment is 64-byte aligned. This mechanism should be used only + * for things that are really small and justified, and not be abused freely. + */ + .text + .global ___user_gateway_start +___user_gateway_start: + + /* get_tls + * Offset: 0 + * Description: Get the TLS pointer for this process. + */ + .global ___kuser_get_tls + .type ___kuser_get_tls,function +___kuser_get_tls: + MOVT D1Ar1,#HI(USER_GATEWAY_PAGE + USER_GATEWAY_TLS) + ADD D1Ar1,D1Ar1,#LO(USER_GATEWAY_PAGE + USER_GATEWAY_TLS) + MOV D1Ar3,TXENABLE + AND D1Ar3,D1Ar3,#(TXENABLE_THREAD_BITS) + LSR D1Ar3,D1Ar3,#(TXENABLE_THREAD_S - 2) + GETD D0Re0,[D1Ar1+D1Ar3] +___kuser_get_tls_end: /* Beyond this point the read will complete */ + MOV PC,D1RtP + .size ___kuser_get_tls,.-___kuser_get_tls + .global ___kuser_get_tls_end + + /* cmpxchg + * Offset: 64 + * Description: Replace the value at 'ptr' with 'newval' if the current + * value is 'oldval'. Return zero if we succeeded, + * non-zero otherwise. + * + * Reference prototype: + * + * int __kuser_cmpxchg(int oldval, int newval, unsigned long *ptr) + * + */ + .balign 64 + .global ___kuser_cmpxchg + .type ___kuser_cmpxchg,function +___kuser_cmpxchg: +#ifdef CONFIG_SMP + /* + * We must use LNKGET/LNKSET with an SMP kernel because the other method + * does not provide atomicity across multiple CPUs. 
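+ * The DEFR of TXSTAT below reads back the deferred bus state so we + * can tell whether the conditional LNKSET actually completed; any + * other status means the link was lost and the compare-and-swap is + * retried from the LNKGET.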
+ */ +0: LNKGETD D0Re0,[D1Ar3] + CMP D0Re0,D1Ar1 + LNKSETDZ [D1Ar3],D0Ar2 + BNZ 1f + DEFR D0Re0,TXSTAT + ANDT D0Re0,D0Re0,#HI(0x3f000000) + CMPT D0Re0,#HI(0x02000000) + BNE 0b +#ifdef CONFIG_METAG_LNKGET_AROUND_CACHE + DCACHE [D1Ar3], D0Re0 +#endif +1: MOV D0Re0,#1 + XORZ D0Re0,D0Re0,D0Re0 + MOV PC,D1RtP +#else + GETD D0Re0,[D1Ar3] + CMP D0Re0,D1Ar1 + SETDZ [D1Ar3],D0Ar2 +___kuser_cmpxchg_end: /* Beyond this point the write will complete */ + MOV D0Re0,#1 + XORZ D0Re0,D0Re0,D0Re0 + MOV PC,D1RtP +#endif /* CONFIG_SMP */ + .size ___kuser_cmpxchg,.-___kuser_cmpxchg + .global ___kuser_cmpxchg_end + + .global ___user_gateway_end +___user_gateway_end: diff --git a/arch/metag/kernel/vmlinux.lds.S b/arch/metag/kernel/vmlinux.lds.S new file mode 100644 index 000000000..e12055e88 --- /dev/null +++ b/arch/metag/kernel/vmlinux.lds.S @@ -0,0 +1,71 @@ +/* ld script to make Meta Linux kernel */ + +#include +#include +#include + +#include + +OUTPUT_FORMAT("elf32-metag", "elf32-metag", "elf32-metag") +OUTPUT_ARCH(metag) +ENTRY(__start) + +_jiffies = _jiffies_64; +SECTIONS +{ + . = CONFIG_PAGE_OFFSET; + _text = .; + __text = .; + __stext = .; + HEAD_TEXT_SECTION + .text : { + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.text.*) + *(.gnu.warning) + } + + __etext = .; /* End of text section */ + + __sdata = .; + RO_DATA_SECTION(PAGE_SIZE) + RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + __edata = .; /* End of data section */ + + EXCEPTION_TABLE(16) + NOTES + + . = ALIGN(PAGE_SIZE); /* Init code and data */ + ___init_begin = .; + INIT_TEXT_SECTION(PAGE_SIZE) + INIT_DATA_SECTION(16) + + .init.arch.info : { + ___arch_info_begin = .; + *(.arch.info.init) + ___arch_info_end = .; + } + + PERCPU_SECTION(L1_CACHE_BYTES) + + ___init_end = .; + + BSS_SECTION(0, PAGE_SIZE, 0) + + __end = .; + + . = ALIGN(PAGE_SIZE); + __heap_start = .; + + DWARF_DEBUG + + /* When something in the kernel is NOT compiled as a module, the + * module cleanup code and data are put into these segments. Both + * can then be thrown away, as cleanup code is never called unless + * it's a module. + */ + DISCARDS +}