6 files changed, 1143 insertions, 0 deletions
diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile
new file mode 100644
index 000000000..f94ecaffa
--- /dev/null
+++ b/arch/s390/numa/Makefile
@@ -0,0 +1,3 @@
+obj-y			+= numa.o
+obj-y			+= toptree.o
+obj-$(CONFIG_NUMA_EMU)	+= mode_emu.o
diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c
new file mode 100644
index 000000000..30b2698a2
--- /dev/null
+++ b/arch/s390/numa/mode_emu.c
@@ -0,0 +1,530 @@
+/*
+ * NUMA support for s390
+ *
+ * NUMA emulation (aka fake NUMA) distributes the available memory to nodes
+ * without using real topology information about the physical memory of the
+ * machine.
+ *
+ * It distributes the available CPUs to nodes while respecting the original
+ * machine topology information. This is done by trying to avoid to separate
+ * CPUs which reside on the same book or even on the same MC.
+ *
+ * Because the current Linux scheduler code requires a stable cpu to node
+ * mapping, cores are pinned to nodes when the first CPU thread is set online.
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#define KMSG_COMPONENT "numa_emu"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/cpumask.h>
+#include <linux/memblock.h>
+#include <linux/node.h>
+#include <linux/memory.h>
+#include <linux/slab.h>
+#include <asm/smp.h>
+#include <asm/topology.h>
+#include "numa_mode.h"
+#include "toptree.h"
+
+/* Distances between the different system components */
+#define DIST_EMPTY	0
+#define DIST_CORE	1
+#define DIST_MC		2
+#define DIST_BOOK	3
+#define DIST_MAX	4
+
+/* Node distance reported to common code */
+#define EMU_NODE_DIST	10
+
+/* Node ID for free (not yet pinned) cores */
+#define NODE_ID_FREE	-1
+
+/* Different levels of toptree */
+enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY};
+
+/* The two toptree IDs */
+enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};
+
+/* Number of NUMA nodes */
+static int emu_nodes = 1;
+/* NUMA stripe size */
+static unsigned long emu_size;
+
+/*
+ * Node to core pinning information updates are protected by
+ * "sched_domains_mutex".
+ */
+static struct {
+	s32 to_node_id[CONFIG_NR_CPUS];	/* Pinned core to node mapping */
+	int total;			/* Total number of pinned cores */
+	int per_node_target;		/* Cores per node without extra cores */
+	int per_node[MAX_NUMNODES];	/* Number of cores pinned to node */
+} *emu_cores;
+
+/*
+ * Pin a core to a node
+ */
+static void pin_core_to_node(int core_id, int node_id)
+{
+	if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) {
+		emu_cores->per_node[node_id]++;
+		emu_cores->to_node_id[core_id] = node_id;
+		emu_cores->total++;
+	} else {
+		WARN_ON(emu_cores->to_node_id[core_id] != node_id);
+	}
+}
+
+/*
+ * Number of pinned cores of a node
+ */
+static int cores_pinned(struct toptree *node)
+{
+	return emu_cores->per_node[node->id];
+}
+
+/*
+ * ID of the node where the core is pinned (or NODE_ID_FREE)
+ */
+static int core_pinned_to_node_id(struct toptree *core)
+{
+	return emu_cores->to_node_id[core->id];
+}
+
+/*
+ * Number of cores in the tree that are not yet pinned
+ */
+static int cores_free(struct toptree *tree)
+{
+	struct toptree *core;
+	int count = 0;
+
+	toptree_for_each(core, tree, CORE) {
+		if (core_pinned_to_node_id(core) == NODE_ID_FREE)
+			count++;
+	}
+	return count;
+}
+
+/*
+ * Return node of core
+ */
+static struct toptree *core_node(struct toptree *core)
+{
+	return core->parent->parent->parent;
+}
+
+/*
+ * Return book of core
+ */
+static struct toptree *core_book(struct toptree *core)
+{
+	return core->parent->parent;
+}
+
+/*
+ * Return mc of core
+ */
+static struct toptree *core_mc(struct toptree *core)
+{
+	return core->parent;
+}
+
+/*
+ * Distance between two cores
+ */
+static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
+{
+	if (core_book(core1)->id != core_book(core2)->id)
+		return DIST_BOOK;
+	if (core_mc(core1)->id != core_mc(core2)->id)
+		return DIST_MC;
+	/* Same core or sibling on same MC */
+	return DIST_CORE;
+}
+
+/*
+ * Distance of a node to a core
+ */
+static int dist_node_to_core(struct toptree *node, struct toptree *core)
+{
+	struct toptree *core_node;
+	int dist_min = DIST_MAX;
+
+	toptree_for_each(core_node, node, CORE)
+		dist_min = min(dist_min, dist_core_to_core(core_node, core));
+	return dist_min == DIST_MAX ? DIST_EMPTY : dist_min;
+}
+
+/*
+ * Unify will delete empty nodes, therefore recreate nodes.
+ */
+static void toptree_unify_tree(struct toptree *tree)
+{
+	int nid;
+
+	toptree_unify(tree);
+	for (nid = 0; nid < emu_nodes; nid++)
+		toptree_get_child(tree, nid);
+}
+
+/*
+ * Find the best/nearest node for a given core and ensure that no node
+ * gets more than "emu_cores->per_node_target + extra" cores.
+ */
+static struct toptree *node_for_core(struct toptree *numa, struct toptree *core,
+				     int extra)
+{
+	struct toptree *node, *node_best = NULL;
+	int dist_cur, dist_best, cores_target;
+
+	cores_target = emu_cores->per_node_target + extra;
+	dist_best = DIST_MAX;
+	node_best = NULL;
+	toptree_for_each(node, numa, NODE) {
+		/* Already pinned cores must use their nodes */
+		if (core_pinned_to_node_id(core) == node->id) {
+			node_best = node;
+			break;
+		}
+		/* Skip nodes that already have enough cores */
+		if (cores_pinned(node) >= cores_target)
+			continue;
+		dist_cur = dist_node_to_core(node, core);
+		if (dist_cur < dist_best) {
+			dist_best = dist_cur;
+			node_best = node;
+		}
+	}
+	return node_best;
+}
+
+/*
+ * Find the best node for each core with respect to "extra" core count
+ */
+static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys,
+				   int extra)
+{
+	struct toptree *node, *core, *tmp;
+
+	toptree_for_each_safe(core, tmp, phys, CORE) {
+		node = node_for_core(numa, core, extra);
+		if (!node)
+			return;
+		toptree_move(core, node);
+		pin_core_to_node(core->id, node->id);
+	}
+}
+
+/*
+ * Move structures of given level to specified NUMA node
+ */
+static void move_level_to_numa_node(struct toptree *node, struct toptree *phys,
+				    enum toptree_level level, bool perfect)
+{
+	int cores_free, cores_target = emu_cores->per_node_target;
+	struct toptree *cur, *tmp;
+
+	toptree_for_each_safe(cur, tmp, phys, level) {
+		cores_free = cores_target - toptree_count(node, CORE);
+		if (perfect) {
+			if (cores_free == toptree_count(cur, CORE))
+				toptree_move(cur, node);
+		} else {
+			if (cores_free >= toptree_count(cur, CORE))
+				toptree_move(cur, node);
+		}
+	}
+}
+
+/*
+ * Move structures of a given level to NUMA nodes. If "perfect" is specified
+ * move only perfectly fitting structures. Otherwise move also smaller
+ * than needed structures.
+ */
+static void move_level_to_numa(struct toptree *numa, struct toptree *phys,
+			       enum toptree_level level, bool perfect)
+{
+	struct toptree *node;
+
+	toptree_for_each(node, numa, NODE)
+		move_level_to_numa_node(node, phys, level, perfect);
+}
+
+/*
+ * For the first run try to move the big structures
+ */
+static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
+{
+	struct toptree *core;
+
+	/* Always try to move perfectly fitting structures first */
+	move_level_to_numa(numa, phys, BOOK, true);
+	move_level_to_numa(numa, phys, BOOK, false);
+	move_level_to_numa(numa, phys, MC, true);
+	move_level_to_numa(numa, phys, MC, false);
+	/* Now pin all the moved cores */
+	toptree_for_each(core, numa, CORE)
+		pin_core_to_node(core->id, core_node(core)->id);
+}
+
+/*
+ * Allocate new topology and create required nodes
+ */
+static struct toptree *toptree_new(int id, int nodes)
+{
+	struct toptree *tree;
+	int nid;
+
+	tree = toptree_alloc(TOPOLOGY, id);
+	if (!tree)
+		goto fail;
+	for (nid = 0; nid < nodes; nid++) {
+		if (!toptree_get_child(tree, nid))
+			goto fail;
+	}
+	return tree;
+fail:
+	panic("NUMA emulation could not allocate topology");
+}
+
+/*
+ * Allocate and initialize core to node mapping
+ */
+static void create_core_to_node_map(void)
+{
+	int i;
+
+	emu_cores = kzalloc(sizeof(*emu_cores), GFP_KERNEL);
+	if (emu_cores == NULL)
+		panic("Could not allocate cores to node memory");
+	for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++)
+		emu_cores->to_node_id[i] = NODE_ID_FREE;
+}
+
+/*
+ * Move cores from physical topology into NUMA target topology
+ * and try to keep as much of the physical topology as possible.
+ */
+static struct toptree *toptree_to_numa(struct toptree *phys)
+{
+	static int first = 1;
+	struct toptree *numa;
+	int cores_total;
+
+	cores_total = emu_cores->total + cores_free(phys);
+	emu_cores->per_node_target = cores_total / emu_nodes;
+	numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes);
+	if (first) {
+		toptree_to_numa_first(numa, phys);
+		first = 0;
+	}
+	toptree_to_numa_single(numa, phys, 0);
+	toptree_to_numa_single(numa, phys, 1);
+	toptree_unify_tree(numa);
+
+	WARN_ON(cpumask_weight(&phys->mask));
+	return numa;
+}
+
+/*
+ * Create a toptree out of the physical topology that we got from the hypervisor
+ */
+static struct toptree *toptree_from_topology(void)
+{
+	struct toptree *phys, *node, *book, *mc, *core;
+	struct cpu_topology_s390 *top;
+	int cpu;
+
+	phys = toptree_new(TOPTREE_ID_PHYS, 1);
+
+	for_each_online_cpu(cpu) {
+		top = &per_cpu(cpu_topology, cpu);
+		node = toptree_get_child(phys, 0);
+		book = toptree_get_child(node, top->book_id);
+		mc = toptree_get_child(book, top->socket_id);
+		core = toptree_get_child(mc, top->core_id);
+		if (!book || !mc || !core)
+			panic("NUMA emulation could not allocate memory");
+		cpumask_set_cpu(cpu, &core->mask);
+		toptree_update_mask(mc);
+	}
+	return phys;
+}
+
+/*
+ * Add toptree core to topology and create correct CPU masks
+ */
+static void topology_add_core(struct toptree *core)
+{
+	struct cpu_topology_s390 *top;
+	int cpu;
+
+	for_each_cpu(cpu, &core->mask) {
+		top = &per_cpu(cpu_topology, cpu);
+		cpumask_copy(&top->thread_mask, &core->mask);
+		cpumask_copy(&top->core_mask, &core_mc(core)->mask);
+		cpumask_copy(&top->book_mask, &core_book(core)->mask);
+		cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]);
+		top->node_id = core_node(core)->id;
+	}
+}
+
+/*
+ * Apply toptree to topology and create CPU masks
+ */
+static void toptree_to_topology(struct toptree *numa)
+{
+	struct toptree *core;
+	int i;
+
+	/* Clear all node masks */
+	for (i = 0; i < MAX_NUMNODES; i++)
+		cpumask_clear(&node_to_cpumask_map[i]);
+
+	/* Rebuild all masks */
+	toptree_for_each(core, numa, CORE)
+		topology_add_core(core);
+}
+
+/*
+ * Show the node to core mapping
+ */
+static void print_node_to_core_map(void)
+{
+	int nid, cid;
+
+	if (!numa_debug_enabled)
+		return;
+	printk(KERN_DEBUG "NUMA node to core mapping\n");
+	for (nid = 0; nid < emu_nodes; nid++) {
+		printk(KERN_DEBUG "  node %3d: ", nid);
+		for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) {
+			if (emu_cores->to_node_id[cid] == nid)
+				printk(KERN_CONT "%d ", cid);
+		}
+		printk(KERN_CONT "\n");
+	}
+}
+
+/*
+ * Transfer physical topology into a NUMA topology and modify CPU masks
+ * according to the NUMA topology.
+ *
+ * Must be called with "sched_domains_mutex" lock held.
+ */
+static void emu_update_cpu_topology(void)
+{
+	struct toptree *phys, *numa;
+
+	if (emu_cores == NULL)
+		create_core_to_node_map();
+	phys = toptree_from_topology();
+	numa = toptree_to_numa(phys);
+	toptree_free(phys);
+	toptree_to_topology(numa);
+	toptree_free(numa);
+	print_node_to_core_map();
+}
+
+/*
+ * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum
+ * alignment (needed for memory hotplug).
+ */
+static unsigned long emu_setup_size_adjust(unsigned long size)
+{
+	size = size ? : CONFIG_EMU_SIZE;
+	size = roundup(size, memory_block_size_bytes());
+	return size;
+}
+
+/*
+ * If we have not enough memory for the specified nodes, reduce the node count.
+ */
+static int emu_setup_nodes_adjust(int nodes)
+{
+	int nodes_max;
+
+	nodes_max = memblock.memory.total_size / emu_size;
+	nodes_max = max(nodes_max, 1);
+	if (nodes_max >= nodes)
+		return nodes;
+	pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes);
+	return nodes_max;
+}
+
+/*
+ * Early emu setup
+ */
+static void emu_setup(void)
+{
+	emu_size = emu_setup_size_adjust(emu_size);
+	emu_nodes = emu_setup_nodes_adjust(emu_nodes);
+	pr_info("Creating %d nodes with memory stripe size %ld MB\n",
+		emu_nodes, emu_size >> 20);
+}
+
+/*
+ * Return node id for given page number
+ */
+static int emu_pfn_to_nid(unsigned long pfn)
+{
+	return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes;
+}
+
+/*
+ * Return stripe size
+ */
+static unsigned long emu_align(void)
+{
+	return emu_size;
+}
+
+/*
+ * Return distance between two nodes
+ */
+static int emu_distance(int node1, int node2)
+{
+	return (node1 != node2) * EMU_NODE_DIST;
+}
+
+/*
+ * Define callbacks for generic s390 NUMA infrastructure
+ */
+const struct numa_mode numa_mode_emu = {
+	.name = "emu",
+	.setup = emu_setup,
+	.update_cpu_topology = emu_update_cpu_topology,
+	.__pfn_to_nid = emu_pfn_to_nid,
+	.align = emu_align,
+	.distance = emu_distance,
+};
+
+/*
+ * Kernel parameter: emu_nodes=<n>
+ */
+static int __init early_parse_emu_nodes(char *p)
+{
+	int count;
+
+	if (kstrtoint(p, 0, &count) != 0 || count <= 0)
+		return 0;
+	if (count <= 0)
+		return 0;
+	emu_nodes = min(count, MAX_NUMNODES);
+	return 0;
+}
+early_param("emu_nodes", early_parse_emu_nodes);
+
+/*
+ * Kernel parameter: emu_size=[<n>[k|M|G|T]]
+ */
+static int __init early_parse_emu_size(char *p)
+{
+	emu_size = memparse(p, NULL);
+	return 0;
+}
+early_param("emu_size", early_parse_emu_size);
diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c
new file mode 100644
index 000000000..43f32ce60
--- /dev/null
+++ b/arch/s390/numa/numa.c
@@ -0,0 +1,184 @@
+/*
+ * NUMA support for s390
+ *
+ * Implement NUMA core code.
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#define KMSG_COMPONENT "numa"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/cpumask.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
+#include <linux/node.h>
+
+#include <asm/numa.h>
+#include "numa_mode.h"
+
+pg_data_t *node_data[MAX_NUMNODES];
+EXPORT_SYMBOL(node_data);
+
+cpumask_t node_to_cpumask_map[MAX_NUMNODES];
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+const struct numa_mode numa_mode_plain = {
+	.name = "plain",
+};
+
+static const struct numa_mode *mode = &numa_mode_plain;
+
+int numa_pfn_to_nid(unsigned long pfn)
+{
+	return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0;
+}
+
+void numa_update_cpu_topology(void)
+{
+	if (mode->update_cpu_topology)
+		mode->update_cpu_topology();
+}
+
+int __node_distance(int a, int b)
+{
+	return mode->distance ? mode->distance(a, b) : 0;
+}
+
+int numa_debug_enabled;
+
+/*
+ * alloc_node_data() - Allocate node data
+ */
+static __init pg_data_t *alloc_node_data(void)
+{
+	pg_data_t *res;
+
+	res = (pg_data_t *) memblock_alloc(sizeof(pg_data_t), 1);
+	if (!res)
+		panic("Could not allocate memory for node data!\n");
+	memset(res, 0, sizeof(pg_data_t));
+	return res;
+}
+
+/*
+ * numa_setup_memory() - Assign bootmem to nodes
+ *
+ * The memory is first added to memblock without any respect to nodes.
+ * This is fixed before remaining memblock memory is handed over to the
+ * buddy allocator.
+ * An important side effect is that large bootmem allocations might easily
+ * cross node boundaries, which can be needed for large allocations with
+ * smaller memory stripes in each node (i.e. when using NUMA emulation).
+ *
+ * Memory defines nodes:
+ * Therefore this routine also sets the nodes online with memory.
+ */
+static void __init numa_setup_memory(void)
+{
+	unsigned long cur_base, align, end_of_dram;
+	int nid = 0;
+
+	end_of_dram = memblock_end_of_DRAM();
+	align = mode->align ? mode->align() : ULONG_MAX;
+
+	/*
+	 * Step through all available memory and assign it to the nodes
+	 * indicated by the mode implementation.
+	 * All nodes which are seen here will be set online.
+	 */
+	cur_base = 0;
+	do {
+		nid = numa_pfn_to_nid(PFN_DOWN(cur_base));
+		node_set_online(nid);
+		memblock_set_node(cur_base, align, &memblock.memory, nid);
+		cur_base += align;
+	} while (cur_base < end_of_dram);
+
+	/* Allocate and fill out node_data */
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		NODE_DATA(nid) = alloc_node_data();
+
+	for_each_online_node(nid) {
+		unsigned long start_pfn, end_pfn;
+		unsigned long t_start, t_end;
+		int i;
+
+		start_pfn = ULONG_MAX;
+		end_pfn = 0;
+		for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) {
+			if (t_start < start_pfn)
+				start_pfn = t_start;
+			if (t_end > end_pfn)
+				end_pfn = t_end;
+		}
+		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
+		NODE_DATA(nid)->node_id = nid;
+	}
+}
+
+/*
+ * numa_setup() - Earliest initialization
+ *
+ * Assign the mode and call the mode's setup routine.
+ */
+void __init numa_setup(void)
+{
+	pr_info("NUMA mode: %s\n", mode->name);
+	if (mode->setup)
+		mode->setup();
+	numa_setup_memory();
+	memblock_dump_all();
+}
+
+
+/*
+ * numa_init_early() - Initialization initcall
+ *
+ * This runs when only one CPU is online and before the first
+ * topology update is called for by the scheduler.
+ */
+static int __init numa_init_early(void)
+{
+	/* Attach all possible CPUs to node 0 for now. */
+	cpumask_copy(&node_to_cpumask_map[0], cpu_possible_mask);
+	return 0;
+}
+early_initcall(numa_init_early);
+
+/*
+ * numa_init_late() - Initialization initcall
+ *
+ * Register NUMA nodes.
+ */
+static int __init numa_init_late(void)
+{
+	int nid;
+
+	for_each_online_node(nid)
+		register_one_node(nid);
+	return 0;
+}
+device_initcall(numa_init_late);
+
+static int __init parse_debug(char *parm)
+{
+	numa_debug_enabled = 1;
+	return 0;
+}
+early_param("numa_debug", parse_debug);
+
+static int __init parse_numa(char *parm)
+{
+	if (strcmp(parm, numa_mode_plain.name) == 0)
+		mode = &numa_mode_plain;
+#ifdef CONFIG_NUMA_EMU
+	if (strcmp(parm, numa_mode_emu.name) == 0)
+		mode = &numa_mode_emu;
+#endif
+	return 0;
+}
+early_param("numa", parse_numa);
diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h
new file mode 100644
index 000000000..08953b0b1
--- /dev/null
+++ b/arch/s390/numa/numa_mode.h
@@ -0,0 +1,24 @@
+/*
+ * NUMA support for s390
+ *
+ * Define declarations used for communication between NUMA mode
+ * implementations and NUMA core functionality.
+ *
+ * Copyright IBM Corp. 2015
+ */
+#ifndef __S390_NUMA_MODE_H
+#define __S390_NUMA_MODE_H
+
+struct numa_mode {
+	char *name;				/* Name of mode */
+	void (*setup)(void);			/* Initizalize mode */
+	void (*update_cpu_topology)(void);	/* Called by topology code */
+	int (*__pfn_to_nid)(unsigned long pfn);	/* PFN to node ID */
+	unsigned long (*align)(void);		/* Minimum node alignment */
+	int (*distance)(int a, int b);		/* Distance between two nodes */
+};
+
+extern const struct numa_mode numa_mode_plain;
+extern const struct numa_mode numa_mode_emu;
+
+#endif /* __S390_NUMA_MODE_H */
diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c
new file mode 100644
index 000000000..902d350d8
--- /dev/null
+++ b/arch/s390/numa/toptree.c
@@ -0,0 +1,342 @@
+/*
+ * NUMA support for s390
+ *
+ * A tree structure used for machine topology mangling
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpumask.h>
+#include <linux/list.h>
+#include <linux/list_sort.h>
+#include <linux/slab.h>
+#include <asm/numa.h>
+
+#include "toptree.h"
+
+/**
+ * toptree_alloc - Allocate and initialize a new tree node.
+ * @level: The node's vertical level; level 0 contains the leaves.
+ * @id: ID number, explicitly not unique beyond scope of node's siblings
+ *
+ * Allocate a new tree node and initialize it.
+ *
+ * RETURNS:
+ * Pointer to the new tree node or NULL on error
+ */
+struct toptree *toptree_alloc(int level, int id)
+{
+	struct toptree *res = kzalloc(sizeof(struct toptree), GFP_KERNEL);
+
+	if (!res)
+		return res;
+
+	INIT_LIST_HEAD(&res->children);
+	INIT_LIST_HEAD(&res->sibling);
+	cpumask_clear(&res->mask);
+	res->level = level;
+	res->id = id;
+	return res;
+}
+
+/**
+ * toptree_remove - Remove a tree node from a tree
+ * @cand: Pointer to the node to remove
+ *
+ * The node is detached from its parent node. The parent node's
+ * masks will be updated to reflect the loss of the child.
+ */
+static void toptree_remove(struct toptree *cand)
+{
+	struct toptree *oldparent;
+
+	list_del_init(&cand->sibling);
+	oldparent = cand->parent;
+	cand->parent = NULL;
+	toptree_update_mask(oldparent);
+}
+
+/**
+ * toptree_free - discard a tree node
+ * @cand: Pointer to the tree node to discard
+ *
+ * Checks if @cand is attached to a parent node. Detaches it
+ * cleanly using toptree_remove. Possible children are freed
+ * recursively. In the end @cand itself is freed.
+ */
+void toptree_free(struct toptree *cand)
+{
+	struct toptree *child, *tmp;
+
+	if (cand->parent)
+		toptree_remove(cand);
+	toptree_for_each_child_safe(child, tmp, cand)
+		toptree_free(child);
+	kfree(cand);
+}
+
+/**
+ * toptree_update_mask - Update node bitmasks
+ * @cand: Pointer to a tree node
+ *
+ * The node's cpumask will be updated by combining all children's
+ * masks. Then toptree_update_mask is called recursively for the
+ * parent if applicable.
+ *
+ * NOTE:
+ * This must not be called on leaves. If called on a leaf, its
+ * CPU mask is cleared and lost.
+ */
+void toptree_update_mask(struct toptree *cand)
+{
+	struct toptree *child;
+
+	cpumask_clear(&cand->mask);
+	list_for_each_entry(child, &cand->children, sibling)
+		cpumask_or(&cand->mask, &cand->mask, &child->mask);
+	if (cand->parent)
+		toptree_update_mask(cand->parent);
+}
+
+/**
+ * toptree_insert - Insert a tree node into tree
+ * @cand: Pointer to the node to insert
+ * @target: Pointer to the node to which @cand will added as a child
+ *
+ * Insert a tree node into a tree. Masks will be updated automatically.
+ *
+ * RETURNS:
+ * 0 on success, -1 if NULL is passed as argument or the node levels
+ * don't fit.
+ */
+static int toptree_insert(struct toptree *cand, struct toptree *target)
+{
+	if (!cand || !target)
+		return -1;
+	if (target->level != (cand->level + 1))
+		return -1;
+	list_add_tail(&cand->sibling, &target->children);
+	cand->parent = target;
+	toptree_update_mask(target);
+	return 0;
+}
+
+/**
+ * toptree_move_children - Move all child nodes of a node to a new place
+ * @cand: Pointer to the node whose children are to be moved
+ * @target: Pointer to the node to which @cand's children will be attached
+ *
+ * Take all child nodes of @cand and move them using toptree_move.
+ */
+static void toptree_move_children(struct toptree *cand, struct toptree *target)
+{
+	struct toptree *child, *tmp;
+
+	toptree_for_each_child_safe(child, tmp, cand)
+		toptree_move(child, target);
+}
+
+/**
+ * toptree_unify - Merge children with same ID
+ * @cand: Pointer to node whose direct children should be made unique
+ *
+ * When mangling the tree it is possible that a node has two or more children
+ * which have the same ID. This routine merges these children into one and
+ * moves all children of the merged nodes into the unified node.
+ */
+void toptree_unify(struct toptree *cand)
+{
+	struct toptree *child, *tmp, *cand_copy;
+
+	/* Threads cannot be split, cores are not split */
+	if (cand->level < 2)
+		return;
+
+	cand_copy = toptree_alloc(cand->level, 0);
+	toptree_for_each_child_safe(child, tmp, cand) {
+		struct toptree *tmpchild;
+
+		if (!cpumask_empty(&child->mask)) {
+			tmpchild = toptree_get_child(cand_copy, child->id);
+			toptree_move_children(child, tmpchild);
+		}
+		toptree_free(child);
+	}
+	toptree_move_children(cand_copy, cand);
+	toptree_free(cand_copy);
+
+	toptree_for_each_child(child, cand)
+		toptree_unify(child);
+}
+
+/**
+ * toptree_move - Move a node to another context
+ * @cand: Pointer to the node to move
+ * @target: Pointer to the node where @cand should go
+ *
+ * In the easiest case @cand is exactly on the level below @target
+ * and will be immediately moved to the target.
+ *
+ * If @target's level is not the direct parent level of @cand,
+ * nodes for the missing levels are created and put between
+ * @cand and @target. The "stacking" nodes' IDs are taken from
+ * @cand's parents.
+ *
+ * After this it is likely to have redundant nodes in the tree
+ * which are addressed by means of toptree_unify.
+ */
+void toptree_move(struct toptree *cand, struct toptree *target)
+{
+	struct toptree *stack_target, *real_insert_point, *ptr, *tmp;
+
+	if (cand->level + 1 == target->level) {
+		toptree_remove(cand);
+		toptree_insert(cand, target);
+		return;
+	}
+
+	real_insert_point = NULL;
+	ptr = cand;
+	stack_target = NULL;
+
+	do {
+		tmp = stack_target;
+		stack_target = toptree_alloc(ptr->level + 1,
+					     ptr->parent->id);
+		toptree_insert(tmp, stack_target);
+		if (!real_insert_point)
+			real_insert_point = stack_target;
+		ptr = ptr->parent;
+	} while (stack_target->level < (target->level - 1));
+
+	toptree_remove(cand);
+	toptree_insert(cand, real_insert_point);
+	toptree_insert(stack_target, target);
+}
+
+/**
+ * toptree_get_child - Access a tree node's child by its ID
+ * @cand: Pointer to tree node whose child is to access
+ * @id: The desired child's ID
+ *
+ * @cand's children are searched for a child with matching ID.
+ * If no match can be found, a new child with the desired ID
+ * is created and returned.
+ */
+struct toptree *toptree_get_child(struct toptree *cand, int id)
+{
+	struct toptree *child;
+
+	toptree_for_each_child(child, cand)
+		if (child->id == id)
+			return child;
+	child = toptree_alloc(cand->level-1, id);
+	toptree_insert(child, cand);
+	return child;
+}
+
+/**
+ * toptree_first - Find the first descendant on specified level
+ * @context: Pointer to tree node whose descendants are to be used
+ * @level: The level of interest
+ *
+ * RETURNS:
+ * @context's first descendant on the specified level, or NULL
+ * if there is no matching descendant
+ */
+struct toptree *toptree_first(struct toptree *context, int level)
+{
+	struct toptree *child, *tmp;
+
+	if (context->level == level)
+		return context;
+
+	if (!list_empty(&context->children)) {
+		list_for_each_entry(child, &context->children, sibling) {
+			tmp = toptree_first(child, level);
+			if (tmp)
+				return tmp;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * toptree_next_sibling - Return next sibling
+ * @cur: Pointer to a tree node
+ *
+ * RETURNS:
+ * If @cur has a parent and is not the last in the parent's children list,
+ * the next sibling is returned. Or NULL when there are no siblings left.
+ */
+static struct toptree *toptree_next_sibling(struct toptree *cur)
+{
+	if (cur->parent == NULL)
+		return NULL;
+
+	if (cur == list_last_entry(&cur->parent->children,
+				   struct toptree, sibling))
+		return NULL;
+	return (struct toptree *) list_next_entry(cur, sibling);
+}
+
+/**
+ * toptree_next - Tree traversal function
+ * @cur: Pointer to current element
+ * @context: Pointer to the root node of the tree or subtree to
+ * be traversed.
+ * @level: The level of interest.
+ *
+ * RETURNS:
+ * Pointer to the next node on level @level
+ * or NULL when there is no next node.
+ */
+struct toptree *toptree_next(struct toptree *cur, struct toptree *context,
+			     int level)
+{
+	struct toptree *cur_context, *tmp;
+
+	if (!cur)
+		return NULL;
+
+	if (context->level == level)
+		return NULL;
+
+	tmp = toptree_next_sibling(cur);
+	if (tmp != NULL)
+		return tmp;
+
+	cur_context = cur;
+	while (cur_context->level < context->level - 1) {
+		/* Step up */
+		cur_context = cur_context->parent;
+		/* Step aside */
+		tmp = toptree_next_sibling(cur_context);
+		if (tmp != NULL) {
+			/* Step down */
+			tmp = toptree_first(tmp, level);
+			if (tmp != NULL)
+				return tmp;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * toptree_count - Count descendants on specified level
+ * @context: Pointer to node whose descendants are to be considered
+ * @level: Only descendants on the specified level will be counted
+ *
+ * RETURNS:
+ * Number of descendants on the specified level
+ */
+int toptree_count(struct toptree *context, int level)
+{
+	struct toptree *cur;
+	int cnt = 0;
+
+	toptree_for_each(cur, context, level)
+		cnt++;
+	return cnt;
+}
diff --git a/arch/s390/numa/toptree.h b/arch/s390/numa/toptree.h
new file mode 100644
index 000000000..bdf502027
--- /dev/null
+++ b/arch/s390/numa/toptree.h
@@ -0,0 +1,60 @@
+/*
+ * NUMA support for s390
+ *
+ * A tree structure used for machine topology mangling
+ *
+ * Copyright IBM Corp. 2015
+ */
+#ifndef S390_TOPTREE_H
+#define S390_TOPTREE_H
+
+#include <linux/cpumask.h>
+#include <linux/list.h>
+
+struct toptree {
+	int level;
+	int id;
+	cpumask_t mask;
+	struct toptree *parent;
+	struct list_head sibling;
+	struct list_head children;
+};
+
+struct toptree *toptree_alloc(int level, int id);
+void toptree_free(struct toptree *cand);
+void toptree_update_mask(struct toptree *cand);
+void toptree_unify(struct toptree *cand);
+struct toptree *toptree_get_child(struct toptree *cand, int id);
+void toptree_move(struct toptree *cand, struct toptree *target);
+int toptree_count(struct toptree *context, int level);
+
+struct toptree *toptree_first(struct toptree *context, int level);
+struct toptree *toptree_next(struct toptree *cur, struct toptree *context,
+			     int level);
+
+#define toptree_for_each_child(child, ptree)				\
+	list_for_each_entry(child,  &ptree->children, sibling)
+
+#define toptree_for_each_child_safe(child, ptmp, ptree)			\
+	list_for_each_entry_safe(child, ptmp, &ptree->children, sibling)
+
+#define toptree_is_last(ptree)					\
+	((ptree->parent == NULL) ||				\
+	 (ptree->parent->children.prev == &ptree->sibling))
+
+#define toptree_for_each(ptree, cont, ttype)		\
+	for (ptree = toptree_first(cont, ttype);	\
+	     ptree != NULL;				\
+	     ptree = toptree_next(ptree, cont, ttype))
+
+#define toptree_for_each_safe(ptree, tmp, cont, ttype)		\
+	for (ptree = toptree_first(cont, ttype),		\
+		     tmp = toptree_next(ptree, cont, ttype);	\
+	     ptree != NULL;					\
+	     ptree = tmp,					\
+		     tmp = toptree_next(ptree, cont, ttype))
+
+#define toptree_for_each_sibling(ptree, start)			\
+	toptree_for_each(ptree, start->parent, start->level)
+
+#endif /* S390_TOPTREE_H */