summaryrefslogtreecommitdiff
path: root/kernel/power/tuxonice_cluster.c
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
commit57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree5e910f0e82173f4ef4f51111366a3f1299037a7b /kernel/power/tuxonice_cluster.c
Initial import
Diffstat (limited to 'kernel/power/tuxonice_cluster.c')
-rw-r--r--kernel/power/tuxonice_cluster.c1058
1 files changed, 1058 insertions, 0 deletions
diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c
new file mode 100644
index 000000000..cfe3383ab
--- /dev/null
+++ b/kernel/power/tuxonice_cluster.c
@@ -0,0 +1,1058 @@
+/*
+ * kernel/power/tuxonice_cluster.c
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains routines for cluster hibernation support.
+ *
+ * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
+ *
+ * How does it work?
+ *
+ * There is no 'master' node that tells everyone else what to do. All nodes
+ * send messages to the broadcast address/port, maintain a list of peers
+ * and figure out when to progress to the next step in hibernating or resuming.
+ * This makes us more fault tolerant when it comes to nodes coming and going
+ * (which may be more of an issue if we're hibernating when power supplies
+ * are being unreliable).
+ *
+ * At boot time, we start a ktuxonice thread that handles communication with
+ * other nodes. This node maintains a state machine that controls our progress
+ * through hibernating and resuming, keeping us in step with other nodes. Nodes
+ * are identified by their hw address.
+ *
+ * On startup, the node sends CLUSTER_PING on the configured interface's
+ * broadcast address, port $toi_cluster_port (see below) and begins to listen
+ * for other broadcast messages. CLUSTER_PING messages are repeated at
+ * intervals of 5 minutes, with a random offset to spread traffic out.
+ *
+ * A hibernation cycle is initiated from any node via
+ *
+ * echo > /sys/power/tuxonice/do_hibernate
+ *
+ * and (possibly) the hibernate script. At each step of the process, the node
+ * completes its work, and waits for all other nodes to signal completion of
+ * their work (or timeout) before progressing to the next step.
+ *
+ * Request/state Action before reply Possible reply Next state
+ * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP
+ * HIBERNATE|NACK INIT_0
+ *
+ * PREP prepare_image PREP|ACK IMAGE_WRITE
+ * PREP|NACK INIT_0
+ * ABORT RUNNING
+ *
+ * IO write image IO|ACK power off
+ * ABORT POST_RESUME
+ *
+ * (Boot time) check for image IMAGE|ACK RESUME_PREP
+ * (Note 1)
+ * IMAGE|NACK (Note 2)
+ *
+ * PREP prepare read image PREP|ACK IMAGE_READ
+ * PREP|NACK (As NACK_IMAGE)
+ *
+ * IO read image IO|ACK POST_RESUME
+ *
+ * POST_RESUME thaw, post-script RUNNING
+ *
+ * INIT_0 init 0
+ *
+ * Other messages:
+ *
+ * - PING: Request for all other live nodes to send a PONG. Used at startup to
+ * announce presence, when a node is suspected dead and periodically, in case
+ * segments of the network are [un]plugged.
+ *
+ * - PONG: Response to a PING.
+ *
+ * - ABORT: Request to cancel writing an image.
+ *
+ * - BYE: Notification that this node is shutting down.
+ *
+ * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
+ * nodes which are slower to start up can get state synchronised. If a node
+ * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
+ * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
+ * must invalidate its image (if any) and boot normally.
+ *
+ * Note 2: May occur when one node lost power or powered off while others
+ * hibernated. This node waits for others to complete resuming (ACK_READ)
+ * before completing its boot, so that it appears as a failed node restarting.
+ *
+ * If any node has an image, then it also has a list of nodes that hibernated
+ * in synchronisation with it. The node will wait for other nodes to appear
+ * or timeout before beginning its restoration.
+ *
+ * If a node has no image, it needs to wait, in case other nodes which do have
+ * an image are going to resume, but are taking longer to announce their
+ * presence. For this reason, the user can specify a timeout value and a number
+ * of nodes detected before we just continue. (We might want to assume in a
+ * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
+ * the remaining nodes will too. This might help in situations where some nodes
+ * are much slower to boot, or more subject to hardware failures or such like).
+ */
+
+#include <linux/suspend.h>
+#include <linux/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_io.h"
+
+#if 1 /* Debug switch: change to 0 to compile every PRINTK() call out. */
+#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
+#else
+#define PRINTK(a, b...) do { } while (0)
+#endif
+
+static int loopback_mode; /* Nonzero when the cluster device is the loopback device. */
+static int num_local_nodes = 1; /* Local nodes in use: 8 in loopback mode, otherwise 1. */
+#define MAX_LOCAL_NODES 8 /* Number of slots in node_array[]. */
+#define SADDR (loopback_mode ? b->sid : h->saddr) /* Sender id; expands using the locals 'b' and 'h' of the caller. */
+
+#define MYNAME "TuxOnIce Clustering" /* Prefix for log messages. */
+
+/*
+ * Cluster message words. MSG_ACK and MSG_NACK are reply bits that are OR-ed
+ * onto one of the other (state) values; see MSG_ACK_MASK and MSG_STATE_MASK.
+ */
+enum cluster_message {
+	MSG_ACK		= 1 << 0,
+	MSG_NACK	= 1 << 1,
+	MSG_PING	= 1 << 2,
+	MSG_ABORT	= 1 << 3,
+	MSG_BYE		= 1 << 4,
+	MSG_HIBERNATE	= 1 << 5,
+	MSG_IMAGE	= 1 << 6,
+	MSG_IO		= 1 << 7,
+	MSG_RUNNING	= 1 << 8
+};
+
+/*
+ * str_message - describe a cluster message word for log output.
+ * @message: message word, optionally combined with MSG_ACK or MSG_NACK.
+ *
+ * Returns a constant string. Combinations that are not listed below are
+ * reported through printk() and yield a generic placeholder string.
+ */
+static char *str_message(int message)
+{
+	switch (message) {
+	case MSG_PING:
+		return "Ping";
+	case MSG_ABORT:
+		return "Abort";
+	case MSG_ABORT | MSG_ACK:
+		return "Abort acked";
+	case MSG_ABORT | MSG_NACK:
+		return "Abort nacked";
+	case MSG_BYE:
+		return "Bye";
+	case MSG_BYE | MSG_ACK:
+		return "Bye acked";
+	case MSG_BYE | MSG_NACK:
+		return "Bye nacked";
+	case MSG_HIBERNATE:
+		return "Hibernate request";
+	case MSG_HIBERNATE | MSG_ACK:
+		return "Hibernate ack";
+	case MSG_HIBERNATE | MSG_NACK:
+		return "Hibernate nack";
+	case MSG_IMAGE:
+		return "Image exists?";
+	case MSG_IMAGE | MSG_ACK:
+		return "Image does exist";
+	case MSG_IMAGE | MSG_NACK:
+		return "No image here";
+	case MSG_IO:
+		return "I/O";
+	case MSG_IO | MSG_ACK:
+		return "I/O okay";
+	case MSG_IO | MSG_NACK:
+		return "I/O failed";
+	case MSG_RUNNING:
+		return "Running";
+	default:
+		break;
+	}
+
+	printk(KERN_ERR "Unrecognised message %d.\n", message);
+	return "Unrecognised message (see dmesg)";
+}
+
+#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) /* The two reply bits of a message word. */
+#define MSG_STATE_MASK (~MSG_ACK_MASK) /* Everything except the reply bits. */
+
+struct node_info { /* State of one local node; one node_array[] slot each. */
+ struct list_head member_list; /* Peers seen by this node (struct cluster_member). */
+ wait_queue_head_t member_events; /* Woken by add_update_member() after each update. */
+ spinlock_t member_list_lock; /* Taken by the helpers that update or count member_list. */
+ spinlock_t receive_lock; /* Held by toi_recv() while it handles a message for this node. */
+ int peer_count, ignored_peer_count; /* Entries on member_list; entries flagged 'ignore'. */
+ struct toi_sysfs_data sysfs_data; /* Backs the node_<n> sysfs entry made in toi_cluster_init(). */
+ enum cluster_message current_message; /* Message this node broadcasts; 0 ends its daemon. */
+};
+
+struct node_info node_array[MAX_LOCAL_NODES]; /* Indexed by local node id. */
+
+struct cluster_member { /* One peer as seen by one local node. */
+ __be32 addr; /* Peer id: the SADDR value taken from its packets. */
+ enum cluster_message message; /* Last message word received from this peer. */
+ struct list_head list; /* Link in node_info.member_list. */
+ int ignore; /* Set by set_ignore() when the peer NACKs a HIBERNATE, IMAGE or IO message. */
+};
+
+#define toi_cluster_port_send 3501 /* UDP source port of cluster messages. */
+#define toi_cluster_port_recv 3502 /* UDP destination port of cluster messages. */
+
+static struct net_device *net_dev; /* Cluster interface, chosen by toi_cluster_open_iface(). */
+static struct toi_module_ops toi_cluster_ops; /* Defined further down. */
+
+static int toi_recv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type toi_cluster_packet_type = { /* Routes received IPv4 packets to toi_recv(). */
+ .type = __constant_htons(ETH_P_IP),
+ .func = toi_recv,
+};
+
+struct toi_pkt { /* Cluster message as built by toi_send_if() and parsed by toi_recv() */
+ struct iphdr iph; /* IP header */
+ struct udphdr udph; /* UDP header */
+ u8 htype; /* HW address type */
+ u8 hlen; /* HW address length */
+ __be32 xid; /* Transaction ID */
+ __be16 secs; /* Seconds since we started */
+ __be16 flags; /* Just what it says */
+ u8 hw_addr[16]; /* Sender's HW address */
+ u16 message; /* Message (stored in host byte order by toi_send_if()) */
+ unsigned long sid; /* Source ID for loopback testing. NOTE(review): width differs between 32- and 64-bit builds - confirm mixed clusters are not expected */
+};
+
+static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; /* Name of the device to use; empty means disabled. */
+
+static int added_pack; /* Nonzero while toi_cluster_packet_type is registered. */
+
+static int others_have_image; /* Peer count computed in cluster_startup(). */
+
+/* Key used to allow multiple clusters on the same lan */
+static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
+static char pre_hibernate_script[255] =
+ CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
+static char post_hibernate_script[255] =
+ CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;
+
+/* Timeouts, in jiffies */
+static unsigned long continue_delay = 5 * HZ; /* Longest time_to_continue() waits for peers. */
+static unsigned long cluster_message_timeout = 3 * HZ; /* Interval between a daemon's broadcasts. */
+
+/* === Membership list === */
+
+/*
+ * print_member_info - log every peer known to one local node.
+ * @index: node_array slot whose member list is dumped.
+ *
+ * The list is walked without taking member_list_lock.
+ */
+static void print_member_info(int index)
+{
+	struct list_head *members = &node_array[index].member_list;
+	struct cluster_member *peer;
+
+	printk(KERN_INFO "==> Dumping node %d.\n", index);
+
+	list_for_each_entry(peer, members, list) {
+		printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n",
+				NIPQUAD(peer->addr),
+				str_message(peer->message),
+				peer->ignore ? "(Ignored)" : "");
+	}
+
+	printk(KERN_INFO "== Done ==\n");
+}
+
+/*
+ * __find_member - look a peer up on a local node's member list.
+ * @index: node_array slot to search.
+ * @addr: peer id to look for.
+ *
+ * Returns the matching entry, or NULL when the peer is not on the list.
+ */
+static struct cluster_member *__find_member(int index, __be32 addr)
+{
+	struct cluster_member *peer;
+
+	list_for_each_entry(peer, &node_array[index].member_list, list)
+		if (peer->addr == addr)
+			return peer;
+
+	return NULL;
+}
+
+/*
+ * set_ignore - flag a peer as ignored, once.
+ * @index: local node doing the ignoring.
+ * @addr: peer id, used for the log message only.
+ * @this: the peer's member entry.
+ *
+ * The first call for a peer sets its ignore flag and bumps the node's
+ * ignored_peer_count; later calls only log.
+ */
+static void set_ignore(int index, __be32 addr, struct cluster_member *this)
+{
+	if (!this->ignore) {
+		PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n",
+				index, NIPQUAD(addr));
+		this->ignore = 1;
+		node_array[index].ignored_peer_count++;
+	} else {
+		PRINTK("Node %d already ignoring %d.%d.%d.%d.\n",
+				index, NIPQUAD(addr));
+	}
+}
+
+/*
+ * __add_update_member - record the latest message seen from a peer.
+ * @index: local node (node_array slot) whose member list is updated.
+ * @addr: peer id.
+ * @message: full message word received, including any ACK/NACK bit.
+ *
+ * A peer that NACKs a HIBERNATE, IMAGE or IO message is flagged as ignored.
+ *
+ * add_update_member() calls this with member_list_lock held and interrupts
+ * disabled, so nothing here may sleep - hence the GFP_ATOMIC allocation.
+ *
+ * Returns 0 if the peer was already known, 1 if a new entry was added and
+ * -1 if no memory was available for a new entry.
+ */
+static int __add_update_member(int index, __be32 addr, int message)
+{
+	struct node_info *node = &node_array[index];
+	struct cluster_member *this;
+	int refused = (message & MSG_NACK) &&
+		(message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO));
+
+	this = __find_member(index, addr);
+	if (this) {
+		if (this->message != message) {
+			this->message = message;
+			if (refused)
+				set_ignore(index, addr, this);
+			PRINTK("Node %d sees node %d.%d.%d.%d now sending "
+					"%s.\n", index, NIPQUAD(addr),
+					str_message(message));
+			wake_up(&node->member_events);
+		}
+		return 0;
+	}
+
+	this = toi_kzalloc(36, sizeof(*this), GFP_ATOMIC);
+	if (!this)
+		return -1;
+
+	this->addr = addr;
+	this->message = message;
+	this->ignore = 0;
+	INIT_LIST_HEAD(&this->list);
+
+	node->peer_count++;
+
+	PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
+			NIPQUAD(addr), str_message(message));
+
+	if (refused)
+		set_ignore(index, addr, this);
+	list_add_tail(&this->list, &node->member_list);
+	return 1;
+}
+
+/*
+ * add_update_member - locked wrapper around __add_update_member().
+ * @index: local node whose member list is updated.
+ * @addr: peer id.
+ * @message: full message word received from the peer.
+ *
+ * After the update the member list is dumped to the log and anybody
+ * sleeping on the node's member_events queue is woken.
+ *
+ * Returns whatever __add_update_member() returned.
+ */
+static int add_update_member(int index, __be32 addr, int message)
+{
+	struct node_info *node = &node_array[index];
+	unsigned long irq_flags;
+	int outcome;
+
+	spin_lock_irqsave(&node->member_list_lock, irq_flags);
+	outcome = __add_update_member(index, addr, message);
+	spin_unlock_irqrestore(&node->member_list_lock, irq_flags);
+
+	print_member_info(index);
+	wake_up(&node->member_events);
+
+	return outcome;
+}
+
+/*
+ * del_member - forget a peer.
+ * @index: local node whose member list is updated.
+ * @addr: peer id to remove.
+ *
+ * Does nothing if the peer is not on the list. Both per-node counters are
+ * kept in step with the list: peers_in_message() subtracts
+ * ignored_peer_count from peer_count, so removing an ignored peer without
+ * also dropping ignored_peer_count would make that result too small.
+ */
+static void del_member(int index, __be32 addr)
+{
+	struct node_info *node = &node_array[index];
+	struct cluster_member *this;
+	unsigned long flags;
+
+	spin_lock_irqsave(&node->member_list_lock, flags);
+	this = __find_member(index, addr);
+
+	if (this) {
+		if (this->ignore)
+			node->ignored_peer_count--;
+		node->peer_count--;
+		list_del_init(&this->list);
+		toi_kfree(36, this, sizeof(*this));
+	}
+
+	spin_unlock_irqrestore(&node->member_list_lock, flags);
+}
+
+/* === Message transmission === */
+
+static void toi_send_if(int message, unsigned long my_id); /* Defined after toi_recv(), which calls it. */
+
+/*
+ * toi_recv - packet handler for TuxOnIce cluster messages.
+ * @skb: received packet.
+ * @dev: device the packet arrived on.
+ * @pt: packet type the handler was registered with (unused).
+ * @orig_dev: unused.
+ *
+ * The handler is registered for ETH_P_IP, so everything that is not an
+ * unfragmented UDP datagram from toi_cluster_port_send to
+ * toi_cluster_port_recv on net_dev, long enough to hold a struct toi_pkt,
+ * is dropped. For an accepted message every active local node (other than
+ * the sender itself in loopback mode) records the sender's message and
+ * broadcasts its own: a new one if the received state calls for a reply,
+ * otherwise a repeat of its current one when the sender's message carried
+ * neither ACK nor NACK.
+ *
+ * The skb is always consumed. Returns NET_RX_DROP if a private copy of a
+ * shared skb could not be obtained, 0 otherwise.
+ */
+static int toi_recv(struct sk_buff *skb, struct net_device *dev,
+		struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct toi_pkt *b;
+	struct iphdr *h;
+	unsigned int udp_len;
+	int result, index, state, ack;
+	__be32 addr;	/* 32 bits wide so NIPQUAD() reads the right bytes */
+
+	/* Perform verifications before taking the lock. */
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		goto drop;
+
+	if (dev != net_dev)
+		goto drop;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		return NET_RX_DROP;
+
+	if (!pskb_may_pull(skb,
+				sizeof(struct iphdr) +
+				sizeof(struct udphdr)))
+		goto drop;
+
+	b = (struct toi_pkt *)skb_network_header(skb);
+	h = &b->iph;
+
+	if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
+		goto drop;
+
+	/* Fragments are not supported */
+	if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
+		if (net_ratelimit())
+			printk(KERN_ERR "TuxOnIce: Ignoring fragmented "
+					"cluster message.\n");
+		goto drop;
+	}
+
+	if (skb->len < ntohs(h->tot_len))
+		goto drop;
+
+	if (ip_fast_csum((char *) h, h->ihl))
+		goto drop;
+
+	if (b->udph.source != htons(toi_cluster_port_send) ||
+	    b->udph.dest != htons(toi_cluster_port_recv))
+		goto drop;
+
+	udp_len = ntohs(b->udph.len);
+	if (ntohs(h->tot_len) < udp_len + sizeof(struct iphdr))
+		goto drop;
+
+	/*
+	 * The message and sid fields sit at the end of struct toi_pkt. Refuse
+	 * datagrams too short to contain them rather than reading past the
+	 * end of the packet data.
+	 */
+	if (udp_len < sizeof(struct toi_pkt) - sizeof(struct iphdr))
+		goto drop;
+
+	/* Ok the front looks good, make sure we can get at the rest. */
+	if (!pskb_may_pull(skb, skb->len))
+		goto drop;
+
+	/* The pull may have moved the data, so fetch the pointers again. */
+	b = (struct toi_pkt *)skb_network_header(skb);
+	h = &b->iph;
+
+	addr = SADDR;
+	PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
+			str_message(b->message), NIPQUAD(addr));
+
+	state = b->message & MSG_STATE_MASK;
+	ack = b->message & MSG_ACK_MASK;
+
+	for (index = 0; index < num_local_nodes; index++) {
+		int old_message = node_array[index].current_message;
+		int new_message = old_message;
+
+		if (index == SADDR || !old_message) {
+			PRINTK("Ignoring node %d (offline or self).\n", index);
+			continue;
+		}
+
+		/* One message at a time, please. */
+		spin_lock(&node_array[index].receive_lock);
+
+		result = add_update_member(index, SADDR, b->message);
+		if (result == -1) {
+			printk(KERN_INFO "Failed to add new cluster member "
+					NIPQUAD_FMT ".\n",
+					NIPQUAD(addr));
+			goto unlock;
+		}
+
+		switch (state) {
+		case MSG_PING:
+		case MSG_ABORT:
+		case MSG_BYE:
+		case MSG_RUNNING:
+			/* Recorded above; no reply of our own. */
+			break;
+		case MSG_HIBERNATE:
+			/* Can I hibernate? */
+			new_message = MSG_HIBERNATE |
+				((index & 1) ? MSG_NACK : MSG_ACK);
+			break;
+		case MSG_IMAGE:
+			/* Can I resume? */
+			new_message = MSG_IMAGE |
+				((index & 1) ? MSG_NACK : MSG_ACK);
+			if (new_message != old_message)
+				printk(KERN_ERR "Setting whether I can resume "
+						"to %d.\n", new_message);
+			break;
+		case MSG_IO:
+			new_message = MSG_IO | MSG_ACK;
+			break;
+		default:
+			if (net_ratelimit())
+				printk(KERN_ERR "Unrecognised TuxOnIce cluster"
+					" message %d from " NIPQUAD_FMT ".\n",
+					b->message, NIPQUAD(addr));
+			break;
+		}
+
+		if (old_message != new_message) {
+			node_array[index].current_message = new_message;
+			printk(KERN_INFO ">>> Sending new message for node "
+					"%d.\n", index);
+			toi_send_if(new_message, index);
+		} else if (!ack) {
+			printk(KERN_INFO ">>> Resending message for node %d.\n",
+					index);
+			toi_send_if(new_message, index);
+		}
+unlock:
+		spin_unlock(&node_array[index].receive_lock);
+	}
+
+drop:
+	/* Throw the packet out. */
+	kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * toi_send_if - broadcast one cluster message on the cluster interface.
+ * @message: message word to send.
+ * @my_id: id of the sending local node; stored in the packet's sid field,
+ *	which receivers use as the sender id in loopback mode.
+ *
+ * Builds a struct toi_pkt addressed to the IP and link-level broadcast
+ * addresses and queues it on net_dev. Failures are only logged. Nothing is
+ * sent while no interface has been opened.
+ */
+static void toi_send_if(int message, unsigned long my_id)
+{
+	struct sk_buff *skb;
+	struct toi_pkt *b;
+	struct iphdr *h;
+	size_t hw_len;
+	int hh_len;
+
+	if (!net_dev)
+		return;
+
+	hh_len = LL_RESERVED_SPACE(net_dev);
+
+	/*
+	 * Allocate packet. toi_recv() calls us with a spinlock held, so the
+	 * allocation must not sleep.
+	 */
+	skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_ATOMIC);
+	if (!skb)
+		return;
+	skb_reserve(skb, hh_len);
+	b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt));
+	memset(b, 0, sizeof(struct toi_pkt));
+
+	/* Construct IP header */
+	skb_reset_network_header(skb);
+	h = ip_hdr(skb);
+	h->version = 4;
+	h->ihl = 5;
+	h->tot_len = htons(sizeof(struct toi_pkt));
+	h->frag_off = htons(IP_DF);
+	h->ttl = 64;
+	h->protocol = IPPROTO_UDP;
+	h->daddr = htonl(INADDR_BROADCAST);
+	h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+	/* Construct UDP header */
+	b->udph.source = htons(toi_cluster_port_send);
+	b->udph.dest = htons(toi_cluster_port_recv);
+	b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
+	/* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+	/* Construct message */
+	b->message = message;
+	b->sid = my_id;
+	b->htype = net_dev->type;	/* only the low 8 bits fit */
+	/* Never copy more of the hardware address than hw_addr[] can hold. */
+	hw_len = min_t(size_t, net_dev->addr_len, sizeof(b->hw_addr));
+	b->hlen = hw_len;
+	memcpy(b->hw_addr, net_dev->dev_addr, hw_len);
+	b->secs = htons(3); /* 3 seconds */
+
+	/* Chain packet down the line... */
+	skb->dev = net_dev;
+	skb->protocol = htons(ETH_P_IP);
+	if (dev_hard_header(skb, net_dev, ntohs(skb->protocol),
+			net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) {
+		/* The skb was never queued, so it is still ours to free. */
+		kfree_skb(skb);
+		printk(KERN_INFO "E");
+		return;
+	}
+
+	if (dev_queue_xmit(skb) < 0)
+		printk(KERN_INFO "E");
+}
+
+/* ========================================= */
+
+/* kTOICluster */
+
+static atomic_t num_cluster_threads; /* Number of kTOICluster daemons currently running. */
+static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); /* Daemons sleep here between sends; woken as each daemon exits. */
+
+/*
+ * kTOICluster - per-node daemon that keeps broadcasting the node's message.
+ * @data: initial message word, passed as an integer cast to a pointer.
+ *
+ * The daemon takes the next free node id, stores the initial message in
+ * that node's current_message and rebroadcasts current_message every
+ * cluster_message_timeout jiffies until it becomes 0. It then sends MSG_BYE
+ * and wakes clusterd_events so that kill_clusterd() can see it has gone.
+ *
+ * Always returns 0.
+ */
+static int kTOICluster(void *data)
+{
+	unsigned long my_id;
+	int message;
+
+	my_id = atomic_add_return(1, &num_cluster_threads) - 1;
+	if (my_id >= MAX_LOCAL_NODES) {
+		/* No node_array slot left for this daemon. */
+		printk(KERN_ERR MYNAME ": Too many cluster daemons.\n");
+		atomic_dec(&num_cluster_threads);
+		wake_up(&clusterd_events);
+		return 0;
+	}
+
+	node_array[my_id].current_message = (unsigned long) data;
+
+	PRINTK("kTOICluster daemon %lu starting.\n", my_id);
+
+	current->flags |= PF_NOFREEZE;
+
+	while ((message = node_array[my_id].current_message)) {
+		toi_send_if(message, my_id);
+		/*
+		 * sleep_on_timeout() is racy and no longer exists in current
+		 * kernels; wait for the timeout or for our message to be
+		 * cleared instead.
+		 */
+		wait_event_timeout(clusterd_events,
+				!node_array[my_id].current_message,
+				cluster_message_timeout);
+		PRINTK("Link state %lu is %d.\n", my_id,
+				node_array[my_id].current_message);
+	}
+
+	toi_send_if(MSG_BYE, my_id);
+	atomic_dec(&num_cluster_threads);
+	wake_up(&clusterd_events);
+
+	PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+/*
+ * kill_clusterd - stop every cluster daemon and wait until all have exited.
+ *
+ * Clearing a node's current_message is what makes its daemon leave its loop.
+ */
+static void kill_clusterd(void)
+{
+	int node;
+
+	for (node = 0; node < num_local_nodes; node++) {
+		if (!node_array[node].current_message)
+			continue;
+
+		PRINTK("Seeking to kill clusterd %d.\n", node);
+		node_array[node].current_message = 0;
+	}
+
+	wait_event(clusterd_events,
+			!atomic_read(&num_cluster_threads));
+	PRINTK("All cluster daemons have exited.\n");
+}
+
+/*
+ * peers_not_in_message - count peers that are not sending a given message.
+ * @index: local node whose member list is examined.
+ * @message: message being sought.
+ * @precise: if nonzero compare the peer's whole message word, otherwise
+ *	compare only its state bits (ACK/NACK masked off).
+ *
+ * Ignored peers are skipped.
+ *
+ * Returns the number of remaining peers whose message differs from @message.
+ */
+static int peers_not_in_message(int index, int message, int precise)
+{
+	struct cluster_member *this;
+	unsigned long flags;
+	int result = 0;
+
+	spin_lock_irqsave(&node_array[index].member_list_lock, flags);
+	list_for_each_entry(this, &node_array[index].member_list, list) {
+		int seen;
+
+		if (this->ignore)
+			continue;
+
+		PRINTK("Peer %d.%d.%d.%d sending %s. "
+			"Seeking %s.\n",
+			NIPQUAD(this->addr),
+			str_message(this->message), str_message(message));
+
+		seen = precise ? this->message :
+			(this->message & MSG_STATE_MASK);
+		if (seen != message)
+			result++;
+	}
+	spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
+	/* The count is of peers that differ, so say so in the log. */
+	PRINTK("%d peers not in sought message.\n", result);
+	return result;
+}
+
+/*
+ * reset_ignored - stop ignoring every peer of a local node.
+ * @index: local node whose member list is reset.
+ */
+static void reset_ignored(int index)
+{
+	struct node_info *node = &node_array[index];
+	struct cluster_member *peer;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&node->member_list_lock, irq_flags);
+	list_for_each_entry(peer, &node->member_list, list) {
+		peer->ignore = 0;
+	}
+	node->ignored_peer_count = 0;
+	spin_unlock_irqrestore(&node->member_list_lock, irq_flags);
+}
+
+/*
+ * peers_in_message - count peers that are sending a given message.
+ * @index: local node whose member list is examined.
+ * @message: message being sought.
+ * @precise: passed through to peers_not_in_message().
+ *
+ * Returns the number of non-ignored peers minus those that
+ * peers_not_in_message() reports as differing.
+ */
+static int peers_in_message(int index, int message, int precise)
+{
+	int considered = node_array[index].peer_count -
+		node_array[index].ignored_peer_count;
+
+	return considered - peers_not_in_message(index, message, precise);
+}
+
+/*
+ * time_to_continue - wait condition: may this node move to its next step?
+ * @index: local node that is waiting.
+ * @start: jiffies value at which the wait began.
+ * @message: message the peers are expected to answer.
+ *
+ * Returns 1 once no non-ignored peer is in a different state and none is
+ * still sending the bare, unanswered @message, or once continue_delay
+ * jiffies have passed since @start. Returns 0 otherwise.
+ */
+static int time_to_continue(int index, unsigned long start, int message)
+{
+	int elsewhere = peers_not_in_message(index, message, 0);
+	int unanswered = peers_in_message(index, message, 1);
+
+	PRINTK("First part returns %d, second returns %d.\n", elsewhere,
+			unanswered);
+
+	if (elsewhere == 0 && unanswered == 0) {
+		PRINTK("All peers answered message %d.\n",
+				message);
+		return 1;
+	}
+
+	if (!time_after(jiffies, start + continue_delay)) {
+		PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies,
+				start + continue_delay);
+		return 0;
+	}
+
+	PRINTK("Timeout reached.\n");
+	return 1;
+}
+
+/*
+ * toi_initiate_cluster_hibernate - run a hibernation cycle as initiator.
+ *
+ * Prepares the image, broadcasts MSG_HIBERNATE as node 0 and waits for the
+ * peers. In a freezer test the cycle is then aborted and cleaned up;
+ * otherwise MSG_IO is broadcast, the image is saved and, if still
+ * hibernating, the machine is powered down. A failing step ends the
+ * function at once.
+ */
+void toi_initiate_cluster_hibernate(void)
+{
+	unsigned long began;
+
+	if (do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE))
+		return;
+
+	toi_send_if(MSG_HIBERNATE, 0);
+
+	began = jiffies;
+	wait_event(node_array[0].member_events,
+			time_to_continue(0, began, MSG_HIBERNATE));
+
+	if (!test_action_state(TOI_FREEZER_TEST)) {
+		toi_send_if(MSG_IO, 0);
+
+		if (do_toi_step(STEP_HIBERNATE_SAVE_IMAGE))
+			return;
+
+		/* This code runs at resume time too! */
+		if (toi_in_hibernate)
+			do_toi_step(STEP_HIBERNATE_POWERDOWN);
+		return;
+	}
+
+	/* Freezer test only: call the cycle off again. */
+	toi_send_if(MSG_ABORT, 0);
+
+	began = jiffies;
+	wait_event(node_array[0].member_events,
+			time_to_continue(0, began, MSG_RUNNING));
+
+	do_toi_step(STEP_QUIET_CLEANUP);
+}
+
+/* toi_cluster_print_debug_stats
+ *
+ * Description:	Write a one line summary of the cluster configuration into
+ *		a buffer, for inclusion in debugging output.
+ * Arguments:	buffer:	Where to write the text.
+ *		size:	Space available in the buffer.
+ * Returns:	Number of characters written, as reported by scnprintf().
+ */
+static int toi_cluster_print_debug_stats(char *buffer, int size)
+{
+	if (!strlen(toi_cluster_iface))
+		return scnprintf(buffer, size,
+				"- Cluster support is disabled.\n");
+
+	return scnprintf(buffer, size,
+			"- Cluster interface is '%s'.\n",
+			toi_cluster_iface);
+}
+
+/* toi_cluster_memory_needed
+ *
+ * Description:	Report how much memory this module needs while hibernating
+ *		or resuming.
+ * Returns:	Int. Always 0.
+ */
+static int toi_cluster_memory_needed(void)
+{
+	return 0;
+}
+
+/*
+ * Image header space wanted by this module: the interface name plus one
+ * more byte.
+ */
+static int toi_cluster_storage_needed(void)
+{
+	return strlen(toi_cluster_iface) + 1;
+}
+
+/* toi_cluster_save_config_info
+ *
+ * Description:	Save information needed when reloading the image at resume
+ *		time: the cluster interface name, including its terminator.
+ * Arguments:	Buffer:	Pointer to a buffer of size PAGE_SIZE.
+ * Returns:	Number of bytes used for saving our data. This is the same
+ *		amount that toi_cluster_storage_needed() asks for.
+ */
+static int toi_cluster_save_config_info(char *buffer)
+{
+	strcpy(buffer, toi_cluster_iface);
+	/* Length of the name plus its NUL - not the length of (name + 1). */
+	return strlen(toi_cluster_iface) + 1;
+}
+
+/* toi_cluster_load_config_info
+ *
+ * Description:	Reload the cluster interface name that
+ *		toi_cluster_save_config_info() stored in the image.
+ * Arguments:	Buffer:	Pointer to the start of the data.
+ *		Size:	Number of bytes that were saved.
+ *
+ * At most IFNAMSIZ - 1 characters are taken and the result is always
+ * terminated, whatever the saved data looks like.
+ */
+static void toi_cluster_load_config_info(char *buffer, int size)
+{
+	size_t len = 0;
+
+	if (size > 0)
+		len = strnlen(buffer, min_t(size_t, size, IFNAMSIZ - 1));
+
+	memcpy(toi_cluster_iface, buffer, len);
+	toi_cluster_iface[len] = '\0';
+}
+
+static void cluster_startup(void)
+{ /*
+   * Start one ktoiclusterd daemon per local node, each initially
+   * broadcasting MSG_IMAGE, wait for the peers to answer or for
+   * continue_delay to pass, then take node 0 through the resume
+   * handshake and finally set its message to MSG_RUNNING.
+   */
+ int have_image = do_check_can_resume(), i;
+ unsigned long start = jiffies, initial_message;
+ struct task_struct *p;
+
+ initial_message = MSG_IMAGE;
+
+ have_image = 1; /* NOTE(review): overrides the do_check_can_resume() result above - looks like debugging scaffolding; confirm. */
+
+ for (i = 0; i < num_local_nodes; i++) {
+ PRINTK("Starting ktoiclusterd %d.\n", i);
+ p = kthread_create(kTOICluster, (void *) initial_message,
+ "ktoiclusterd/%d", i);
+ if (IS_ERR(p)) {
+ printk(KERN_ERR "Failed to start ktoiclusterd.\n");
+ return; /* Daemons started by earlier iterations keep running. */
+ }
+
+ wake_up_process(p);
+ }
+
+ /* Wait for delay or someone else sending first message */
+ wait_event(node_array[0].member_events, time_to_continue(0, start,
+ MSG_IMAGE));
+
+ others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1); /* Peers whose message is exactly MSG_IMAGE | MSG_ACK. */
+
+ printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
+ " %d.\n", have_image ? "" : "don't ", others_have_image);
+
+ if (have_image) {
+ int result;
+
+ /* Start to resume */
+ printk(KERN_INFO " === Starting to resume === \n");
+ node_array[0].current_message = MSG_IO;
+ toi_send_if(MSG_IO, 0);
+
+ /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
+ result = 0;
+
+ if (!result) {
+ /*
+ * Atomic restore - we'll come back in the hibernation
+ * path.
+ */
+
+ /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
+ result = 0;
+
+ /* do_toi_step(STEP_QUIET_CLEANUP); */
+ }
+
+ node_array[0].current_message |= MSG_NACK; /* NOTE(review): node 0 now broadcasts MSG_IO | MSG_NACK, which str_message() names "I/O failed" - confirm intent. */
+
+ /* For debugging - disable for real life? */
+ wait_event(node_array[0].member_events,
+ time_to_continue(0, start, MSG_IO)); /* NOTE(review): 'start' still dates from function entry here; the wait below resets it first. */
+ }
+
+ if (others_have_image) {
+ /* Wait for them to resume */
+ printk(KERN_INFO "Waiting for other nodes to resume.\n");
+ start = jiffies;
+ wait_event(node_array[0].member_events,
+ time_to_continue(0, start, MSG_RUNNING));
+ if (peers_not_in_message(0, MSG_RUNNING, 0))
+ printk(KERN_INFO "Timed out while waiting for other "
+ "nodes to resume.\n");
+ }
+
+ /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
+ * as appropriate.
+ *
+ * If we don't have an image:
+ * - Wait until someone else says they have one, or conditions are met
+ * for continuing to boot (n machines or t seconds).
+ * - If anyone has an image, wait for them to resume before continuing
+ * to boot.
+ *
+ * If we have an image:
+ * - Wait until conditions are met before continuing to resume (n
+ * machines or t seconds). Send RESUME_PREP and freeze processes.
+ * NACK_PREP if freezing fails (shouldn't) and follow logic for
+ * us having no image above. On success, wait for [N]ACK_PREP from
+ * other machines. Read image (including atomic restore) until done.
+ * Wait for ACK_READ from others (should never fail). Thaw processes
+ * and do post-resume. (The section after the atomic restore is done
+ * via the code for hibernating).
+ */
+
+ node_array[0].current_message = MSG_RUNNING;
+}
+
+/* toi_cluster_open_iface
+ *
+ * Description: Prepare to use an interface.
+ */
+
+/*
+ * Look the configured interface up by name, hook toi_recv() into packet
+ * reception and start the cluster daemons.
+ *
+ * Returns 0 on success (or if the interface is already open) and -ENODEV
+ * if no device is called toi_cluster_iface.
+ *
+ * NOTE(review): no reference is taken on the device that is found -
+ * confirm the interface cannot go away while clustering is enabled.
+ */
+static int toi_cluster_open_iface(void)
+{
+	struct net_device *dev, *found = NULL;
+
+	/*
+	 * Opening twice would register the same packet_type again and start
+	 * a second set of daemons; close the interface first to change it.
+	 */
+	if (added_pack)
+		return 0;
+
+	rtnl_lock();
+
+	for_each_netdev(&init_net, dev) {
+		if (!strcmp(dev->name, toi_cluster_iface)) {
+			found = dev;
+			break;
+		}
+	}
+
+	rtnl_unlock();
+
+	/* Judge this lookup only; net_dev may still hold an earlier device. */
+	if (!found) {
+		printk(KERN_ERR MYNAME ": Device %s not found.\n",
+				toi_cluster_iface);
+		return -ENODEV;
+	}
+
+	net_dev = found;
+
+	/* Settle the mode before toi_recv() can be called. */
+	loopback_mode = (net_dev == init_net.loopback_dev);
+	num_local_nodes = loopback_mode ? MAX_LOCAL_NODES : 1;
+
+	dev_add_pack(&toi_cluster_packet_type);
+	added_pack = 1;
+
+	PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
+			loopback_mode ? "on" : "off", num_local_nodes);
+
+	cluster_startup();
+	return 0;
+}
+
+/* toi_cluster_close_iface
+ *
+ * Description: Stop using an interface.
+ */
+
+/*
+ * Stop the cluster daemons and, if the packet handler is registered,
+ * remove it again. Always returns 0.
+ */
+static int toi_cluster_close_iface(void)
+{
+	kill_clusterd();
+
+	if (!added_pack)
+		return 0;
+
+	dev_remove_pack(&toi_cluster_packet_type);
+	added_pack = 0;
+	return 0;
+}
+
+/*
+ * Side effect of writing the "enabled" sysfs entry: open or close the
+ * cluster interface to match the new value.
+ *
+ * If the interface cannot be opened, clustering is switched off again so
+ * that neither "enabled" nor TOI_CLUSTER_MODE claims a cluster that has no
+ * interface to talk on.
+ */
+static void write_side_effect(void)
+{
+	if (!toi_cluster_ops.enabled) {
+		toi_cluster_close_iface();
+		clear_toi_state(TOI_CLUSTER_MODE);
+		return;
+	}
+
+	if (toi_cluster_open_iface()) {
+		toi_cluster_ops.enabled = 0;
+		clear_toi_state(TOI_CLUSTER_MODE);
+		return;
+	}
+
+	set_toi_state(TOI_CLUSTER_MODE);
+}
+
+/* Side effect of writing a node_<n> sysfs entry: nothing to do. */
+static void node_write_side_effect(void)
+{
+	/* Intentionally empty. */
+}
+
+/*
+ * data for our sysfs entries.
+ *
+ * String lengths are taken from the buffers themselves (presumably the
+ * length argument is the size of the destination buffer - confirm against
+ * SYSFS_STRING). The script buffers are 255 bytes long, not 256. None of
+ * the string entries has a write side effect.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+	SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface,
+			sizeof(toi_cluster_iface), 0, NULL),
+	SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0,
+			write_side_effect),
+	SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key,
+			sizeof(toi_cluster_key), 0, NULL),
+	SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script,
+			sizeof(pre_hibernate_script), 0, NULL),
+	SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script,
+			sizeof(post_hibernate_script), 0, NULL),
+	SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ,
+			0)
+};
+
+/*
+ * Ops structure.
+ */
+
+/* How this module presents itself to the TuxOnIce core. */
+static struct toi_module_ops toi_cluster_ops = {
+	.type			= FILTER_MODULE,
+	.name			= "Cluster",
+	.directory		= "cluster",
+	.module			= THIS_MODULE,
+
+	/* Callbacks */
+	.memory_needed		= toi_cluster_memory_needed,
+	.storage_needed		= toi_cluster_storage_needed,
+	.print_debug_info	= toi_cluster_print_debug_stats,
+	.save_config_info	= toi_cluster_save_config_info,
+	.load_config_info	= toi_cluster_load_config_info,
+
+	/* Entries in the module's sysfs directory */
+	.sysfs_data		= sysfs_params,
+	.num_sysfs_entries	= sizeof(sysfs_params) /
+					sizeof(sysfs_params[0]),
+};
+
+/* ---- Registration ---- */
+
+#ifdef MODULE /* In a module build the init/exit functions are file-local. */
+#define INIT static __init
+#define EXIT static __exit
+#else /* Otherwise INIT and EXIT expand to nothing, leaving the functions with external linkage. */
+#define INIT
+#define EXIT
+#endif
+
+INIT int toi_cluster_init(void)
+{
+ int temp = toi_register_module(&toi_cluster_ops), i;
+ struct kobject *kobj = toi_cluster_ops.dir_kobj;
+
+ for (i = 0; i < MAX_LOCAL_NODES; i++) {
+ node_array[i].current_message = 0;
+ INIT_LIST_HEAD(&node_array[i].member_list);
+ init_waitqueue_head(&node_array[i].member_events);
+ spin_lock_init(&node_array[i].member_list_lock);
+ spin_lock_init(&node_array[i].receive_lock);
+
+ /* Set up sysfs entry */
+ node_array[i].sysfs_data.attr.name = toi_kzalloc(8,
+ sizeof(node_array[i].sysfs_data.attr.name),
+ GFP_KERNEL);
+ sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d",
+ i);
+ node_array[i].sysfs_data.attr.mode = SYSFS_RW;
+ node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
+ node_array[i].sysfs_data.flags = 0;
+ node_array[i].sysfs_data.data.integer.variable =
+ (int *) &node_array[i].current_message;
+ node_array[i].sysfs_data.data.integer.minimum = 0;
+ node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
+ node_array[i].sysfs_data.write_side_effect =
+ node_write_side_effect;
+ toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
+ }
+
+ toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
+
+ if (toi_cluster_ops.enabled)
+ toi_cluster_open_iface();
+
+ return temp;
+}
+
+EXIT void toi_cluster_exit(void)
+{
+ int i;
+ toi_cluster_close_iface();
+
+ for (i = 0; i < MAX_LOCAL_NODES; i++)
+ toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
+ &node_array[i].sysfs_data);
+ toi_unregister_module(&toi_cluster_ops);
+}
+
+/*
+ * toi_cluster_iface_setup - handle the "toi_cluster=" boot parameter.
+ * @iface: text after the '='; an interface name, "off" or nothing.
+ *
+ * A name replaces the configured cluster interface. "off" or an empty
+ * value clears it: toi_cluster_init() enables clustering whenever the
+ * interface name is non-empty, so leaving the default name in place would
+ * switch clustering straight back on.
+ *
+ * Returns 1 to tell the boot code that the parameter has been handled.
+ */
+static int __init toi_cluster_iface_setup(char *iface)
+{
+	toi_cluster_ops.enabled = (*iface &&
+			strcmp(iface, "off"));
+
+	if (toi_cluster_ops.enabled)
+		/* Bounded by the destination and always terminated. */
+		strlcpy(toi_cluster_iface, iface, sizeof(toi_cluster_iface));
+	else
+		toi_cluster_iface[0] = '\0';
+
+	return 1;
+}
+
+__setup("toi_cluster=", toi_cluster_iface_setup);