Diffstat (limited to 'kernel')
-rw-r--r--  kernel/async.c | 1
-rw-r--r--  kernel/audit.c | 19
-rw-r--r--  kernel/audit.h | 2
-rw-r--r--  kernel/audit_fsnotify.c | 2
-rw-r--r--  kernel/audit_watch.c | 2
-rw-r--r--  kernel/auditsc.c | 6
-rw-r--r--  kernel/bpf/arraymap.c | 21
-rw-r--r--  kernel/bpf/core.c | 4
-rw-r--r--  kernel/bpf/hashtab.c | 64
-rw-r--r--  kernel/bpf/inode.c | 20
-rw-r--r--  kernel/bpf/syscall.c | 22
-rw-r--r--  kernel/cgroup.c | 215
-rw-r--r--  kernel/cgroup_freezer.c | 2
-rw-r--r--  kernel/cgroup_pids.c | 6
-rw-r--r--  kernel/context_tracking.c | 4
-rw-r--r--  kernel/cpu.c | 64
-rw-r--r--  kernel/cpuset.c | 12
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 4
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/events/core.c | 1479
-rw-r--r--  kernel/events/hw_breakpoint.c | 2
-rw-r--r--  kernel/events/ring_buffer.c | 40
-rw-r--r--  kernel/events/uprobes.c | 13
-rw-r--r--  kernel/exit.c | 5
-rw-r--r--  kernel/fork.c | 42
-rw-r--r--  kernel/futex.c | 163
-rw-r--r--  kernel/gcov/base.c | 7
-rw-r--r--  kernel/irq/chip.c | 9
-rw-r--r--  kernel/irq/irqdesc.c | 19
-rw-r--r--  kernel/irq/irqdomain.c | 24
-rw-r--r--  kernel/irq/manage.c | 25
-rw-r--r--  kernel/irq/msi.c | 66
-rw-r--r--  kernel/kexec.c | 10
-rw-r--r--  kernel/kexec_core.c | 37
-rw-r--r--  kernel/kexec_file.c | 2
-rw-r--r--  kernel/kexec_internal.h | 21
-rw-r--r--  kernel/ksysfs.c | 26
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/latencytop.c | 14
-rw-r--r--  kernel/livepatch/core.c | 176
-rw-r--r--  kernel/locking/lockdep.c | 58
-rw-r--r--  kernel/locking/qspinlock.c | 82
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 252
-rw-r--r--  kernel/locking/qspinlock_stat.h | 300
-rw-r--r--  kernel/locking/rtmutex.c | 135
-rw-r--r--  kernel/memremap.c | 239
-rw-r--r--  kernel/module.c | 363
-rw-r--r--  kernel/panic.c | 33
-rw-r--r--  kernel/pid.c | 4
-rw-r--r--  kernel/power/Kconfig | 280
-rw-r--r--  kernel/power/Makefile | 32
-rw-r--r--  kernel/power/hibernate.c | 34
-rw-r--r--  kernel/power/main.c | 17
-rw-r--r--  kernel/power/power.h | 45
-rw-r--r--  kernel/power/snapshot.c | 317
-rw-r--r--  kernel/power/tuxonice.h | 260
-rw-r--r--  kernel/power/tuxonice_alloc.c | 308
-rw-r--r--  kernel/power/tuxonice_alloc.h | 54
-rw-r--r--  kernel/power/tuxonice_atomic_copy.c | 469
-rw-r--r--  kernel/power/tuxonice_atomic_copy.h | 25
-rw-r--r--  kernel/power/tuxonice_bio.h | 78
-rw-r--r--  kernel/power/tuxonice_bio_chains.c | 1126
-rw-r--r--  kernel/power/tuxonice_bio_core.c | 1932
-rw-r--r--  kernel/power/tuxonice_bio_internal.h | 101
-rw-r--r--  kernel/power/tuxonice_bio_signature.c | 403
-rw-r--r--  kernel/power/tuxonice_builtin.c | 498
-rw-r--r--  kernel/power/tuxonice_builtin.h | 41
-rw-r--r--  kernel/power/tuxonice_checksum.c | 392
-rw-r--r--  kernel/power/tuxonice_checksum.h | 31
-rw-r--r--  kernel/power/tuxonice_cluster.c | 1058
-rw-r--r--  kernel/power/tuxonice_cluster.h | 18
-rw-r--r--  kernel/power/tuxonice_compress.c | 452
-rw-r--r--  kernel/power/tuxonice_copy_before_write.c | 240
-rw-r--r--  kernel/power/tuxonice_extent.c | 144
-rw-r--r--  kernel/power/tuxonice_extent.h | 45
-rw-r--r--  kernel/power/tuxonice_file.c | 484
-rw-r--r--  kernel/power/tuxonice_highlevel.c | 1413
-rw-r--r--  kernel/power/tuxonice_incremental.c | 402
-rw-r--r--  kernel/power/tuxonice_io.c | 1932
-rw-r--r--  kernel/power/tuxonice_io.h | 72
-rw-r--r--  kernel/power/tuxonice_modules.c | 520
-rw-r--r--  kernel/power/tuxonice_modules.h | 212
-rw-r--r--  kernel/power/tuxonice_netlink.c | 324
-rw-r--r--  kernel/power/tuxonice_netlink.h | 62
-rw-r--r--  kernel/power/tuxonice_pagedir.c | 345
-rw-r--r--  kernel/power/tuxonice_pagedir.h | 50
-rw-r--r--  kernel/power/tuxonice_pageflags.c | 18
-rw-r--r--  kernel/power/tuxonice_pageflags.h | 106
-rw-r--r--  kernel/power/tuxonice_power_off.c | 286
-rw-r--r--  kernel/power/tuxonice_power_off.h | 24
-rw-r--r--  kernel/power/tuxonice_prepare_image.c | 1080
-rw-r--r--  kernel/power/tuxonice_prepare_image.h | 38
-rw-r--r--  kernel/power/tuxonice_prune.c | 406
-rw-r--r--  kernel/power/tuxonice_storage.c | 282
-rw-r--r--  kernel/power/tuxonice_storage.h | 45
-rw-r--r--  kernel/power/tuxonice_swap.c | 474
-rw-r--r--  kernel/power/tuxonice_sysfs.c | 333
-rw-r--r--  kernel/power/tuxonice_sysfs.h | 137
-rw-r--r--  kernel/power/tuxonice_ui.c | 247
-rw-r--r--  kernel/power/tuxonice_ui.h | 97
-rw-r--r--  kernel/power/tuxonice_userui.c | 658
-rw-r--r--  kernel/printk/printk.c | 57
-rw-r--r--  kernel/profile.c | 1
-rw-r--r--  kernel/ptrace.c | 10
-rw-r--r--  kernel/rcu/rcutorture.c | 24
-rw-r--r--  kernel/rcu/srcu.c | 2
-rw-r--r--  kernel/rcu/tree.c | 337
-rw-r--r--  kernel/rcu/tree.h | 73
-rw-r--r--  kernel/rcu/tree_plugin.h | 98
-rw-r--r--  kernel/rcu/tree_trace.c | 39
-rw-r--r--  kernel/rcu/update.c | 22
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/resource.c | 11
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/auto_group.c | 2
-rw-r--r--  kernel/sched/clock.c | 2
-rw-r--r--  kernel/sched/core.c | 635
-rw-r--r--  kernel/sched/cputime.c | 130
-rw-r--r--  kernel/sched/deadline.c | 121
-rw-r--r--  kernel/sched/debug.c | 415
-rw-r--r--  kernel/sched/fair.c | 599
-rw-r--r--  kernel/sched/idle.c | 9
-rw-r--r--  kernel/sched/idle_task.c | 1
-rw-r--r--  kernel/sched/rt.c | 96
-rw-r--r--  kernel/sched/sched.h | 115
-rw-r--r--  kernel/sched/stats.h | 8
-rw-r--r--  kernel/sched/swait.c | 123
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  kernel/smpboot.c | 2
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/stop_machine.c | 88
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 130
-rw-r--r--  kernel/task_work.c | 1
-rw-r--r--  kernel/time/alarmtimer.c | 17
-rw-r--r--  kernel/time/clocksource.c | 4
-rw-r--r--  kernel/time/ntp.c | 54
-rw-r--r--  kernel/time/ntp_internal.h | 2
-rw-r--r--  kernel/time/tick-sched.c | 50
-rw-r--r--  kernel/time/timekeeping.c | 46
-rw-r--r--  kernel/time/timekeeping_internal.h | 8
-rw-r--r--  kernel/trace/blktrace.c | 12
-rw-r--r--  kernel/trace/bpf_trace.c | 16
-rw-r--r--  kernel/trace/ftrace.c | 449
-rw-r--r--  kernel/trace/ring_buffer.c | 57
-rw-r--r--  kernel/trace/trace.h | 6
-rw-r--r--  kernel/trace/trace_event_perf.c | 2
-rw-r--r--  kernel/trace/trace_events.c | 28
-rw-r--r--  kernel/trace/trace_events_trigger.c | 25
-rw-r--r--  kernel/trace/trace_stack.c | 6
-rw-r--r--  kernel/tsacct.c | 54
-rw-r--r--  kernel/user_namespace.c | 21
-rw-r--r--  kernel/watchdog.c | 20
-rw-r--r--  kernel/workqueue.c | 274
155 files changed, 5394 insertions, 21496 deletions
diff --git a/kernel/async.c b/kernel/async.c
index 4c3773c0b..d2edd6efe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -326,3 +326,4 @@ bool current_is_async(void)
return worker && worker->current_func == async_run_entry_fn;
}
+EXPORT_SYMBOL_GPL(current_is_async);
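
Exporting current_is_async() makes the async-worker test available to
modules: it checks whether the current kworker is executing
async_run_entry_fn, so modular code can branch on it, e.g. to avoid a
synchronous wait that could deadlock behind other async work. A minimal
sketch with hypothetical helper names (not from this patch):

	#include <linux/async.h>

	static int hypothetical_probe_step(struct device *dev)
	{
		/*
		 * Inside an async_schedule() worker, don't block on work
		 * that may itself be queued behind us; take the deferred
		 * path instead. Both setup helpers are placeholders.
		 */
		if (current_is_async())
			return hypothetical_setup_nowait(dev);
		return hypothetical_setup_blocking(dev);
	}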
diff --git a/kernel/audit.c b/kernel/audit.c
index 5ffcbd354..3a3e5deed 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -110,7 +110,6 @@ static u32 audit_backlog_limit = 64;
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
-static u32 audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
kuid_t audit_sig_uid = INVALID_UID;
@@ -509,8 +508,7 @@ static void flush_hold_queue(void)
* if auditd just disappeared but we
* dequeued an skb we need to drop ref
*/
- if (skb)
- consume_skb(skb);
+ consume_skb(skb);
}
static int kauditd_thread(void *dummy)
@@ -524,7 +522,8 @@ static int kauditd_thread(void *dummy)
skb = skb_dequeue(&audit_skb_queue);
if (skb) {
- if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
+ if (!audit_backlog_limit ||
+ (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit))
wake_up(&audit_backlog_wait);
if (audit_pid)
kauditd_send_skb(skb);
@@ -1232,9 +1231,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
if (!ab)
return;
- if (ab->skb)
- kfree_skb(ab->skb);
-
+ kfree_skb(ab->skb);
spin_lock_irqsave(&audit_freelist_lock, flags);
if (audit_freelist_count > AUDIT_MAXFREE)
kfree(ab);
@@ -1372,7 +1369,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
return NULL;
if (gfp_mask & __GFP_DIRECT_RECLAIM) {
- if (audit_pid && audit_pid == current->pid)
+ if (audit_pid && audit_pid == current->tgid)
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
else
reserve = 0;
@@ -1395,12 +1392,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
skb_queue_len(&audit_skb_queue),
audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
- audit_backlog_wait_time = audit_backlog_wait_overflow;
+ audit_backlog_wait_time = 0;
wake_up(&audit_backlog_wait);
return NULL;
}
- if (!reserve)
+ if (!reserve && !audit_backlog_wait_time)
audit_backlog_wait_time = audit_backlog_wait_time_master;
ab = audit_buffer_alloc(ctx, gfp_mask, type);
@@ -1722,7 +1719,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
/* Copy inode data into an audit_names. */
void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
+ struct inode *inode)
{
name->ino = inode->i_ino;
name->dev = inode->i_sb->s_dev;
diff --git a/kernel/audit.h b/kernel/audit.h
index de6cbb7cf..cbbe6bb64 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -207,7 +207,7 @@ extern u32 audit_ever_enabled;
extern void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
- const struct inode *inode);
+ struct inode *inode);
extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap);
extern void audit_log_name(struct audit_context *context,
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 27c6046c2..f84f8d06e 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
if (IS_ERR(dentry))
return (void *)dentry; /* returning an error */
inode = path.dentry->d_inode;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
if (unlikely(!audit_mark)) {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 656c7e93a..9f194aad0 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex);
+ inode_unlock(d_backing_inode(parent->dentry));
if (d_is_positive(d)) {
/* update watch filter fields */
watch->dev = d_backing_inode(d)->i_sb->s_dev;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b86cc0495..195ffaee5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
unsigned int flags)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
+ struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
@@ -1848,12 +1848,12 @@ void __audit_file(const struct file *file)
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const struct inode *parent,
+void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
const unsigned char type)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
+ struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b0799bced..89ebbc4d1 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
{
struct perf_event *event;
const struct perf_event_attr *attr;
+ struct file *file;
- event = perf_event_get(fd);
- if (IS_ERR(event))
- return event;
+ file = perf_event_get(fd);
+ if (IS_ERR(file))
+ return file;
+
+ event = file->private_data;
attr = perf_event_attrs(event);
if (IS_ERR(attr))
@@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
goto err;
if (attr->type == PERF_TYPE_RAW)
- return event;
+ return file;
if (attr->type == PERF_TYPE_HARDWARE)
- return event;
+ return file;
if (attr->type == PERF_TYPE_SOFTWARE &&
attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return event;
+ return file;
err:
- perf_event_release_kernel(event);
+ fput(file);
return ERR_PTR(-EINVAL);
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- struct perf_event *event = ptr;
-
- perf_event_release_kernel(event);
+ fput((struct file *)ptr);
}
static const struct bpf_map_ops perf_event_array_ops = {
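
The arraymap conversion changes what the map entry owns: perf_event_get()
now returns the perf event's struct file, the event is reached through
file->private_data, and teardown is a plain fput() instead of
perf_event_release_kernel(). Holding the file keeps the event (and its
private_data) valid for exactly as long as the map references it. A
condensed sketch of the pattern, using the names from the hunk above:

	struct file *file;
	struct perf_event *event;

	file = perf_event_get(fd);	/* takes a reference on the file */
	if (IS_ERR(file))
		return file;
	event = file->private_data;	/* valid while the file is held */
	/* ... validate perf_event_attrs(event) ... */
	fput(file);			/* drop the pin on error/teardown */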
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 334b1bdd5..972d9a8e4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
ARG1 = (u64) (unsigned long) ctx;
- /* Registers used in classic BPF programs need to be reset first. */
- regs[BPF_REG_A] = 0;
- regs[BPF_REG_X] = 0;
-
select_insn:
goto *jumptable[insn->code];
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 34777b374..c5b30fd8a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,11 +14,15 @@
#include <linux/filter.h>
#include <linux/vmalloc.h>
+struct bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
struct bpf_htab {
struct bpf_map map;
- struct hlist_head *buckets;
- raw_spinlock_t lock;
- u32 count; /* number of elements in this hashtable */
+ struct bucket *buckets;
+ atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
@@ -79,34 +83,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
- if ((u64) htab->n_buckets * sizeof(struct hlist_head) +
+ if ((u64) htab->n_buckets * sizeof(struct bucket) +
(u64) htab->elem_size * htab->map.max_entries >=
U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
goto free_htab;
- htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+ htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
htab->elem_size * htab->map.max_entries,
PAGE_SIZE) >> PAGE_SHIFT;
err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
GFP_USER | __GFP_NOWARN);
if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
if (!htab->buckets)
goto free_htab;
}
- for (i = 0; i < htab->n_buckets; i++)
- INIT_HLIST_HEAD(&htab->buckets[i]);
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }
- raw_spin_lock_init(&htab->lock);
- htab->count = 0;
+ atomic_set(&htab->count, 0);
return &htab->map;
@@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len)
return jhash(key, key_len, 0);
}
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
void *key, u32 key_size)
{
@@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old;
struct hlist_head *head;
+ struct bucket *b;
unsigned long flags;
u32 key_size;
int ret;
@@ -248,15 +259,15 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
l_new->hash = htab_map_hash(l_new->key, key_size);
+ b = __select_bucket(htab, l_new->hash);
+ head = &b->head;
/* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&htab->lock, flags);
-
- head = select_bucket(htab, l_new->hash);
+ raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
- if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
/* if elem with this 'key' doesn't exist and we've reached
* max_entries limit, fail insertion of new elem
*/
@@ -284,13 +295,13 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_del_rcu(&l_old->hash_node);
kfree_rcu(l_old, rcu);
} else {
- htab->count++;
+ atomic_inc(&htab->count);
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return 0;
err:
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
kfree(l_new);
return ret;
}
@@ -300,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_head *head;
+ struct bucket *b;
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
@@ -310,21 +322,21 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;
hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
- raw_spin_lock_irqsave(&htab->lock, flags);
-
- head = select_bucket(htab, hash);
+ raw_spin_lock_irqsave(&b->lock, flags);
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
+ atomic_dec(&htab->count);
kfree_rcu(l, rcu);
ret = 0;
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return ret;
}
@@ -339,7 +351,7 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
+ atomic_dec(&htab->count);
kfree(l);
}
}
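
The hashtab rework trades one table-wide raw spinlock for a lock per
bucket, so updates that hash to different buckets no longer contend, and
the element count becomes an atomic_t because no single lock protects it
anymore. The resulting critical-section shape, condensed from the hunks
above:

	struct bucket *b;
	unsigned long flags;

	b = __select_bucket(htab, hash);		/* hash picks the bucket */
	raw_spin_lock_irqsave(&b->lock, flags);		/* serializes one bucket only */
	/* ... lookup_elem_raw() / hlist add or del on b->head ... */
	atomic_inc(&htab->count);			/* atomic: no global lock left */
	raw_spin_unlock_irqrestore(&b->lock, flags);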
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8a797d5..f2ece3c17 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
}
}
+static int bpf_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry)
+{
+ if (bpf_dname_reserved(new_dentry))
+ return -EPERM;
+
+ return simple_link(old_dentry, dir, new_dentry);
+}
+
+static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ if (bpf_dname_reserved(new_dentry))
+ return -EPERM;
+
+ return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static const struct inode_operations bpf_dir_iops = {
.lookup = simple_lookup,
.mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
.rmdir = simple_rmdir,
+ .rename = bpf_rename,
+ .link = bpf_link,
.unlink = simple_unlink,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3b39550d8..637397059 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -113,8 +113,28 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
return 0;
}
+#ifdef CONFIG_PROC_FS
+static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct bpf_map *map = filp->private_data;
+
+ seq_printf(m,
+ "map_type:\t%u\n"
+ "key_size:\t%u\n"
+ "value_size:\t%u\n"
+ "max_entries:\t%u\n",
+ map->map_type,
+ map->key_size,
+ map->value_size,
+ map->max_entries);
+}
+#endif
+
static const struct file_operations bpf_map_fops = {
- .release = bpf_map_release,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_map_show_fdinfo,
+#endif
+ .release = bpf_map_release,
};
int bpf_map_new_fd(struct bpf_map *map)
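
With .show_fdinfo wired up, a map's parameters become visible through
procfs without any bpf syscall. Given the seq_printf() format above,
reading /proc/<pid>/fdinfo/<fd> for a map fd would show lines like the
following (values illustrative, alongside the usual pos/flags fields):

	map_type:	1
	key_size:	4
	value_size:	8
	max_entries:	1024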
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fb1ecfd2d..d27904c19 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,8 +57,9 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
-#include <linux/cpuset.h>
#include <linux/atomic.h>
+#include <linux/cpuset.h>
+#include <net/sock.h>
/*
* pidlists linger the following amount before being destroyed. The goal
@@ -211,6 +212,7 @@ static unsigned long have_free_callback __read_mostly;
/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;
+static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -440,11 +442,6 @@ static bool cgroup_tryget(struct cgroup *cgrp)
return css_tryget(&cgrp->self);
}
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -465,25 +462,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
}
EXPORT_SYMBOL_GPL(of_css);
-/**
- * cgroup_is_descendant - test ancestry
- * @cgrp: the cgroup to be tested
- * @ancestor: possible ancestor of @cgrp
- *
- * Test whether @cgrp is a descendant of @ancestor. It also returns %true
- * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
- * and @ancestor are accessible.
- */
-bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
-{
- while (cgrp) {
- if (cgrp == ancestor)
- return true;
- cgrp = cgroup_parent(cgrp);
- }
- return false;
-}
-
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -1647,10 +1625,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
- if (!strcmp(token, "__DEVEL__sane_behavior")) {
- opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
- continue;
- }
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
@@ -1717,15 +1691,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
- pr_err("sane_behavior: no other mount options allowed\n");
- return -EINVAL;
- }
- return 0;
- }
-
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
@@ -1924,6 +1889,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
if (ret < 0)
goto out;
root_cgrp->id = ret;
+ root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
GFP_KERNEL);
@@ -2004,6 +1970,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
struct cgroup_subsys *ss;
struct cgroup_root *root;
@@ -2020,6 +1987,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
+ if (is_v2) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ goto out_mount;
+ }
+
mutex_lock(&cgroup_mutex);
/* First find the desired set of subsystems */
@@ -2027,15 +2005,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (ret)
goto out_unlock;
- /* look for a matching existing root */
- if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
- cgrp_dfl_root_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- ret = 0;
- goto out_unlock;
- }
-
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
@@ -2146,9 +2115,10 @@ out_free:
if (ret)
return ERR_PTR(ret);
-
+out_mount:
dentry = kernfs_mount(fs_type, flags, root->kf_root,
- CGROUP_SUPER_MAGIC, &new_sb);
+ is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+ &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
@@ -2191,6 +2161,12 @@ static struct file_system_type cgroup_fs_type = {
.kill_sb = cgroup_kill_sb,
};
+static struct file_system_type cgroup2_fs_type = {
+ .name = "cgroup2",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+};
+
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
@@ -4063,7 +4039,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
goto out_err;
/*
- * Migrate tasks one-by-one until @form is empty. This fails iff
+ * Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
do {
@@ -4681,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)
if (ss) {
/* css free path */
+ struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
- if (css->parent)
- css_put(css->parent);
-
ss->css_free(css);
cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
+
+ if (parent)
+ css_put(parent);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
@@ -4909,11 +4886,11 @@ err_free_css:
static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
umode_t mode)
{
- struct cgroup *parent, *cgrp;
+ struct cgroup *parent, *cgrp, *tcgrp;
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct kernfs_node *kn;
- int ssid, ret;
+ int level, ssid, ret;
/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
*/
@@ -4924,9 +4901,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (!parent)
return -ENODEV;
root = parent->root;
+ level = parent->level + 1;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
+ cgrp = kzalloc(sizeof(*cgrp) +
+ sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
if (!cgrp) {
ret = -ENOMEM;
goto out_unlock;
@@ -4950,6 +4929,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
cgrp->self.parent = &parent->self;
cgrp->root = root;
+ cgrp->level = level;
+
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -5201,7 +5184,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
- printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ pr_debug("Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
@@ -5359,6 +5342,7 @@ int __init cgroup_init(void)
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
return 0;
@@ -5502,19 +5486,6 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
-static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
- return &ss_priv[i - CGROUP_CANFORK_START];
- return NULL;
-}
-
-static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- void **private = subsys_canfork_priv_p(ss_priv, i);
- return private ? *private : NULL;
-}
-
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5537,14 +5508,13 @@ void cgroup_fork(struct task_struct *child)
* returns an error, the fork aborts with that error code. This allows for
* a cgroup subsystem to conditionally allow or deny new forks.
*/
-int cgroup_can_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+int cgroup_can_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i, j, ret;
for_each_subsys_which(ss, i, &have_canfork_callback) {
- ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ ret = ss->can_fork(child);
if (ret)
goto out_revert;
}
@@ -5556,7 +5526,7 @@ out_revert:
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ ss->cancel_fork(child);
}
return ret;
@@ -5569,15 +5539,14 @@ out_revert:
* This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded.
*/
-void cgroup_cancel_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_cancel_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+ ss->cancel_fork(child);
}
/**
@@ -5590,8 +5559,7 @@ void cgroup_cancel_fork(struct task_struct *child,
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child,
- void *old_ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_post_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
@@ -5635,7 +5603,7 @@ void cgroup_post_fork(struct task_struct *child,
* and addition to css_set.
*/
for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
+ ss->fork(child);
}
/**
@@ -5835,6 +5803,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
}
+/**
+ * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
+ * @path: path on the default hierarchy
+ *
+ * Find the cgroup at @path on the default hierarchy, increment its
+ * reference count and return it. Returns pointer to the found cgroup on
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
+ * if @path points to a non-directory.
+ */
+struct cgroup *cgroup_get_from_path(const char *path)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp;
+
+ mutex_lock(&cgroup_mutex);
+
+ kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+ if (kn) {
+ if (kernfs_type(kn) == KERNFS_DIR) {
+ cgrp = kn->priv;
+ cgroup_get(cgrp);
+ } else {
+ cgrp = ERR_PTR(-ENOTDIR);
+ }
+ kernfs_put(kn);
+ } else {
+ cgrp = ERR_PTR(-ENOENT);
+ }
+
+ mutex_unlock(&cgroup_mutex);
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+
+/*
+ * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+
+DEFINE_SPINLOCK(cgroup_sk_update_lock);
+static bool cgroup_sk_alloc_disabled __read_mostly;
+
+void cgroup_sk_alloc_disable(void)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+ pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+ cgroup_sk_alloc_disabled = true;
+}
+
+#else
+
+#define cgroup_sk_alloc_disabled false
+
+#endif
+
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+
+ rcu_read_lock();
+
+ while (true) {
+ struct css_set *cset;
+
+ cset = task_css_set(current);
+ if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+ skcd->val = (unsigned long)cset->dfl_cgrp;
+ break;
+ }
+ cpu_relax();
+ }
+
+ rcu_read_unlock();
+}
+
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
+{
+ cgroup_put(sock_cgroup_ptr(skcd));
+}
+
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
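
cgroup_get_from_path() is the kernel-internal entry point for resolving a
default-hierarchy path to a pinned cgroup, added here so the socket
matching introduced above (and similar users) can look cgroups up by
name. A minimal hedged sketch of a caller; the path is illustrative:

	struct cgroup *cgrp;

	cgrp = cgroup_get_from_path("/my-service");	/* hypothetical v2 path */
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);			/* -ENOENT or -ENOTDIR */
	/* ... e.g. compare against a socket's sock_cgroup_ptr(skcd) ... */
	cgroup_put(cgrp);				/* drop the reference */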
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2d3df82c5..1b72d56ed 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task, void *private)
+static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index b50d5a167..303097b37 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num)
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
- * succeded, otherwise -EAGAIN.
+ * succeeded, otherwise -EAGAIN.
*/
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
@@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on threadgroup_change_begin() held by the copy_process().
*/
-static int pids_can_fork(struct task_struct *task, void **priv_p)
+static int pids_can_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
@@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p)
return pids_try_charge(pids, 1);
}
-static void pids_cancel_fork(struct task_struct *task, void *priv)
+static void pids_cancel_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index d8560ee3b..9ad37b9e4 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -24,7 +24,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
-struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(context_tracking_enabled);
EXPORT_SYMBOL_GPL(context_tracking_enabled);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
@@ -191,7 +191,7 @@ void __init context_tracking_cpu_set(int cpu)
if (!per_cpu(context_tracking.active, cpu)) {
per_cpu(context_tracking.active, cpu) = true;
- static_key_slow_inc(&context_tracking_enabled);
+ static_branch_inc(&context_tracking_enabled);
}
if (initialized)
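
This is a mechanical move from the old struct static_key API to the newer
static-branch API: DEFINE_STATIC_KEY_FALSE() declares the key,
static_branch_inc()/static_branch_dec() take and drop references, and
read sides test it with static_branch_unlikely() rather than
static_key_false(). The general pattern, with a hypothetical key and
helper:

	DEFINE_STATIC_KEY_FALSE(my_feature_key);

	static void hot_path(void)
	{
		/* compiles to a patched no-op/jump, not a load+test */
		if (static_branch_unlikely(&my_feature_key))
			my_slow_feature();	/* hypothetical helper */
	}

	/* enable on first user, reference-counted: */
	static_branch_inc(&my_feature_key);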
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 85ff5e26e..5b9d39633 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -759,71 +759,33 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
- = CPU_BITS_ALL;
+struct cpumask __cpu_possible_mask __read_mostly
+ = {CPU_BITS_ALL};
#else
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
+struct cpumask __cpu_possible_mask __read_mostly;
#endif
-const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
-EXPORT_SYMBOL(cpu_possible_mask);
+EXPORT_SYMBOL(__cpu_possible_mask);
-static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
-EXPORT_SYMBOL(cpu_online_mask);
+struct cpumask __cpu_online_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_online_mask);
-static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
-EXPORT_SYMBOL(cpu_present_mask);
+struct cpumask __cpu_present_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_present_mask);
-static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
-EXPORT_SYMBOL(cpu_active_mask);
-
-void set_cpu_possible(unsigned int cpu, bool possible)
-{
- if (possible)
- cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
-}
-
-void set_cpu_present(unsigned int cpu, bool present)
-{
- if (present)
- cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
-}
-
-void set_cpu_online(unsigned int cpu, bool online)
-{
- if (online) {
- cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
- } else {
- cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
- }
-}
-
-void set_cpu_active(unsigned int cpu, bool active)
-{
- if (active)
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
-}
+struct cpumask __cpu_active_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_active_mask);
void init_cpu_present(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_present_bits), src);
+ cpumask_copy(&__cpu_present_mask, src);
}
void init_cpu_possible(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_possible_bits), src);
+ cpumask_copy(&__cpu_possible_mask, src);
}
void init_cpu_online(const struct cpumask *src)
{
- cpumask_copy(to_cpumask(cpu_online_bits), src);
+ cpumask_copy(&__cpu_online_mask, src);
}
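
The bitmap-plus-accessor scheme collapses into plainly exported
struct cpumask objects; the read-only cpu_*_mask pointers and the
set_cpu_*() helpers presumably become a macro and static inlines in the
companion include/linux/cpumask.h change, which falls outside this
kernel/-limited diffstat. A hedged sketch of what that side would look
like:

	/* presumed companion definitions in <linux/cpumask.h> */
	extern struct cpumask __cpu_possible_mask;
	#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)

	static inline void set_cpu_possible(unsigned int cpu, bool possible)
	{
		if (possible)
			cpumask_set_cpu(cpu, &__cpu_possible_mask);
		else
			cpumask_clear_cpu(cpu, &__cpu_possible_mask);
	}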
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2ade63219..41989ab4d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -51,6 +51,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
+#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
@@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
struct fmeter {
int cnt; /* unprocessed events count */
int val; /* most recent output value */
- time_t time; /* clock (secs) when val computed */
+ time64_t time; /* clock (secs) when val computed */
spinlock_t lock; /* guards read or write of above */
};
@@ -1397,7 +1398,7 @@ out:
*/
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
-#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
#define FM_SCALE 1000 /* faux fixed point scale */
@@ -1413,8 +1414,11 @@ static void fmeter_init(struct fmeter *fmp)
/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
- time_t now = get_seconds();
- time_t ticks = now - fmp->time;
+ time64_t now;
+ u32 ticks;
+
+ now = ktime_get_seconds();
+ ticks = now - fmp->time;
if (ticks == 0)
return;
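
For reference, the fmeter constants still encode the documented
half-life: each one-second tick scales the value by FM_COEF/FM_SCALE =
933/1000, and 0.933^10 = e^(10 * ln 0.933) ~= e^(-0.693) ~= 0.5, i.e. the
value halves every 10 seconds. The time64_t conversion only changes the
clock read; the tick delta is capped at FM_MAXTICKS (99), so narrowing it
to a u32 loses nothing.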
diff --git a/kernel/cred.c b/kernel/cred.c
index 71179a09c..0c0cd8a62 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds);
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
- cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
}
/**
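
SLAB_ACCOUNT marks a cache whose objects should be charged to the
allocating task's kmem cgroup; creds are user-triggerable allocations,
which is why the flag lands here (and on the task_delay_info cache in the
delayacct hunk below). The opt-in is a single extra flag at cache
creation, sketched with a hypothetical cache:

	/* hypothetical cache: its objects become kmemcg-accounted */
	my_cache = kmem_cache_create("my_cache", sizeof(struct my_obj), 0,
				     SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				     SLAB_ACCOUNT, NULL);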
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 412134549..2a20c0dfd 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv)
continue;
kdb_printf("%-20s%8u 0x%p ", mod->name,
- mod->core_size, (void *)mod);
+ mod->core_layout.size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
kdb_printf("%4d ", module_refcount(mod));
#endif
@@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf(" (Loading)");
else
kdb_printf(" (Live)");
- kdb_printf(" 0x%p", mod->module_core);
+ kdb_printf(" 0x%p", mod->core_layout.base);
#ifdef CONFIG_MODULE_UNLOAD
{
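
Both kdb hunks follow a module-loader refactor that folded the old
module_core/core_size fields into a struct module_layout; the full change
is under the kernel/module.c entry of this diff. A hedged sketch of the
presumed shape, showing only the fields kdb touches:

	/* presumed layout; see kernel/module.c in this series */
	struct module_layout {
		void		*base;	/* was mod->module_core */
		unsigned int	size;	/* was mod->core_size */
		/* text_size, ro_size, ... */
	};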
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ef90b04d7..435c14a45 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable);
void delayacct_init(void)
{
- delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
delayacct_tsk_init(&init_task);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1087bbeb1..614614821 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@
#include <asm/irq_regs.h>
-static struct workqueue_struct *perf_wq;
-
typedef int (*remote_function_f)(void *);
struct remote_function_call {
@@ -66,8 +64,17 @@ static void remote_function(void *data)
struct task_struct *p = tfc->p;
if (p) {
- tfc->ret = -EAGAIN;
- if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+ /* -EAGAIN */
+ if (task_cpu(p) != smp_processor_id())
+ return;
+
+ /*
+ * Now that we're on right CPU with IRQs disabled, we can test
+ * if we hit the right task without races.
+ */
+
+ tfc->ret = -ESRCH; /* No such (running) process */
+ if (p != current)
return;
}
@@ -94,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
.p = p,
.func = func,
.info = info,
- .ret = -ESRCH, /* No such (running) process */
+ .ret = -EAGAIN,
};
+ int ret;
- if (task_curr(p))
- smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+ do {
+ ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+ if (!ret)
+ ret = data.ret;
+ } while (ret == -EAGAIN);
- return data.ret;
+ return ret;
}
/**
@@ -126,11 +137,168 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-#define EVENT_OWNER_KERNEL ((void *) -1)
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+ return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx)
+ raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (ctx)
+ raw_spin_unlock(&ctx->lock);
+ raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
+#define TASK_TOMBSTONE ((void *)-1L)
static bool is_kernel_event(struct perf_event *event)
{
- return event->owner == EVENT_OWNER_KERNEL;
+ return READ_ONCE(event->owner) == TASK_TOMBSTONE;
+}
+
+/*
+ * On task ctx scheduling...
+ *
+ * When !ctx->nr_events a task context will not be scheduled. This means
+ * we can disable the scheduler hooks (for performance) without leaving
+ * pending task ctx state.
+ *
+ * This however results in two special cases:
+ *
+ * - removing the last event from a task ctx; this is relatively straight
+ * forward and is done in __perf_remove_from_context.
+ *
+ * - adding the first event to a task ctx; this is tricky because we cannot
+ * rely on ctx->is_active and therefore cannot use event_function_call().
+ * See perf_install_in_context().
+ *
+ * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
+ */
+
+typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
+ struct perf_event_context *, void *);
+
+struct event_function_struct {
+ struct perf_event *event;
+ event_f func;
+ void *data;
+};
+
+static int event_function(void *info)
+{
+ struct event_function_struct *efs = info;
+ struct perf_event *event = efs->event;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+ int ret = 0;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ /*
+ * Since we do the IPI call without holding ctx->lock things can have
+ * changed, double check we hit the task we set out to hit.
+ */
+ if (ctx->task) {
+ if (ctx->task != current) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+
+ /*
+ * We only use event_function_call() on established contexts,
+ * and event_function() is only ever called when active (or
+ * rather, we'll have bailed in task_function_call() or the
+ * above ctx->task != current test), therefore we must have
+ * ctx->is_active here.
+ */
+ WARN_ON_ONCE(!ctx->is_active);
+ /*
+ * And since we have ctx->is_active, cpuctx->task_ctx must
+ * match.
+ */
+ WARN_ON_ONCE(task_ctx != ctx);
+ } else {
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+ }
+
+ efs->func(event, cpuctx, ctx, efs->data);
+unlock:
+ perf_ctx_unlock(cpuctx, task_ctx);
+
+ return ret;
+}
+
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+ struct event_function_struct efs = {
+ .event = event,
+ .func = func,
+ .data = data,
+ };
+
+ int ret = event_function(&efs);
+ WARN_ON_ONCE(ret);
+}
+
+static void event_function_call(struct perf_event *event, event_f func, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct event_function_struct efs = {
+ .event = event,
+ .func = func,
+ .data = data,
+ };
+
+ if (!event->parent) {
+ /*
+ * If this is a !child event, we must hold ctx::mutex to
+ * stabilize the event->ctx relation. See
+ * perf_event_ctx_lock().
+ */
+ lockdep_assert_held(&ctx->mutex);
+ }
+
+ if (!task) {
+ cpu_function_call(event->cpu, event_function, &efs);
+ return;
+ }
+
+ if (task == TASK_TOMBSTONE)
+ return;
+
+again:
+ if (!task_function_call(task, event_function, &efs))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
+ if (task == TASK_TOMBSTONE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
+ func(event, NULL, ctx, data);
+ raw_spin_unlock_irq(&ctx->lock);
}
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
@@ -148,6 +316,7 @@ static bool is_kernel_event(struct perf_event *event)
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
+ EVENT_TIME = 0x4,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
@@ -155,7 +324,13 @@ enum event_type_t {
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
@@ -337,28 +512,6 @@ static inline u64 perf_event_clock(struct perf_event *event)
return event->clock();
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
-static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- raw_spin_lock(&cpuctx->ctx.lock);
- if (ctx)
- raw_spin_lock(&ctx->lock);
-}
-
-static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
-}
-
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -548,13 +701,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
* we are holding the rcu lock
*/
cgrp1 = perf_cgroup_from_task(task, NULL);
-
- /*
- * next is NULL when called from perf_event_enable_on_exec()
- * that will systematically cause a cgroup_switch()
- */
- if (next)
- cgrp2 = perf_cgroup_from_task(next, NULL);
+ cgrp2 = perf_cgroup_from_task(next, NULL);
/*
* only schedule out current cgroup events if we know
@@ -580,8 +727,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
* we are holding the rcu lock
*/
cgrp1 = perf_cgroup_from_task(task, NULL);
-
- /* prev can never be NULL */
cgrp2 = perf_cgroup_from_task(prev, NULL);
/*
@@ -886,7 +1031,7 @@ static void put_ctx(struct perf_event_context *ctx)
if (atomic_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
- if (ctx->task)
+ if (ctx->task && ctx->task != TASK_TOMBSTONE)
put_task_struct(ctx->task);
call_rcu(&ctx->rcu_head, free_ctx);
}
@@ -903,9 +1048,8 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::mutex nests and those are:
*
* - perf_event_exit_task_context() [ child , 0 ]
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event() [ parent, 1 ]
+ * perf_event_exit_event()
+ * put_event() [ parent, 1 ]
*
* - perf_event_init_context() [ parent, 0 ]
* inherit_task_group()
@@ -948,8 +1092,8 @@ static void put_ctx(struct perf_event_context *ctx)
* Lock order:
* task_struct::perf_event_mutex
* perf_event_context::mutex
- * perf_event_context::lock
* perf_event::child_mutex;
+ * perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
*/
@@ -1047,6 +1191,7 @@ static u64 primary_event_id(struct perf_event *event)
/*
* Get the perf_event_context for a task and lock it.
+ *
 * This has to cope with the fact that until it is locked,
* the context could get moved to another task.
*/
@@ -1087,9 +1232,12 @@ retry:
goto retry;
}
- if (!atomic_inc_not_zero(&ctx->refcount)) {
+ if (ctx->task == TASK_TOMBSTONE ||
+ !atomic_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock(&ctx->lock);
ctx = NULL;
+ } else {
+ WARN_ON_ONCE(ctx->task != task);
}
}
rcu_read_unlock();
@@ -1149,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event)
/*
* Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
*/
static void update_event_times(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
u64 run_end;
+ lockdep_assert_held(&ctx->lock);
+
if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;
+
/*
* in cgroup mode, time_enabled represents
* the time the event was enabled AND active
@@ -1215,6 +1365,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+ lockdep_assert_held(&ctx->lock);
+
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
@@ -1417,11 +1569,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event)) {
ctx->nr_cgroups--;
+ /*
+ * Because cgroup events are always per-cpu events, this will
+ * always be called from the right CPU.
+ */
cpuctx = __get_cpu_context(ctx);
/*
- * if there are no more cgroup events
- * then cler cgrp to avoid stale pointer
- * in update_cgrp_time_from_cpuctx()
+ * If there are no more cgroup events then clear cgrp to avoid
+ * stale pointer in update_cgrp_time_from_cpuctx().
*/
if (!ctx->nr_cgroups)
cpuctx->cgrp = NULL;
@@ -1499,45 +1654,11 @@ out:
perf_event__header_size(tmp);
}
-/*
- * User event without the task.
- */
static bool is_orphaned_event(struct perf_event *event)
{
- return event && !is_kernel_event(event) && !event->owner;
-}
-
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
- return is_orphaned_event(event->parent);
-}
-
-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
- if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
- return;
-
- if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
- get_ctx(ctx);
- ctx->orphans_remove_sched = true;
- }
-}
-
-static int __init perf_workqueue_init(void)
-{
- perf_wq = create_singlethread_workqueue("perf");
- WARN(!perf_wq, "failed to create perf workqueue\n");
- return perf_wq ? 0 : -1;
+ return event->state == PERF_EVENT_STATE_DEAD;
}
-core_initcall(perf_workqueue_init);
-
static inline int pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
@@ -1580,14 +1701,14 @@ event_sched_out(struct perf_event *event,
perf_pmu_disable(event->pmu);
+ event->tstamp_stopped = tstamp;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = tstamp;
- event->pmu->del(event, 0);
- event->oncpu = -1;
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -1598,9 +1719,6 @@ event_sched_out(struct perf_event *event,
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
perf_pmu_enable(event->pmu);
}
@@ -1624,10 +1742,7 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
-struct remove_event {
- struct perf_event *event;
- bool detach_group;
-};
+#define DETACH_GROUP 0x01UL
/*
* Cross CPU call to remove a performance event
@@ -1635,34 +1750,31 @@ struct remove_event {
* We disable the event on the hardware level first. After that we
* remove it from the context list.
*/
-static int __perf_remove_from_context(void *info)
+static void
+__perf_remove_from_context(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct remove_event *re = info;
- struct perf_event *event = re->event;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ unsigned long flags = (unsigned long)info;
- raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
- if (re->detach_group)
+ if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);
- if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
+
+ if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
- cpuctx->task_ctx = NULL;
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ cpuctx->task_ctx = NULL;
+ }
}
- raw_spin_unlock(&ctx->lock);
-
- return 0;
}
-
/*
* Remove the event from a task's (or a CPU's) list of events.
*
- * CPU events are removed with a smp call. For task events we only
- * call when the task is on a CPU.
- *
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
* remains valid. This is OK when called from perf_release since
@@ -1670,96 +1782,32 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
- struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
- struct remove_event re = {
- .event = event,
- .detach_group = detach_group,
- };
+ lockdep_assert_held(&event->ctx->mutex);
- lockdep_assert_held(&ctx->mutex);
-
- if (!task) {
- /*
- * Per cpu events are removed via an smp call. The removal can
- * fail if the CPU is currently offline, but in that case we
- * already called __perf_remove_from_context from
- * perf_event_exit_cpu.
- */
- cpu_function_call(event->cpu, __perf_remove_from_context, &re);
- return;
- }
-
-retry:
- if (!task_function_call(task, __perf_remove_from_context, &re))
- return;
-
- raw_spin_lock_irq(&ctx->lock);
- /*
- * If we failed to find a running task, but find the context active now
- * that we've acquired the ctx->lock, retry.
- */
- if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
- /*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
- */
- task = ctx->task;
- goto retry;
- }
-
- /*
- * Since the task isn't running, its safe to remove the event, us
- * holding the ctx->lock ensures the task won't get scheduled in.
- */
- if (detach_group)
- perf_group_detach(event);
- list_del_event(event, ctx);
- raw_spin_unlock_irq(&ctx->lock);
+ event_function_call(event, __perf_remove_from_context, (void *)flags);
}
/*
* Cross CPU call to disable a performance event
*/
-int __perf_event_disable(void *info)
+static void __perf_event_disable(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- /*
- * If this is a per-task event, need to check whether this
- * event's task is the current task on this cpu.
- *
- * Can trigger due to concurrent perf_event_context_sched_out()
- * flipping contexts around.
- */
- if (ctx->task && cpuctx->task_ctx != ctx)
- return -EINVAL;
-
- raw_spin_lock(&ctx->lock);
-
- /*
- * If the event is on, turn it off.
- * If it is in error state, leave it in error state.
- */
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- update_group_times(event);
- if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
- else
- event_sched_out(event, cpuctx, ctx);
- event->state = PERF_EVENT_STATE_OFF;
- }
-
- raw_spin_unlock(&ctx->lock);
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ return;
- return 0;
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ update_group_times(event);
+ if (event == event->group_leader)
+ group_sched_out(event, cpuctx, ctx);
+ else
+ event_sched_out(event, cpuctx, ctx);
+ event->state = PERF_EVENT_STATE_OFF;
}
/*
@@ -1770,7 +1818,8 @@ int __perf_event_disable(void *info)
* remains valid. This condition is satisfied when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
- * goes to exit will block in sync_child_event.
+ * goes to exit will block in perf_event_exit_event().
+ *
* When called from perf_pending_event it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
@@ -1778,43 +1827,20 @@ int __perf_event_disable(void *info)
static void _perf_event_disable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
-
- if (!task) {
- /*
- * Disable the event on the cpu that it's on
- */
- cpu_function_call(event->cpu, __perf_event_disable, event);
- return;
- }
-
-retry:
- if (!task_function_call(task, __perf_event_disable, event))
- return;
raw_spin_lock_irq(&ctx->lock);
- /*
- * If the event is still active, we need to retry the cross-call.
- */
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (event->state <= PERF_EVENT_STATE_OFF) {
raw_spin_unlock_irq(&ctx->lock);
- /*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
- */
- task = ctx->task;
- goto retry;
- }
-
- /*
- * Since we have the lock this context can't be scheduled
- * in, so we can change the state safely.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_OFF;
+ return;
}
raw_spin_unlock_irq(&ctx->lock);
+
+ event_function_call(event, __perf_event_disable, NULL);
+}
+
+void perf_event_disable_local(struct perf_event *event)
+{
+ event_function_local(event, __perf_event_disable, NULL);
}
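
perf_event_disable_local() exists for callers that are already on the event's CPU with interrupts off, where bouncing through an IPI is pointless. The hw_breakpoint hunk later in this diff shows the intended selection pattern, repeated here for reference:

	/* from modify_user_hw_breakpoint(), further down in this diff */
	if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
		perf_event_disable_local(bp);
	else
		perf_event_disable(bp);
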
/*
@@ -1927,9 +1953,6 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
out:
perf_pmu_enable(event->pmu);
@@ -2048,13 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event,
event->tstamp_stopped = tstamp;
}
-static void task_ctx_sched_out(struct perf_event_context *ctx);
+static void ctx_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task);
+static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (!cpuctx->task_ctx)
+ return;
+
+ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+ return;
+
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+}
+
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
struct task_struct *task)
@@ -2067,10 +2104,22 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}
+static void ctx_resched(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *task_ctx)
+{
+ perf_pmu_disable(cpuctx->ctx.pmu);
+ if (task_ctx)
+ task_ctx_sched_out(cpuctx, task_ctx);
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_pmu_enable(cpuctx->ctx.pmu);
+}
+
/*
* Cross CPU call to install and enable a performance event
*
- * Must be called with ctx->mutex held
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
*/
static int __perf_install_in_context(void *info)
{
@@ -2078,72 +2127,59 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- struct task_struct *task = current;
-
- perf_ctx_lock(cpuctx, task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
-
- /*
- * If there was an active task_ctx schedule it out.
- */
- if (task_ctx)
- task_ctx_sched_out(task_ctx);
+ bool activate = true;
+ int ret = 0;
- /*
- * If the context we're installing events in is not the
- * active task_ctx, flip them.
- */
- if (ctx->task && task_ctx != ctx) {
- if (task_ctx)
- raw_spin_unlock(&task_ctx->lock);
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx->task) {
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- }
-
- if (task_ctx) {
- cpuctx->task_ctx = task_ctx;
- task = task_ctx->task;
- }
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /* If we're on the wrong CPU, try again */
+ if (task_cpu(ctx->task) != smp_processor_id()) {
+ ret = -ESRCH;
+ goto unlock;
+ }
- update_context_time(ctx);
- /*
- * update cgrp time only if current cgrp
- * matches event->cgrp. Must be done before
- * calling add_event_to_ctx()
- */
- update_cgrp_time_from_event(event);
+ /*
+ * If we're on the right CPU, see if the task we target is
+ * current; if not, we don't have to activate the ctx, a future
+ * context switch will do that for us.
+ */
+ if (ctx->task != current)
+ activate = false;
+ else
+ WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
- add_event_to_ctx(event, ctx);
+ } else if (task_ctx) {
+ raw_spin_lock(&task_ctx->lock);
+ }
- /*
- * Schedule everything back in
- */
- perf_event_sched_in(cpuctx, task_ctx, task);
+ if (activate) {
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ add_event_to_ctx(event, ctx);
+ ctx_resched(cpuctx, task_ctx);
+ } else {
+ add_event_to_ctx(event, ctx);
+ }
- perf_pmu_enable(cpuctx->ctx.pmu);
+unlock:
perf_ctx_unlock(cpuctx, task_ctx);
- return 0;
+ return ret;
}
/*
- * Attach a performance event to a context
+ * Attach a performance event to a context.
*
- * First we add the event to the list with the hardware enable bit
- * in event->hw_config cleared.
- *
- * If the event is attached to a task which is on a CPU we use a smp
- * call to enable it in the task context. The task might have been
- * scheduled away, but we check this in the smp call again.
+ * Very similar to event_function_call, see comment there.
*/
static void
perf_install_in_context(struct perf_event_context *ctx,
struct perf_event *event,
int cpu)
{
- struct task_struct *task = ctx->task;
+ struct task_struct *task = READ_ONCE(ctx->task);
lockdep_assert_held(&ctx->mutex);
@@ -2152,39 +2188,45 @@ perf_install_in_context(struct perf_event_context *ctx,
event->cpu = cpu;
if (!task) {
- /*
- * Per cpu events are installed via an smp call and
- * the install is always successful.
- */
cpu_function_call(cpu, __perf_install_in_context, event);
return;
}
-retry:
- if (!task_function_call(task, __perf_install_in_context, event))
+ /*
+ * Should not happen; we validate that the ctx is still alive before calling.
+ */
+ if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
return;
- raw_spin_lock_irq(&ctx->lock);
/*
- * If we failed to find a running task, but find the context active now
- * that we've acquired the ctx->lock, retry.
+ * Installing events is tricky because we cannot rely on ctx->is_active
+ * to be set in case this is the nr_events 0 -> 1 transition.
*/
- if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+again:
+ /*
+ * Cannot use task_function_call() because we need to run on the task's
+ * CPU regardless of whether it's current or not.
+ */
+ if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ task = ctx->task;
+ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
/*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
+ * Cannot happen because we already checked above (which also
+ * cannot happen), and we hold ctx->mutex, which serializes us
+ * against perf_event_exit_task_context().
*/
- task = ctx->task;
- goto retry;
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
}
-
+ raw_spin_unlock_irq(&ctx->lock);
/*
- * Since the task isn't running, its safe to add the event, us holding
- * the ctx->lock ensures the task won't get scheduled in.
+ * Since !ctx->is_active doesn't mean anything, we must IPI
+ * unconditionally.
*/
- add_event_to_ctx(event, ctx);
- raw_spin_unlock_irq(&ctx->lock);
+ goto again;
}
/*
@@ -2211,80 +2253,47 @@ static void __perf_event_mark_enabled(struct perf_event *event)
/*
* Cross CPU call to enable a performance event
*/
-static int __perf_event_enable(void *info)
+static void __perf_event_enable(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
struct perf_event *leader = event->group_leader;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- int err;
+ struct perf_event_context *task_ctx;
- /*
- * There's a time window between 'ctx->is_active' check
- * in perf_event_enable function and this place having:
- * - IRQs on
- * - ctx->lock unlocked
- *
- * where the task could be killed and 'ctx' deactivated
- * by perf_event_exit_task.
- */
- if (!ctx->is_active)
- return -EINVAL;
-
- raw_spin_lock(&ctx->lock);
- update_context_time(ctx);
-
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
- goto unlock;
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state <= PERF_EVENT_STATE_ERROR)
+ return;
- /*
- * set current task's cgroup time reference point
- */
- perf_cgroup_set_timestamp(current, ctx);
+ if (ctx->is_active)
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
__perf_event_mark_enabled(event);
+ if (!ctx->is_active)
+ return;
+
if (!event_filter_match(event)) {
if (is_cgroup_event(event))
perf_cgroup_defer_enabled(event);
- goto unlock;
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ return;
}
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
- goto unlock;
-
- if (!group_can_go_on(event, cpuctx, 1)) {
- err = -EEXIST;
- } else {
- if (event == leader)
- err = group_sched_in(event, cpuctx, ctx);
- else
- err = event_sched_in(event, cpuctx, ctx);
- }
-
- if (err) {
- /*
- * If this event can't go on and it's part of a
- * group, then the whole group has to come off.
- */
- if (leader != event) {
- group_sched_out(leader, cpuctx, ctx);
- perf_mux_hrtimer_restart(cpuctx);
- }
- if (leader->attr.pinned) {
- update_group_times(leader);
- leader->state = PERF_EVENT_STATE_ERROR;
- }
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ return;
}
-unlock:
- raw_spin_unlock(&ctx->lock);
+ task_ctx = cpuctx->task_ctx;
+ if (ctx->task)
+ WARN_ON_ONCE(task_ctx != ctx);
- return 0;
+ ctx_resched(cpuctx, task_ctx);
}
/*
@@ -2299,58 +2308,26 @@ unlock:
static void _perf_event_enable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
- if (!task) {
- /*
- * Enable the event on the cpu that it's on
- */
- cpu_function_call(event->cpu, __perf_event_enable, event);
+ raw_spin_lock_irq(&ctx->lock);
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state < PERF_EVENT_STATE_ERROR) {
+ raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_lock_irq(&ctx->lock);
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
- goto out;
-
/*
* If the event is in error state, clear that first.
- * That way, if we see the event in error state below, we
- * know that it has gone back into error state, as distinct
- * from the task having been scheduled away before the
- * cross-call arrived.
+ *
+ * That way, if we see the event in error state below, we know that it
+ * has gone back into error state, as distinct from the task having
+ * been scheduled away before the cross-call arrived.
*/
if (event->state == PERF_EVENT_STATE_ERROR)
event->state = PERF_EVENT_STATE_OFF;
-
-retry:
- if (!ctx->is_active) {
- __perf_event_mark_enabled(event);
- goto out;
- }
-
raw_spin_unlock_irq(&ctx->lock);
- if (!task_function_call(task, __perf_event_enable, event))
- return;
-
- raw_spin_lock_irq(&ctx->lock);
-
- /*
- * If the context is active and the event is still off,
- * we need to retry the cross-call.
- */
- if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
- /*
- * task could have been flipped by a concurrent
- * perf_event_context_sched_out()
- */
- task = ctx->task;
- goto retry;
- }
-
-out:
- raw_spin_unlock_irq(&ctx->lock);
+ event_function_call(event, __perf_event_enable, NULL);
}
/*
@@ -2400,25 +2377,49 @@ static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
- struct perf_event *event;
int is_active = ctx->is_active;
+ struct perf_event *event;
- ctx->is_active &= ~event_type;
- if (likely(!ctx->nr_events))
+ lockdep_assert_held(&ctx->lock);
+
+ if (likely(!ctx->nr_events)) {
+ /*
+ * See __perf_remove_from_context().
+ */
+ WARN_ON_ONCE(ctx->is_active);
+ if (ctx->task)
+ WARN_ON_ONCE(cpuctx->task_ctx);
return;
+ }
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
- if (!ctx->nr_active)
+ ctx->is_active &= ~event_type;
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!ctx->is_active)
+ cpuctx->task_ctx = NULL;
+ }
+
+ is_active ^= ctx->is_active; /* changed bits */
+
+ if (is_active & EVENT_TIME) {
+ /* update (and stop) ctx time */
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ }
+
+ if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
perf_pmu_disable(ctx->pmu);
- if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+ if (is_active & EVENT_PINNED) {
list_for_each_entry(event, &ctx->pinned_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
- if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+ if (is_active & EVENT_FLEXIBLE) {
list_for_each_entry(event, &ctx->flexible_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
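
ctx_sched_out() (and its ctx_sched_in() twin below) now derive "which event classes actually changed state" from an XOR of the old and new is_active masks, instead of re-testing event_type. A runnable reduction with illustrative flag values (the real enum event_type_t is internal to the perf core):

	#include <stdio.h>

	/* illustrative values; the real enum event_type_t is kernel-internal */
	enum { EVENT_PINNED = 1, EVENT_FLEXIBLE = 2, EVENT_TIME = 4 };

	int main(void)
	{
		int is_active = EVENT_PINNED | EVENT_FLEXIBLE | EVENT_TIME;
		int now_active = is_active & ~(EVENT_FLEXIBLE | EVENT_TIME);

		/* XOR of old and new leaves exactly the bits that changed */
		is_active ^= now_active;
		printf("changed bits: %#x\n", is_active); /* 0x6: FLEXIBLE|TIME */
		return 0;
	}
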
@@ -2576,17 +2577,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- /*
- * XXX do we need a memory barrier of sorts
- * wrt to rcu_dereference() of perf_event_ctxp
- */
- task->perf_event_ctxp[ctxn] = next_ctx;
- next->perf_event_ctxp[ctxn] = ctx;
- ctx->task = next;
- next_ctx->task = task;
+ WRITE_ONCE(ctx->task, next);
+ WRITE_ONCE(next_ctx->task, task);
swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ /*
+ * RCU_INIT_POINTER here is safe because we've not
+ * modified the ctx and the above modification of
+ * ctx->task and ctx->task_ctx_data are immaterial
+ * since those values are always verified under
+ * ctx->lock which we're now holding.
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+
do_switch = 0;
perf_event_sync_stat(ctx, next_ctx);
@@ -2599,8 +2604,7 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
+ task_ctx_sched_out(cpuctx, ctx);
raw_spin_unlock(&ctx->lock);
}
}
@@ -2695,20 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
perf_cgroup_sched_out(task, next);
}
-static void task_ctx_sched_out(struct perf_event_context *ctx)
-{
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- if (!cpuctx->task_ctx)
- return;
-
- if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
- return;
-
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
-}
-
/*
* Called with IRQs disabled
*/
@@ -2783,25 +2773,40 @@ ctx_sched_in(struct perf_event_context *ctx,
enum event_type_t event_type,
struct task_struct *task)
{
- u64 now;
int is_active = ctx->is_active;
+ u64 now;
+
+ lockdep_assert_held(&ctx->lock);
- ctx->is_active |= event_type;
if (likely(!ctx->nr_events))
return;
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
+ ctx->is_active |= (event_type | EVENT_TIME);
+ if (ctx->task) {
+ if (!is_active)
+ cpuctx->task_ctx = ctx;
+ else
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ }
+
+ is_active ^= ctx->is_active; /* changed bits */
+
+ if (is_active & EVENT_TIME) {
+ /* start ctx time */
+ now = perf_clock();
+ ctx->timestamp = now;
+ perf_cgroup_set_timestamp(task, ctx);
+ }
+
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+ if (is_active & EVENT_PINNED)
ctx_pinned_sched_in(ctx, cpuctx);
/* Then walk through the lower prio flexible groups */
- if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+ if (is_active & EVENT_FLEXIBLE)
ctx_flexible_sched_in(ctx, cpuctx);
}
@@ -2831,12 +2836,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* cpu flexible, task flexible.
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-
- if (ctx->nr_events)
- cpuctx->task_ctx = ctx;
-
- perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
-
+ perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
}
@@ -2858,6 +2858,16 @@ void __perf_event_task_sched_in(struct task_struct *prev,
struct perf_event_context *ctx;
int ctxn;
+ /*
+ * If cgroup events exist on this CPU, then we need to check if we have
+ * to switch in PMU state; cgroup events are system-wide mode only.
+ *
+ * Since cgroup events are CPU events, we must schedule these in before
+ * we schedule in the task events.
+ */
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+ perf_cgroup_sched_in(prev, task);
+
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (likely(!ctx))
@@ -2865,13 +2875,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
perf_event_context_sched_in(ctx, task);
}
- /*
- * if cgroup events exist on this CPU, then we need
- * to check if we have to switch in PMU state.
- * cgroup event are system-wide mode only
- */
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_in(prev, task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -3157,46 +3160,31 @@ static int event_enable_on_exec(struct perf_event *event,
static void perf_event_enable_on_exec(int ctxn)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_cpu_context *cpuctx;
struct perf_event *event;
unsigned long flags;
int enabled = 0;
- int ret;
local_irq_save(flags);
ctx = current->perf_event_ctxp[ctxn];
if (!ctx || !ctx->nr_events)
goto out;
- /*
- * We must ctxsw out cgroup events to avoid conflict
- * when invoking perf_task_event_sched_in() later on
- * in this function. Otherwise we end up trying to
- * ctxswin cgroup events which are already scheduled
- * in.
- */
- perf_cgroup_sched_out(current, NULL);
-
- raw_spin_lock(&ctx->lock);
- task_ctx_sched_out(ctx);
-
- list_for_each_entry(event, &ctx->event_list, event_entry) {
- ret = event_enable_on_exec(event, ctx);
- if (ret)
- enabled = 1;
- }
+ cpuctx = __get_cpu_context(ctx);
+ perf_ctx_lock(cpuctx, ctx);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ list_for_each_entry(event, &ctx->event_list, event_entry)
+ enabled |= event_enable_on_exec(event, ctx);
/*
- * Unclone this context if we enabled any event.
+ * Unclone and reschedule this context if we enabled any event.
*/
- if (enabled)
+ if (enabled) {
clone_ctx = unclone_ctx(ctx);
+ ctx_resched(cpuctx, ctx);
+ }
+ perf_ctx_unlock(cpuctx, ctx);
- raw_spin_unlock(&ctx->lock);
-
- /*
- * Also calls ctxswin for cgroup events, if any:
- */
- perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
@@ -3392,7 +3380,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
- INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}
static struct perf_event_context *
@@ -3579,11 +3566,13 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
static void unaccount_event(struct perf_event *event)
{
+ bool dec = false;
+
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
@@ -3593,17 +3582,30 @@ static void unaccount_event(struct perf_event *event)
if (event->attr.freq)
atomic_dec(&nr_freq_events);
if (event->attr.context_switch) {
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
atomic_dec(&nr_switch_events);
}
if (is_cgroup_event(event))
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
if (has_branch_stack(event))
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
+
+ if (dec) {
+ if (!atomic_add_unless(&perf_sched_count, -1, 1))
+ schedule_delayed_work(&perf_sched_work, HZ);
+ }
unaccount_event_cpu(event, event->cpu);
}
+static void perf_sched_delayed(struct work_struct *work)
+{
+ mutex_lock(&perf_sched_mutex);
+ if (atomic_dec_and_test(&perf_sched_count))
+ static_branch_disable(&perf_sched_events);
+ mutex_unlock(&perf_sched_mutex);
+}
+
/*
* The following implement mutual exclusion of events on "exclusive" pmus
* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -3614,7 +3616,7 @@ static void unaccount_event(struct perf_event *event)
* 3) two matching events on the same context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
- * __free_event()), the latter -- before the first perf_install_in_context().
+ * _free_event()), the latter -- before the first perf_install_in_context().
*/
static int exclusive_event_init(struct perf_event *event)
{
@@ -3689,29 +3691,6 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
-static void __free_event(struct perf_event *event)
-{
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
-
- if (event->destroy)
- event->destroy(event);
-
- if (event->ctx)
- put_ctx(event->ctx);
-
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
- }
-
- call_rcu(&event->rcu_head, free_event_rcu);
-}
-
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3733,7 +3712,25 @@ static void _free_event(struct perf_event *event)
if (is_cgroup_event(event))
perf_detach_cgroup(event);
- __free_event(event);
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
+
+ perf_event_free_bpf_prog(event);
+
+ if (event->destroy)
+ event->destroy(event);
+
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+ if (event->pmu) {
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
+ }
+
+ call_rcu(&event->rcu_head, free_event_rcu);
}
/*
@@ -3760,14 +3757,13 @@ static void perf_remove_from_owner(struct perf_event *event)
struct task_struct *owner;
rcu_read_lock();
- owner = ACCESS_ONCE(event->owner);
/*
- * Matches the smp_wmb() in perf_event_exit_task(). If we observe
- * !owner it means the list deletion is complete and we can indeed
- * free this event, otherwise we need to serialize on
+ * Matches the smp_store_release() in perf_event_exit_task(). If we
+ * observe !owner it means the list deletion is complete and we can
+ * indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- smp_read_barrier_depends();
+ owner = lockless_dereference(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
@@ -3795,8 +3791,10 @@ static void perf_remove_from_owner(struct perf_event *event)
* ensured they're done, and we can proceed with freeing the
* event.
*/
- if (event->owner)
+ if (event->owner) {
list_del_init(&event->owner_entry);
+ smp_store_release(&event->owner, NULL);
+ }
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
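
The ACCESS_ONCE() + smp_read_barrier_depends() pair becomes lockless_dereference() on this reader side, paired with smp_store_release() on the writer side (see perf_event_exit_task further down). A portable, runnable approximation with C11 atomics; an acquire load stands in for the dependency-ordered load, which is stronger than strictly required but correct, and lifetime management (RCU, refcounts) is elided:

	#include <stdatomic.h>
	#include <stdio.h>
	#include <pthread.h>

	struct owner { pthread_mutex_t mutex; };

	static struct owner task = { PTHREAD_MUTEX_INITIALIZER };
	static _Atomic(struct owner *) event_owner = &task;

	static void *exit_side(void *arg)
	{
		/* ... list deletion happens before the release store ... */
		atomic_store_explicit(&event_owner, NULL, memory_order_release);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, exit_side, NULL);

		/* reader: a non-NULL acquire load sees everything the writer
		 * did before publishing, so serializing on the owner's mutex
		 * is safe; NULL means list deletion already finished */
		struct owner *o = atomic_load_explicit(&event_owner,
						       memory_order_acquire);
		if (o) {
			pthread_mutex_lock(&o->mutex);
			puts("owner alive: serialize on owner mutex");
			pthread_mutex_unlock(&o->mutex);
		} else {
			puts("list deletion complete: free the event");
		}

		pthread_join(t, NULL);
		return 0;
	}
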
@@ -3804,37 +3802,111 @@ static void perf_remove_from_owner(struct perf_event *event)
static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx;
-
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ _free_event(event);
+}
+
+/*
+ * Kill an event dead; while event::refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *child, *tmp;
+
+ /*
+ * If we got here through err_file: fput(event_file); we will not have
+ * attached to a context yet.
+ */
+ if (!ctx) {
+ WARN_ON_ONCE(event->attach_state &
+ (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+ goto no_ctx;
+ }
+
if (!is_kernel_event(event))
perf_remove_from_owner(event);
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ perf_remove_from_context(event, DETACH_GROUP);
+
+ raw_spin_lock_irq(&ctx->lock);
/*
- * There are two ways this annotation is useful:
+ * Mark this event as STATE_DEAD; there is no external reference to it
+ * anymore.
*
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
+ * Anybody acquiring event->child_mutex after the below loop _must_
+ * also see this, most importantly inherit_event() which will avoid
+ * placing more children on the list.
*
- * 2) there is a lock-inversion with mmap_sem through
- * perf_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
+ * Thus this guarantees that we will in fact observe and kill _ALL_
+ * child events.
*/
- ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
- WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, true);
+ event->state = PERF_EVENT_STATE_DEAD;
+ raw_spin_unlock_irq(&ctx->lock);
+
perf_event_ctx_unlock(event, ctx);
- _free_event(event);
-}
+again:
+ mutex_lock(&event->child_mutex);
+ list_for_each_entry(child, &event->child_list, child_list) {
-int perf_event_release_kernel(struct perf_event *event)
-{
- put_event(event);
+ /*
+ * Cannot change; child events are not migrated, see the
+ * comment with perf_event_ctx_lock_nested().
+ */
+ ctx = lockless_dereference(child->ctx);
+ /*
+ * Since child_mutex nests inside ctx::mutex, we must jump
+ * through hoops. We start by grabbing a reference on the ctx.
+ *
+ * Since the event cannot get freed while we hold the
+ * child_mutex, the context must also exist and have a !0
+ * reference count.
+ */
+ get_ctx(ctx);
+
+ /*
+ * Now that we have a ctx ref, we can drop child_mutex, and
+ * acquire ctx::mutex without fear of it going away. Then we
+ * can re-acquire child_mutex.
+ */
+ mutex_unlock(&event->child_mutex);
+ mutex_lock(&ctx->mutex);
+ mutex_lock(&event->child_mutex);
+
+ /*
+ * Now that we hold ctx::mutex and child_mutex, revalidate our
+ * state, if child is still the first entry, it didn't get freed
+ * and we can continue doing so.
+ */
+ tmp = list_first_entry_or_null(&event->child_list,
+ struct perf_event, child_list);
+ if (tmp == child) {
+ perf_remove_from_context(child, DETACH_GROUP);
+ list_del(&child->child_list);
+ free_event(child);
+ /*
+ * This matches the refcount bump in inherit_event();
+ * this can't be the last reference.
+ */
+ put_event(event);
+ }
+
+ mutex_unlock(&event->child_mutex);
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+ mutex_unlock(&event->child_mutex);
+
+no_ctx:
+ put_event(event); /* Must be the 'last' reference */
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
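
The child-reaping loop above has to take ctx::mutex while already holding child_mutex, which nest the other way around everywhere else. The standard escape, which the loop implements, is: pin the object, drop the inner lock, acquire both in the correct order, then revalidate what the inner lock protected. A runnable toy of that dance with plain pthread mutexes (the reference counting is elided):

	#include <pthread.h>
	#include <stdio.h>

	/* 'inner' normally nests inside 'outer'; to take 'outer' while
	 * holding 'inner' we must drop 'inner', reacquire in order, then
	 * revalidate */
	static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* ctx::mutex  */
	static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER; /* child_mutex */
	static int first_child = 1; /* stand-in for list_first_entry_or_null() */

	int main(void)
	{
		pthread_mutex_lock(&inner);
		int snapshot = first_child;	/* noted under 'inner' */
		/* real code also takes a reference so 'snapshot' can't die */
		pthread_mutex_unlock(&inner);

		pthread_mutex_lock(&outer);	/* correct order: outer ... */
		pthread_mutex_lock(&inner);	/* ... then inner */

		if (first_child == snapshot)
			puts("still first entry: safe to tear it down");
		else
			puts("raced with a concurrent free: go around again");

		pthread_mutex_unlock(&inner);
		pthread_mutex_unlock(&outer);
		return 0;
	}
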
@@ -3844,46 +3916,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
*/
static int perf_release(struct inode *inode, struct file *file)
{
- put_event(file->private_data);
+ perf_event_release_kernel(file->private_data);
return 0;
}
-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = container_of(work, struct perf_event_context,
- orphans_remove.work);
-
- mutex_lock(&ctx->mutex);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
- struct perf_event *parent_event = event->parent;
-
- if (!is_orphaned_child(event))
- continue;
-
- perf_remove_from_context(event, true);
-
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- free_event(event);
- put_event(parent_event);
- }
-
- raw_spin_lock_irq(&ctx->lock);
- ctx->orphans_remove_sched = false;
- raw_spin_unlock_irq(&ctx->lock);
- mutex_unlock(&ctx->mutex);
-
- put_ctx(ctx);
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
@@ -4027,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event)
{
bool no_children;
- if (event->state != PERF_EVENT_STATE_EXIT)
+ if (event->state > PERF_EVENT_STATE_EXIT)
return false;
mutex_lock(&event->child_mutex);
@@ -4112,7 +4148,7 @@ static void _perf_event_reset(struct perf_event *event)
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
- * in sync_child_event if it goes to exit, thus satisfying the
+ * in perf_event_exit_event() if it goes to exit, thus satisfying the
* task existence requirements of perf_event_enable/disable.
*/
static void perf_event_for_each_child(struct perf_event *event,
@@ -4144,20 +4180,14 @@ static void perf_event_for_each(struct perf_event *event,
perf_event_for_each_child(sibling, func);
}
-struct period_event {
- struct perf_event *event;
- u64 value;
-};
-
-static int __perf_event_period(void *info)
+static void __perf_event_period(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct period_event *pe = info;
- struct perf_event *event = pe->event;
- struct perf_event_context *ctx = event->ctx;
- u64 value = pe->value;
+ u64 value = *((u64 *)info);
bool active;
- raw_spin_lock(&ctx->lock);
if (event->attr.freq) {
event->attr.sample_freq = value;
} else {
@@ -4177,16 +4207,10 @@ static int __perf_event_period(void *info)
event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu);
}
- raw_spin_unlock(&ctx->lock);
-
- return 0;
}
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
- struct period_event pe = { .event = event, };
- struct perf_event_context *ctx = event->ctx;
- struct task_struct *task;
u64 value;
if (!is_sampling_event(event))
@@ -4201,34 +4225,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL;
- task = ctx->task;
- pe.value = value;
-
- if (!task) {
- cpu_function_call(event->cpu, __perf_event_period, &pe);
- return 0;
- }
-
-retry:
- if (!task_function_call(task, __perf_event_period, &pe))
- return 0;
-
- raw_spin_lock_irq(&ctx->lock);
- if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
- task = ctx->task;
- goto retry;
- }
-
- if (event->attr.freq) {
- event->attr.sample_freq = value;
- } else {
- event->attr.sample_period = value;
- event->hw.sample_period = value;
- }
-
- local64_set(&event->hw.period_left, 0);
- raw_spin_unlock_irq(&ctx->lock);
+ event_function_call(event, __perf_event_period, &value);
return 0;
}
@@ -4940,9 +4937,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
struct perf_event *event = filp->private_data;
int retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (retval < 0)
return retval;
@@ -5000,7 +4997,7 @@ static void perf_pending_event(struct irq_work *entry)
if (event->pending_disable) {
event->pending_disable = 0;
- __perf_event_disable(event);
+ perf_event_disable_local(event);
}
if (event->pending_wakeup) {
@@ -7821,11 +7818,13 @@ static void account_event_cpu(struct perf_event *event, int cpu)
static void account_event(struct perf_event *event)
{
+ bool inc = false;
+
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
@@ -7838,12 +7837,35 @@ static void account_event(struct perf_event *event)
}
if (event->attr.context_switch) {
atomic_inc(&nr_switch_events);
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
}
if (has_branch_stack(event))
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
if (is_cgroup_event(event))
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
+
+ if (inc) {
+ if (atomic_inc_not_zero(&perf_sched_count))
+ goto enabled;
+
+ mutex_lock(&perf_sched_mutex);
+ if (!atomic_read(&perf_sched_count)) {
+ static_branch_enable(&perf_sched_events);
+ /*
+ * Guarantee that all CPUs observe they key change and
+ * call the perf scheduling hooks before proceeding to
+ * install events that need them.
+ */
+ synchronize_sched();
+ }
+ /*
+ * Now that we have waited for the sync_sched(), allow further
+ * increments to by-pass the mutex.
+ */
+ atomic_inc(&perf_sched_count);
+ mutex_unlock(&perf_sched_mutex);
+ }
+enabled:
account_event_cpu(event, event->cpu);
}
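
account_event() above (with unaccount_event() as its inverse) replaces the rate-limited jump label with a hand-rolled "enable once, refcount thereafter" gate: atomic_inc_not_zero() is the lockless fast path, and the first user takes a mutex, flips the static branch, synchronizes, and only then bumps the count so later callers can bypass the mutex. A runnable user-space reduction, with the static branch reduced to a plain flag and the synchronize_sched() step elided:

	#include <stdatomic.h>
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int sched_count;
	static pthread_mutex_t sched_mutex = PTHREAD_MUTEX_INITIALIZER;
	static bool branch_enabled;	/* stands in for the static branch */

	/* atomic_inc_not_zero(): only increment a non-zero count */
	static bool inc_not_zero(atomic_int *v)
	{
		int old = atomic_load(v);

		while (old && !atomic_compare_exchange_weak(v, &old, old + 1))
			;
		return old != 0;
	}

	static void account(void)
	{
		if (inc_not_zero(&sched_count))
			return;			/* branch already enabled */

		pthread_mutex_lock(&sched_mutex);
		if (atomic_load(&sched_count) == 0) {
			branch_enabled = true;	/* static_branch_enable() */
			/* synchronize_sched() would go here in the kernel */
		}
		atomic_fetch_add(&sched_count, 1);
		pthread_mutex_unlock(&sched_mutex);
	}

	int main(void)
	{
		account();
		account();	/* second caller takes the fast path */
		printf("enabled=%d count=%d\n",
		       branch_enabled, atomic_load(&sched_count));
		return 0;
	}
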
@@ -8462,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (move_group) {
gctx = group_leader->ctx;
mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ if (gctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
} else {
mutex_lock(&ctx->mutex);
}
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
if (!perf_event_validate_size(event)) {
err = -E2BIG;
goto err_locked;
@@ -8490,11 +8521,11 @@ SYSCALL_DEFINE5(perf_event_open,
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- perf_remove_from_context(group_leader, false);
+ perf_remove_from_context(group_leader, 0);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling, false);
+ perf_remove_from_context(sibling, 0);
put_ctx(gctx);
}
@@ -8547,6 +8578,8 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__header_size(event);
perf_event__id_header_size(event);
+ event->owner = current;
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
@@ -8556,8 +8589,6 @@ SYSCALL_DEFINE5(perf_event_open,
put_online_cpus();
- event->owner = current;
-
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
@@ -8582,7 +8613,12 @@ err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ /*
+ * If event_file is set, the fput() above will have called ->release()
+ * and that will take care of freeing the event.
+ */
+ if (!event_file)
+ free_event(event);
err_cpus:
put_online_cpus();
err_task:
@@ -8624,7 +8660,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
}
/* Mark owner so we could distinguish it from user events. */
- event->owner = EVENT_OWNER_KERNEL;
+ event->owner = TASK_TOMBSTONE;
account_event(event);
@@ -8636,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_unlock;
+ }
+
if (!exclusive_event_installable(event, ctx)) {
- mutex_unlock(&ctx->mutex);
- perf_unpin_context(ctx);
- put_ctx(ctx);
err = -EBUSY;
- goto err_free;
+ goto err_unlock;
}
perf_install_in_context(ctx, event, cpu);
@@ -8650,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_unlock:
+ mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_free:
free_event(event);
err:
@@ -8674,7 +8716,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event, false);
+ perf_remove_from_context(event, 0);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
@@ -8741,33 +8783,15 @@ static void sync_child_event(struct perf_event *child_event,
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);
-
- /*
- * Remove this event from the parent's list
- */
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&child_event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- /*
- * Make sure user/parent get notified, that we just
- * lost one event.
- */
- perf_event_wakeup(parent_event);
-
- /*
- * Release the parent event, if this was the last
- * reference to it.
- */
- put_event(parent_event);
}
static void
-__perf_event_exit_task(struct perf_event *child_event,
- struct perf_event_context *child_ctx,
- struct task_struct *child)
+perf_event_exit_event(struct perf_event *child_event,
+ struct perf_event_context *child_ctx,
+ struct task_struct *child)
{
+ struct perf_event *parent_event = child_event->parent;
+
/*
* Do not destroy the 'original' grouping; because of the context
* switch optimization the original events could've ended up in a
@@ -8780,57 +8804,86 @@ __perf_event_exit_task(struct perf_event *child_event,
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- perf_remove_from_context(child_event, !!child_event->parent);
+ raw_spin_lock_irq(&child_ctx->lock);
+ WARN_ON_ONCE(child_ctx->is_active);
+
+ if (parent_event)
+ perf_group_detach(child_event);
+ list_del_event(child_event, child_ctx);
+ child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+ raw_spin_unlock_irq(&child_ctx->lock);
/*
- * It can happen that the parent exits first, and has events
- * that are still around due to the child reference. These
- * events need to be zapped.
+ * Parent events are governed by their filedesc, retain them.
*/
- if (child_event->parent) {
- sync_child_event(child_event, child);
- free_event(child_event);
- } else {
- child_event->state = PERF_EVENT_STATE_EXIT;
+ if (!parent_event) {
perf_event_wakeup(child_event);
+ return;
}
+ /*
+ * Child events can be cleaned up.
+ */
+
+ sync_child_event(child_event, child);
+
+ /*
+ * Remove this event from the parent's list
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&child_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Kick perf_poll() for is_event_hup().
+ */
+ perf_event_wakeup(parent_event);
+ free_event(child_event);
+ put_event(parent_event);
}
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
- struct perf_event *child_event, *next;
struct perf_event_context *child_ctx, *clone_ctx = NULL;
- unsigned long flags;
+ struct perf_event *child_event, *next;
- if (likely(!child->perf_event_ctxp[ctxn]))
+ WARN_ON_ONCE(child != current);
+
+ child_ctx = perf_pin_task_context(child, ctxn);
+ if (!child_ctx)
return;
- local_irq_save(flags);
/*
- * We can't reschedule here because interrupts are disabled,
- * and either child is current or it is a task that can't be
- * scheduled, so we are now safe from rescheduling changing
- * our context.
+ * In order to reduce the amount of trickiness in ctx tear-down, we hold
+ * ctx::mutex over the entire thing. This serializes against almost
+ * everything that wants to access the ctx.
+ *
+ * The exception is sys_perf_event_open() /
+ * perf_event_create_kernel_counter() which does find_get_context()
+ * without ctx::mutex (it cannot because of the move_group double mutex
+ * lock thing). See the comments in perf_install_in_context().
*/
- child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+ mutex_lock(&child_ctx->mutex);
/*
- * Take the context lock here so that if find_get_context is
- * reading child->perf_event_ctxp, we wait until it has
- * incremented the context's refcount before we do put_ctx below.
+ * In a single ctx::lock section, de-schedule the events and detach the
+ * context from the task such that we cannot ever get it scheduled back
+ * in.
*/
- raw_spin_lock(&child_ctx->lock);
- task_ctx_sched_out(child_ctx);
- child->perf_event_ctxp[ctxn] = NULL;
+ raw_spin_lock_irq(&child_ctx->lock);
+ task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
/*
- * If this context is a clone; unclone it so it can't get
- * swapped to another process while we're removing all
- * the events from it.
+ * Now that the context is inactive, destroy the task <-> ctx relation
+ * and mark the context dead.
*/
+ RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+ put_ctx(child_ctx); /* cannot be last */
+ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
+ put_task_struct(current); /* cannot be last */
+
clone_ctx = unclone_ctx(child_ctx);
- update_context_time(child_ctx);
- raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+ raw_spin_unlock_irq(&child_ctx->lock);
if (clone_ctx)
put_ctx(clone_ctx);
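
Once the context is detached, ctx->task points at TASK_TOMBSTONE: a sentinel that is neither NULL (which means "per-CPU context") nor a live task, so every later taker of ctx::mutex can cheaply detect a dead context, as the -ESRCH checks earlier in this diff do. A runnable sketch of the three-way test; the sentinel here is a dummy object's address, and the kernel's actual TASK_TOMBSTONE representation may differ:

	#include <stdio.h>

	struct task { int pid; };

	/* illustrative sentinel: distinct from NULL and any live task */
	static struct task tombstone;
	#define TASK_TOMBSTONE (&tombstone)

	static void check(struct task *ctx_task)
	{
		if (ctx_task == TASK_TOMBSTONE)
			puts("task exited: fail with -ESRCH");
		else if (!ctx_task)
			puts("per-CPU context");
		else
			printf("live task %d\n", ctx_task->pid);
	}

	int main(void)
	{
		struct task t = { .pid = 42 };

		check(&t);
		check(NULL);
		check(TASK_TOMBSTONE);
		return 0;
	}
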
@@ -8842,20 +8895,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
perf_event_task(child, child_ctx, 0);
- /*
- * We can recurse on the same lock type through:
- *
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event()
- * mutex_lock(&ctx->mutex)
- *
- * But since its the parent context it won't be the same instance.
- */
- mutex_lock(&child_ctx->mutex);
-
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
+ perf_event_exit_event(child_event, child_ctx, child);
mutex_unlock(&child_ctx->mutex);
@@ -8880,8 +8921,7 @@ void perf_event_exit_task(struct task_struct *child)
* the owner, closes a race against perf_release() where
* we need to serialize on the owner->perf_event_mutex.
*/
- smp_wmb();
- event->owner = NULL;
+ smp_store_release(&event->owner, NULL);
}
mutex_unlock(&child->perf_event_mutex);
@@ -8964,21 +9004,20 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
-struct perf_event *perf_event_get(unsigned int fd)
+struct file *perf_event_get(unsigned int fd)
{
- int err;
- struct fd f;
- struct perf_event *event;
+ struct file *file;
- err = perf_fget_light(fd, &f);
- if (err)
- return ERR_PTR(err);
+ file = fget_raw(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
- event = f.file->private_data;
- atomic_long_inc(&event->refcount);
- fdput(f);
+ if (file->f_op != &perf_fops) {
+ fput(file);
+ return ERR_PTR(-EBADF);
+ }
- return event;
+ return file;
}
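
Returning the struct file shifts lifetime management to the caller: the file reference now pins the event, rather than a raw bump of the event refcount. A hypothetical caller might look like this (sketch only; dropping the reference with fput() is the caller's responsibility):

	/* hypothetical caller, sketch only */
	struct file *file = perf_event_get(fd);
	struct perf_event *event;

	if (IS_ERR(file))
		return PTR_ERR(file);

	event = file->private_data;
	/* ... use event; the file reference keeps it alive ... */
	fput(file);
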
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
@@ -9021,8 +9060,16 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ /*
+ * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+ * must be under the same lock in order to serialize against
+ * perf_event_release_kernel(), such that either we must observe
+ * is_orphaned_event() or they will observe us on the child_list.
+ */
+ mutex_lock(&parent_event->child_mutex);
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ mutex_unlock(&parent_event->child_mutex);
free_event(child_event);
return NULL;
}
@@ -9070,8 +9117,6 @@ inherit_event(struct perf_event *parent_event,
/*
* Link this into the parent event's child list
*/
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);
@@ -9276,7 +9321,7 @@ static void perf_event_init_cpu(int cpu)
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
- if (swhash->hlist_refcount > 0) {
+ if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
struct swevent_hlist *hlist;
hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -9289,13 +9334,14 @@ static void perf_event_init_cpu(int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
- struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event *event;
- rcu_read_lock();
- list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
- __perf_remove_from_context(&re);
- rcu_read_unlock();
+ raw_spin_lock(&ctx->lock);
+ list_for_each_entry(event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+ raw_spin_unlock(&ctx->lock);
}
static void perf_event_exit_cpu_context(int cpu)
@@ -9351,11 +9397,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- case CPU_DOWN_FAILED:
perf_event_init_cpu(cpu);
break;
- case CPU_UP_CANCELED:
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
@@ -9384,9 +9428,6 @@ void __init perf_event_init(void)
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&perf_sched_events, HZ);
-
/*
* Build time assertion that we keep the data_head at the intended
* location. IOW, validation we got the __reserved[] size right.
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 92ce5f4cc..3f8cb1e14 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
* current task.
*/
if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
- __perf_event_disable(bp);
+ perf_event_disable_local(bp);
else
perf_event_disable(bp);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index adfdc0536..1faad2cfd 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx)
__free_page(page);
}
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+ int pg;
+
+ if (rb->aux_priv) {
+ rb->free_aux(rb->aux_priv);
+ rb->free_aux = NULL;
+ rb->aux_priv = NULL;
+ }
+
+ if (rb->aux_nr_pages) {
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
+
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+ }
+}
+
int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
@@ -547,30 +566,11 @@ out:
if (!ret)
rb->aux_pgoff = pgoff;
else
- rb_free_aux(rb);
+ __rb_free_aux(rb);
return ret;
}
-static void __rb_free_aux(struct ring_buffer *rb)
-{
- int pg;
-
- if (rb->aux_priv) {
- rb->free_aux(rb->aux_priv);
- rb->free_aux = NULL;
- rb->aux_priv = NULL;
- }
-
- if (rb->aux_nr_pages) {
- for (pg = 0; pg < rb->aux_nr_pages; pg++)
- rb_free_aux_page(rb, pg);
-
- kfree(rb->aux_pages);
- rb->aux_nr_pages = 0;
- }
-}
-
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount))
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7dad84913..016767918 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ false);
if (err)
return err;
@@ -175,12 +176,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
goto unlock;
get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr);
- mem_cgroup_commit_charge(kpage, memcg, false);
+ page_add_new_anon_rmap(kpage, vma, addr, false);
+ mem_cgroup_commit_charge(kpage, memcg, false, false);
lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter_file(page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
+ mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/exit.c b/kernel/exit.c
index 07110c602..10e088237 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,8 +59,6 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-static void exit_mm(struct task_struct *tsk);
-
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
@@ -1120,8 +1118,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p) &&
- !(p->jobctl & JOBCTL_LISTENING))
+ if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
diff --git a/kernel/fork.c b/kernel/fork.c
index 0b59aed29..f91740137 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -138,7 +138,7 @@ static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
{
- return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node);
+ return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}
static inline void free_task_struct(struct task_struct *tsk)
@@ -300,9 +300,9 @@ void __init fork_init(void)
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
/* create a slab on which task_structs can be allocated */
- task_struct_cachep =
- kmem_cache_create("task_struct", arch_task_struct_size,
- ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+ task_struct_cachep = kmem_cache_create("task_struct",
+ arch_task_struct_size, ARCH_MIN_TASKALIGN,
+ SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
#endif
/* do the arch specific task caches init */
@@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
mm->total_vm = oldmm->total_vm;
- mm->shared_vm = oldmm->shared_vm;
+ mm->data_vm = oldmm->data_vm;
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
@@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
- vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
- -vma_pages(mpnt));
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
charge = 0;
@@ -465,7 +464,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
- vma_get_file(tmp);
+ get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
i_mmap_lock_write(mapping);
@@ -1250,7 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1349,9 +1347,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqlock_init(&p->vtime_seqlock);
+ seqcount_init(&p->vtime_seqcount);
p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_SLEEPING;
+ p->vtime_snap_whence = VTIME_INACTIVE;
#endif
#if defined(SPLIT_RSS_COUNTING)
@@ -1527,7 +1525,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
- retval = cgroup_can_fork(p, cgrp_ss_priv);
+ retval = cgroup_can_fork(p);
if (retval)
goto bad_fork_free_pid;
@@ -1609,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p, cgrp_ss_priv);
+ cgroup_post_fork(p);
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1619,7 +1617,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
bad_fork_cancel_cgroup:
- cgroup_cancel_fork(p, cgrp_ss_priv);
+ cgroup_cancel_fork(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -1849,16 +1847,19 @@ void __init proc_caches_init(void)
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
- SLAB_NOTRACK, sighand_ctor);
+ SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
/*
* FIXME! The "sizeof(struct mm_struct)" currently includes the
* whole struct cpumask for the OFFSTACK case. We could change
@@ -1868,8 +1869,9 @@ void __init proc_caches_init(void)
*/
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
- vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
+ vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
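
The fork.c hunks above all make the same change: caches whose objects should be charged to the kernel memory cgroup gain SLAB_ACCOUNT at creation time. For any new cache the pattern is one extra flag; the cache and struct names below are hypothetical:

	example_cachep = kmem_cache_create("example_cache",
			sizeof(struct example), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
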
diff --git a/kernel/futex.c b/kernel/futex.c
index 461c72b2d..5d6ce6413 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *page_head;
+ struct page *page;
+ struct address_space *mapping;
int err, ro = 0;
/*
@@ -519,46 +520,9 @@ again:
else
err = 0;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page_head = page;
- if (unlikely(PageTail(page))) {
- put_page(page);
- /* serialize against __split_huge_page_splitting() */
- local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
- page_head = compound_head(page);
- /*
- * page_head is valid pointer but we must pin
- * it before taking the PG_lock and/or
- * PG_compound_lock. The moment we re-enable
- * irqs __split_huge_page_splitting() can
- * return and the head page can be freed from
- * under us. We can't take the PG_lock and/or
- * PG_compound_lock on a page that could be
- * freed from under us.
- */
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
- local_irq_enable();
- } else {
- local_irq_enable();
- goto again;
- }
- }
-#else
- page_head = compound_head(page);
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
-#endif
-
- lock_page(page_head);
-
+ lock_page(page);
/*
- * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
@@ -570,12 +534,13 @@ again:
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page_head->mapping.
+ * an unlikely race, but we do need to retry for page->mapping.
*/
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
- unlock_page(page_head);
- put_page(page_head);
+ mapping = compound_head(page)->mapping;
+ if (!mapping) {
+ int shmem_swizzled = PageSwapCache(page);
+ unlock_page(page);
+ put_page(page);
if (shmem_swizzled)
goto again;
return -EFAULT;
@@ -588,7 +553,7 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page_head)) {
+ if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
@@ -603,15 +568,15 @@ again:
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
+ key->shared.inode = mapping->host;
key->shared.pgoff = basepage_index(page);
}
get_futex_key_refs(key); /* implies MB (B) */
out:
- unlock_page(page_head);
- put_page(page_head);
+ unlock_page(page);
+ put_page(page);
return err;
}
@@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
down_read(&mm->mmap_sem);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
- FAULT_FLAG_WRITE);
+ FAULT_FLAG_WRITE, NULL);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
@@ -725,9 +690,12 @@ static struct futex_pi_state * alloc_pi_state(void)
}
/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ *
* Must be called with the hb lock held.
*/
-static void free_pi_state(struct futex_pi_state *pi_state)
+static void put_pi_state(struct futex_pi_state *pi_state)
{
if (!pi_state)
return;
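
The rename from free_pi_state() to put_pi_state() matches the usual get/put convention: the function only frees (or caches) the object once the final reference is dropped. A generic, hedged sketch of that pattern with hypothetical names, not the futex code itself:

#include <linux/atomic.h>
#include <linux/slab.h>

struct obj {
	atomic_t refcount;
};

static void put_obj(struct obj *o)
{
	if (!o)
		return;

	/* Only the last put actually releases the object. */
	if (atomic_dec_and_test(&o->refcount))
		kfree(o);
}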
@@ -1223,7 +1191,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (pi_state->owner != current)
return -EINVAL;
- raw_spin_lock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
@@ -1249,22 +1217,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else if (curval != uval)
ret = -EINVAL;
if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
}
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
- raw_spin_lock_irq(&new_owner->pi_lock);
+ raw_spin_lock(&new_owner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &new_owner->pi_state_list);
pi_state->owner = new_owner;
- raw_spin_unlock_irq(&new_owner->pi_lock);
+ raw_spin_unlock(&new_owner->pi_lock);
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
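
The lock-mode swap above moves interrupt disabling out to the pi_mutex wait_lock, so the nested pi_lock sections can drop their _irq suffix. A minimal sketch of the resulting nesting, with hypothetical structure names:

#include <linux/spinlock.h>

struct demo_state {
	raw_spinlock_t wait_lock;
	raw_spinlock_t pi_lock;
};

static void demo_nested(struct demo_state *s)
{
	raw_spin_lock_irq(&s->wait_lock);	/* outer lock disables irqs */
	raw_spin_lock(&s->pi_lock);		/* inner lock: irqs already off */
	/* ... manipulate the owner's pi list ... */
	raw_spin_unlock(&s->pi_lock);
	raw_spin_unlock_irq(&s->wait_lock);
}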
@@ -1706,31 +1674,35 @@ retry_private:
* exist yet, look it up one more time to ensure we have a
* reference to it. If the lock was taken, ret contains the
* vpid of the top waiter task.
+ * If the lock was not taken, we have pi_state and an initial
+ * refcount on it. In case of an error we have nothing.
*/
if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
/*
- * If we acquired the lock, then the user
- * space value of uaddr2 should be vpid. It
- * cannot be changed by the top waiter as it
- * is blocked on hb2 lock if it tries to do
- * so. If something fiddled with it behind our
- * back the pi state lookup might unearth
- * it. So we rather use the known value than
- * rereading and handing potential crap to
- * lookup_pi_state.
+ * If we acquired the lock, then the user space value
+ * of uaddr2 should be vpid. It cannot be changed by
+ * the top waiter as it is blocked on hb2 lock if it
+ * tries to do so. If something fiddled with it behind
+ * our back the pi state lookup might unearth it. So
+ * we rather use the known value than rereading and
+ * handing potential crap to lookup_pi_state.
+ *
+ * If that call succeeds then we have pi_state and an
+ * initial refcount on it.
*/
ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
}
switch (ret) {
case 0:
+ /* We hold a reference on the pi state. */
break;
+
+ /* If the above failed, then pi_state is NULL */
case -EFAULT:
- free_pi_state(pi_state);
- pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1746,8 +1718,6 @@ retry_private:
* exit to complete.
* - The user space value changed.
*/
- free_pi_state(pi_state);
- pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1801,30 +1771,58 @@ retry_private:
* of requeue_pi if we couldn't acquire the lock atomically.
*/
if (requeue_pi) {
- /* Prepare the waiter to take the rt_mutex. */
+ /*
+ * Prepare the waiter to take the rt_mutex. Take a
+ * refcount on the pi_state and store the pointer in
+ * the futex_q object of the waiter.
+ */
atomic_inc(&pi_state->refcount);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
this->task);
if (ret == 1) {
- /* We got the lock. */
+ /*
+ * We got the lock. We do neither drop the
+ * refcount on pi_state nor clear
+ * this->pi_state because the waiter needs the
+ * pi_state for cleaning up the user space
+ * value. It will drop the refcount after
+ * doing so.
+ */
requeue_pi_wake_futex(this, &key2, hb2);
drop_count++;
continue;
} else if (ret) {
- /* -EDEADLK */
+ /*
+ * rt_mutex_start_proxy_lock() detected a
+ * potential deadlock when we tried to queue
+ * that waiter. Drop the pi_state reference
+ * which we took above and remove the pointer
+ * to the state from the waiters futex_q
+ * object.
+ */
this->pi_state = NULL;
- free_pi_state(pi_state);
- goto out_unlock;
+ put_pi_state(pi_state);
+ /*
+ * We stop queueing more waiters and let user
+ * space deal with the mess.
+ */
+ break;
}
}
requeue_futex(this, hb1, hb2, &key2);
drop_count++;
}
+ /*
+ * We took an extra initial reference to the pi_state either
+ * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
+ * need to drop it here again.
+ */
+ put_pi_state(pi_state);
+
out_unlock:
- free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
hb_waiters_dec(hb2);
@@ -1973,7 +1971,7 @@ static void unqueue_me_pi(struct futex_q *q)
__unqueue_futex(q);
BUG_ON(!q->pi_state);
- free_pi_state(q->pi_state);
+ put_pi_state(q->pi_state);
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
@@ -2129,11 +2127,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* we returned due to timeout or signal without taking the
* rt_mutex. Too late.
*/
- raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
if (!owner)
owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
@@ -2759,7 +2757,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
*/
- free_pi_state(q.pi_state);
+ put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
}
} else {
@@ -3051,7 +3049,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET &&
+ cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS;
}
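
With FUTEX_WAIT added to the FUTEX_CLOCK_REALTIME whitelist, a plain wait can now time out against the realtime clock. A hedged userspace sketch (error handling elided; the wrapper name is illustrative):

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static int futex_wait_realtime(uint32_t *uaddr, uint32_t expected,
			       const struct timespec *timeout)
{
	/* Blocks while *uaddr == expected; timeout measured on CLOCK_REALTIME. */
	return syscall(SYS_futex, uaddr,
		       FUTEX_WAIT | FUTEX_CLOCK_REALTIME,
		       expected, timeout, NULL, 0);
}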
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 7080ae1eb..2f9df3794 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -123,11 +123,6 @@ void gcov_enable_events(void)
}
#ifdef CONFIG_MODULES
-static inline int within(void *addr, void *start, unsigned long size)
-{
- return ((addr >= start) && (addr < start + size));
-}
-
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
@@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
- if (within(info, mod->module_core, mod->core_size)) {
+ if (within_module((unsigned long)info, mod)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 15206453b..5797909f4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -338,7 +338,6 @@ void handle_nested_irq(unsigned int irq)
raw_spin_lock_irq(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(desc);
action = desc->action;
if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
@@ -346,6 +345,7 @@ void handle_nested_irq(unsigned int irq)
goto out_unlock;
}
+ kstat_incr_irqs_this_cpu(desc);
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
@@ -412,13 +412,13 @@ void handle_simple_irq(struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(desc);
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
goto out_unlock;
}
+ kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
out_unlock:
@@ -462,7 +462,6 @@ void handle_level_irq(struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(desc);
/*
* If its disabled or no action available
@@ -473,6 +472,7 @@ void handle_level_irq(struct irq_desc *desc)
goto out_unlock;
}
+ kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
cond_unmask_irq(desc);
@@ -532,7 +532,6 @@ void handle_fasteoi_irq(struct irq_desc *desc)
goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(desc);
/*
* If its disabled or no action available
@@ -544,6 +543,7 @@ void handle_fasteoi_irq(struct irq_desc *desc)
goto out;
}
+ kstat_incr_irqs_this_cpu(desc);
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
@@ -950,6 +950,7 @@ void irq_chip_ack_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_ack(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_ack_parent);
/**
* irq_chip_mask_parent - Mask the parent interrupt
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 239e2ae2c..0409da0bc 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -159,6 +159,7 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
raw_spin_lock_init(&desc->lock);
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+ init_rcu_head(&desc->rcu);
desc_set_defaults(irq, desc, node, owner);
@@ -171,6 +172,15 @@ err_desc:
return NULL;
}
+static void delayed_free_desc(struct rcu_head *rhp)
+{
+ struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
+
+ free_masks(desc);
+ free_percpu(desc->kstat_irqs);
+ kfree(desc);
+}
+
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -187,9 +197,12 @@ static void free_desc(unsigned int irq)
delete_irq_desc(irq);
mutex_unlock(&sparse_irq_lock);
- free_masks(desc);
- free_percpu(desc->kstat_irqs);
- kfree(desc);
+ /*
+ * We free the descriptor, masks and stat fields via RCU. That
+ * allows demultiplexing interrupts to do RCU-based management of
+ * the child interrupts.
+ */
+ call_rcu(&desc->rcu, delayed_free_desc);
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
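
The delayed_free_desc()/call_rcu() pair above is the standard deferred-free idiom: embed an rcu_head, recover the object with container_of() in the callback, and let the grace period protect concurrent readers. A generic sketch with hypothetical names:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct item {
	int value;
	struct rcu_head rcu;
};

static void item_free_rcu(struct rcu_head *rhp)
{
	struct item *it = container_of(rhp, struct item, rcu);

	kfree(it);
}

static void item_release(struct item *it)
{
	/* Readers still traversing under rcu_read_lock() remain safe. */
	call_rcu(&it->rcu, item_free_rcu);
}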
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 22aa9612e..3e56d2f03 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -60,6 +60,7 @@ struct fwnode_handle *irq_domain_alloc_fwnode(void *data)
fwid->fwnode.type = FWNODE_IRQCHIP;
return &fwid->fwnode;
}
+EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode);
/**
* irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
@@ -70,13 +71,14 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
{
struct irqchip_fwid *fwid;
- if (WARN_ON(fwnode->type != FWNODE_IRQCHIP))
+ if (WARN_ON(!is_fwnode_irqchip(fwnode)))
return;
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
kfree(fwid->name);
kfree(fwid);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
/**
* __irq_domain_add() - Allocate a new irq_domain data structure
@@ -573,10 +575,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
unsigned int type = IRQ_TYPE_NONE;
int virq;
- if (fwspec->fwnode)
- domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY);
- else
+ if (fwspec->fwnode) {
+ domain = irq_find_matching_fwnode(fwspec->fwnode,
+ DOMAIN_BUS_WIRED);
+ if (!domain)
+ domain = irq_find_matching_fwnode(fwspec->fwnode,
+ DOMAIN_BUS_ANY);
+ } else {
domain = irq_default_domain;
+ }
if (!domain) {
pr_warn("no irq domain found for %s !\n",
@@ -1013,6 +1020,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
return NULL;
}
+EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
/**
* irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
@@ -1058,6 +1066,7 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
__irq_set_handler(virq, handler, 0, handler_name);
irq_set_handler_data(virq, handler_data);
}
+EXPORT_SYMBOL(irq_domain_set_info);
/**
* irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
@@ -1125,9 +1134,9 @@ static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
}
}
-static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
- unsigned int irq_base,
- unsigned int nr_irqs, void *arg)
+int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs, void *arg)
{
int ret = 0;
struct irq_domain *parent = domain->parent;
@@ -1343,6 +1352,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
}
+EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
/**
* irq_domain_set_info - Set the complete data for a @virq in @domain
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6ead20037..841187239 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1743,6 +1743,31 @@ out:
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
+/**
+ * irq_percpu_is_enabled - Check whether the per cpu irq is enabled
+ * @irq: Linux irq number to check for
+ *
+ * Must be called from a non-migratable context. Returns the enable
+ * state of a per-CPU interrupt on the current CPU.
+ */
+bool irq_percpu_is_enabled(unsigned int irq)
+{
+ unsigned int cpu = smp_processor_id();
+ struct irq_desc *desc;
+ unsigned long flags;
+ bool is_enabled;
+
+ desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return false;
+
+ is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
+ irq_put_desc_unlock(desc, flags);
+
+ return is_enabled;
+}
+EXPORT_SYMBOL_GPL(irq_percpu_is_enabled);
+
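
A hedged usage sketch for the new helper: since it must run with migration disabled, a caller would typically bracket it with preempt_disable()/preempt_enable(). The wrapper name is hypothetical, and the prototype is assumed to live in linux/interrupt.h:

#include <linux/interrupt.h>
#include <linux/preempt.h>

static bool demo_percpu_irq_active(unsigned int irq)
{
	bool enabled;

	preempt_disable();		/* pin ourselves to this CPU */
	enabled = irq_percpu_is_enabled(irq);
	preempt_enable();

	return enabled;
}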
void disable_percpu_irq(unsigned int irq)
{
unsigned int cpu = smp_processor_id();
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 6b0c0b74a..38e89ce7b 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -109,9 +109,11 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
if (irq_find_mapping(domain, hwirq) > 0)
return -EEXIST;
- ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
- if (ret < 0)
- return ret;
+ if (domain->parent) {
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+ }
for (i = 0; i < nr_irqs; i++) {
ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
@@ -252,6 +254,60 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
&msi_domain_ops, info);
}
+int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ int ret;
+
+ ret = ops->msi_check(domain, info, dev);
+ if (ret == 0)
+ ret = ops->msi_prepare(domain, dev, nvec, arg);
+
+ return ret;
+}
+
+int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
+ int virq, int nvec, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ struct msi_desc *desc;
+ int ret = 0;
+
+ for_each_msi_entry(desc, dev) {
+ /* Don't even try the multi-MSI brain damage. */
+ if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
+ continue;
+
+ ops->set_desc(arg, desc);
+ /* Assumes the domain mutex is held! */
+ ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg);
+ if (ret)
+ break;
+
+ irq_set_msi_desc_off(virq, 0, desc);
+ }
+
+ if (ret) {
+ /* Mop up the damage */
+ for_each_msi_entry(desc, dev) {
+ if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
+ continue;
+
+ irq_domain_free_irqs_common(domain, desc->irq, 1);
+ }
+ }
+
+ return ret;
+}
+
/**
* msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
* @domain: The domain to allocate from
@@ -270,9 +326,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_desc *desc;
int i, ret, virq = -1;
- ret = ops->msi_check(domain, info, dev);
- if (ret == 0)
- ret = ops->msi_prepare(domain, dev, nvec, &arg);
+ ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
return ret;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index d873b64fb..ee70aef5c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -63,16 +63,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
if (ret)
goto out_free_image;
- ret = sanity_check_segment_list(image);
- if (ret)
- goto out_free_image;
-
- /* Enable the special crash kernel control page allocation policy. */
if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_image;
+
/*
* Find a location for the control code buffer, and add it to
* the vector of segments so that its pages will also be
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 11b64a63c..8dc659144 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -310,12 +310,9 @@ static void kimage_free_pages(struct page *page)
void kimage_free_page_list(struct list_head *list)
{
- struct list_head *pos, *next;
+ struct page *page, *next;
- list_for_each_safe(pos, next, list) {
- struct page *page;
-
- page = list_entry(pos, struct page, lru);
+ list_for_each_entry_safe(page, next, list, lru) {
list_del(&page->lru);
kimage_free_pages(page);
}
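
list_for_each_entry_safe() caches the next entry before the loop body runs, which is what makes the list_del() legal. The same idiom in a self-contained sketch with a hypothetical node type:

#include <linux/list.h>
#include <linux/slab.h>

struct node {
	int val;
	struct list_head link;
};

static void free_all(struct list_head *head)
{
	struct node *n, *tmp;

	list_for_each_entry_safe(n, tmp, head, link) {
		list_del(&n->link);	/* safe: 'tmp' was fetched already */
		kfree(n);
	}
}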
@@ -853,7 +850,12 @@ struct kimage *kexec_image;
struct kimage *kexec_crash_image;
int kexec_load_disabled;
-void crash_kexec(struct pt_regs *regs)
+/*
+ * No panic_cpu check version of crash_kexec(). This function is called
+ * only when panic_cpu holds the current CPU number; this is the only CPU
+ * which processes crash_kexec routines.
+ */
+void __crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
@@ -876,6 +878,29 @@ void crash_kexec(struct pt_regs *regs)
}
}
+void crash_kexec(struct pt_regs *regs)
+{
+ int old_cpu, this_cpu;
+
+ /*
+ * Only one CPU is allowed to execute the crash_kexec() code as with
+ * panic(). Otherwise parallel calls of panic() and crash_kexec()
+ * may stop each other. To exclude them, we use panic_cpu here too.
+ */
+ this_cpu = raw_smp_processor_id();
+ old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
+ if (old_cpu == PANIC_CPU_INVALID) {
+ /* This is the 1st CPU which comes here, so go ahead. */
+ __crash_kexec(regs);
+
+ /*
+ * Reset panic_cpu to allow another panic()/crash_kexec()
+ * call.
+ */
+ atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+ }
+}
+
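
The panic_cpu handshake generalizes to a one-CPU admission gate built on atomic_cmpxchg(): the first CPU to swap the sentinel for its own id proceeds, all others back off. A hedged, self-contained sketch:

#include <linux/atomic.h>
#include <linux/smp.h>

static atomic_t gate = ATOMIC_INIT(-1);	/* -1 plays PANIC_CPU_INVALID */

static bool demo_enter_once(void)
{
	int cpu = raw_smp_processor_id();

	/* Only the first CPU sees the sentinel and wins the exchange. */
	return atomic_cmpxchg(&gate, -1, cpu) == -1;
}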
size_t crash_get_memory_size(void)
{
size_t size = 0;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b70ada002..007b791f6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -109,11 +109,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
return -EINVAL;
}
+#ifdef CONFIG_KEXEC_VERIFY_SIG
int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
unsigned long buf_len)
{
return -EKEYREJECTED;
}
+#endif
/* Apply relocations of type RELA */
int __weak
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index e4392a698..0a52315d9 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image,
extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
+struct kexec_sha_region {
+ unsigned long start;
+ unsigned long len;
+};
+
+/*
+ * Keeps track of buffer parameters as provided by the caller when
+ * requesting memory placement of a buffer.
+ */
+struct kexec_buf {
+ struct kimage *image;
+ char *buffer;
+ unsigned long bufsz;
+ unsigned long mem;
+ unsigned long memsz;
+ unsigned long buf_align;
+ unsigned long buf_min;
+ unsigned long buf_max;
+ bool top_down; /* allocate from top of memory hole */
+};
+
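
A hedged sketch of how a caller might fill in the structure when asking for a top-down placement below 4 GiB; everything except the struct fields is hypothetical, and the helper that consumes the request is not part of this hunk:

#include <linux/sizes.h>

static void demo_fill_kbuf(struct kexec_buf *kbuf, struct kimage *image,
			   char *payload, unsigned long len)
{
	kbuf->image     = image;
	kbuf->buffer    = payload;
	kbuf->bufsz     = len;
	kbuf->memsz     = len;
	kbuf->buf_align = PAGE_SIZE;
	kbuf->buf_min   = 0;
	kbuf->buf_max   = SZ_4G;
	kbuf->top_down  = true;		/* allocate from the top of the hole */
}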
void kimage_file_post_load_cleanup(struct kimage *image);
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e83b26464..152da4a48 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -20,7 +20,7 @@
#include <linux/capability.h>
#include <linux/compiler.h>
-#include <linux/rcupdate.h> /* rcu_expedited */
+#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(fscaps);
+#ifndef CONFIG_TINY_RCU
int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", rcu_expedited);
+ return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited));
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj,
}
KERNEL_ATTR_RW(rcu_expedited);
+int rcu_normal;
+static ssize_t rcu_normal_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", READ_ONCE(rcu_normal));
+}
+static ssize_t rcu_normal_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (kstrtoint(buf, 0, &rcu_normal))
+ return -EINVAL;
+
+ return count;
+}
+KERNEL_ATTR_RW(rcu_normal);
+#endif /* #ifndef CONFIG_TINY_RCU */
+
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
@@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = {
&kexec_crash_size_attr.attr,
&vmcoreinfo_attr.attr,
#endif
+#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
+ &rcu_normal_attr.attr,
+#endif
NULL
};
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 12d8a8f88..9ff173dca 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -275,7 +275,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
DECLARE_COMPLETION_ONSTACK(done);
struct task_struct *task;
struct kthread_create_info *create = kmalloc(sizeof(*create),
- GFP_KERNEL | ___GFP_TOI_NOTRACK);
+ GFP_KERNEL);
if (!create)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a02812743..b5c30d9f4 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -47,12 +47,12 @@
* of times)
*/
-#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
+#include <linux/latencytop.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/list.h>
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
proc_create("latency_stats", 0644, NULL, &lstats_fops);
return 0;
}
+
+int sysctl_latencytop(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ err = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (latencytop_enabled)
+ force_schedstat_enabled();
+
+ return err;
+}
device_initcall(init_lstats_procfs);
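
sysctl_latencytop() is meant to be wired up as the proc_handler of the latencytop sysctl so that enabling it can force schedstats on. A hedged sketch of the matching table entry (as it would appear in kernel/sysctl.c, which is not part of this hunk):

#include <linux/sysctl.h>

static struct ctl_table demo_latencytop_entry = {
	.procname	= "latencytop",
	.data		= &latencytop_enabled,
	.maxlen		= sizeof(int),
	.mode		= 0644,
	.proc_handler	= sysctl_latencytop,
};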
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index db545cbcd..bc2c85c06 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include <linux/kallsyms.h>
#include <linux/livepatch.h>
+#include <asm/cacheflush.h>
/**
* struct klp_ops - structure for tracking registered ftrace ops structs
@@ -135,13 +136,8 @@ struct klp_find_arg {
const char *objname;
const char *name;
unsigned long addr;
- /*
- * If count == 0, the symbol was not found. If count == 1, a unique
- * match was found and addr is set. If count > 1, there is
- * unresolvable ambiguity among "count" number of symbols with the same
- * name in the same object.
- */
unsigned long count;
+ unsigned long pos;
};
static int klp_find_callback(void *data, const char *name,
@@ -158,37 +154,48 @@ static int klp_find_callback(void *data, const char *name,
if (args->objname && strcmp(args->objname, mod->name))
return 0;
- /*
- * args->addr might be overwritten if another match is found
- * but klp_find_object_symbol() handles this and only returns the
- * addr if count == 1.
- */
args->addr = addr;
args->count++;
+ /*
+ * Finish the search when the symbol is found for the desired position
+ * or the position is not defined for a non-unique symbol.
+ */
+ if ((args->pos && (args->count == args->pos)) ||
+ (!args->pos && (args->count > 1)))
+ return 1;
+
return 0;
}
static int klp_find_object_symbol(const char *objname, const char *name,
- unsigned long *addr)
+ unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
.objname = objname,
.name = name,
.addr = 0,
- .count = 0
+ .count = 0,
+ .pos = sympos,
};
mutex_lock(&module_mutex);
kallsyms_on_each_symbol(klp_find_callback, &args);
mutex_unlock(&module_mutex);
- if (args.count == 0)
+ /*
+ * Ensure an address was found. If sympos is 0, ensure symbol is unique;
+ * otherwise ensure the symbol position count matches sympos.
+ */
+ if (args.addr == 0)
pr_err("symbol '%s' not found in symbol table\n", name);
- else if (args.count > 1)
+ else if (args.count > 1 && sympos == 0) {
pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
args.count, name, objname);
- else {
+ } else if (sympos != args.count && sympos > 0) {
+ pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n",
+ sympos, name, objname ? objname : "vmlinux");
+ } else {
*addr = args.addr;
return 0;
}
@@ -197,66 +204,6 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
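
With sympos, a patch module can target one of several identically named (typically static) functions. A hedged sketch of a klp_func entry using the new field; the function names are hypothetical:

#include <linux/livepatch.h>

static int livepatch_cleanup_helper(void)
{
	return 0;	/* replacement body elided */
}

static struct klp_func funcs[] = {
	{
		.old_name   = "cleanup_helper",	/* occurs twice in the object */
		.new_func   = livepatch_cleanup_helper,
		.old_sympos = 2,		/* patch the second occurrence */
	},
	{ }
};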
-struct klp_verify_args {
- const char *name;
- const unsigned long addr;
-};
-
-static int klp_verify_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
-{
- struct klp_verify_args *args = data;
-
- if (!mod &&
- !strcmp(args->name, name) &&
- args->addr == addr)
- return 1;
-
- return 0;
-}
-
-static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
-{
- struct klp_verify_args args = {
- .name = name,
- .addr = addr,
- };
- int ret;
-
- mutex_lock(&module_mutex);
- ret = kallsyms_on_each_symbol(klp_verify_callback, &args);
- mutex_unlock(&module_mutex);
-
- if (!ret) {
- pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
- name, addr);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int klp_find_verify_func_addr(struct klp_object *obj,
- struct klp_func *func)
-{
- int ret;
-
-#if defined(CONFIG_RANDOMIZE_BASE)
- /* If KASLR has been enabled, adjust old_addr accordingly */
- if (kaslr_enabled() && func->old_addr)
- func->old_addr += kaslr_offset();
-#endif
-
- if (!func->old_addr || klp_is_module(obj))
- ret = klp_find_object_symbol(obj->name, func->old_name,
- &func->old_addr);
- else
- ret = klp_verify_vmlinux_symbol(func->old_name,
- func->old_addr);
-
- return ret;
-}
-
/*
* external symbols are located outside the parent object (where the parent
* object is either vmlinux or the kmod being patched).
@@ -276,14 +223,18 @@ static int klp_find_external_symbol(struct module *pmod, const char *name,
}
preempt_enable();
- /* otherwise check if it's in another .o within the patch module */
- return klp_find_object_symbol(pmod->name, name, addr);
+ /*
+ * Check if it's in another .o within the patch module. This also
+ * checks that the external symbol is unique.
+ */
+ return klp_find_object_symbol(pmod->name, name, 0, addr);
}
static int klp_write_object_relocations(struct module *pmod,
struct klp_object *obj)
{
- int ret;
+ int ret = 0;
+ unsigned long val;
struct klp_reloc *reloc;
if (WARN_ON(!klp_is_object_loaded(obj)))
@@ -292,41 +243,38 @@ static int klp_write_object_relocations(struct module *pmod,
if (WARN_ON(!obj->relocs))
return -EINVAL;
+ module_disable_ro(pmod);
+
for (reloc = obj->relocs; reloc->name; reloc++) {
- if (!klp_is_module(obj)) {
-
-#if defined(CONFIG_RANDOMIZE_BASE)
- /* If KASLR has been enabled, adjust old value accordingly */
- if (kaslr_enabled())
- reloc->val += kaslr_offset();
-#endif
- ret = klp_verify_vmlinux_symbol(reloc->name,
- reloc->val);
- if (ret)
- return ret;
- } else {
- /* module, reloc->val needs to be discovered */
- if (reloc->external)
- ret = klp_find_external_symbol(pmod,
- reloc->name,
- &reloc->val);
- else
- ret = klp_find_object_symbol(obj->mod->name,
- reloc->name,
- &reloc->val);
- if (ret)
- return ret;
- }
+ /* discover the address of the referenced symbol */
+ if (reloc->external) {
+ if (reloc->sympos > 0) {
+ pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n",
+ reloc->name);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = klp_find_external_symbol(pmod, reloc->name, &val);
+ } else
+ ret = klp_find_object_symbol(obj->name,
+ reloc->name,
+ reloc->sympos,
+ &val);
+ if (ret)
+ goto out;
+
ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
- reloc->val + reloc->addend);
+ val + reloc->addend);
if (ret) {
pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
- reloc->name, reloc->val, ret);
- return ret;
+ reloc->name, val, ret);
+ goto out;
}
}
- return 0;
+out:
+ module_enable_ro(pmod);
+ return ret;
}
static void notrace klp_ftrace_handler(unsigned long ip,
@@ -593,7 +541,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* /sys/kernel/livepatch/<patch>
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/<object>
- * /sys/kernel/livepatch/<patch>/<object>/<func>
+ * /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -738,8 +686,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
INIT_LIST_HEAD(&func->stack_node);
func->state = KLP_DISABLED;
+ /* The format for the sysfs directory is <function,sympos> where sympos
+ * is the nth occurrence of this symbol in kallsyms for the patched
+ * object. If the user selects 0 for old_sympos, then 1 will be used
+ * since a unique symbol will be the first occurrence.
+ */
return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- &obj->kobj, "%s", func->old_name);
+ &obj->kobj, "%s,%lu", func->old_name,
+ func->old_sympos ? func->old_sympos : 1);
}
/* parts of the initialization that is done only when the object is loaded */
@@ -756,7 +710,9 @@ static int klp_init_object_loaded(struct klp_patch *patch,
}
klp_for_each_func(obj, func) {
- ret = klp_find_verify_func_addr(obj, func);
+ ret = klp_find_object_symbol(obj->name, func->old_name,
+ func->old_sympos,
+ &func->old_addr);
if (ret)
return ret;
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 60ace5661..716547fdb 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes);
#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
#define classhashentry(key) (classhash_table + __classhashfn((key)))
-static struct list_head classhash_table[CLASSHASH_SIZE];
+static struct hlist_head classhash_table[CLASSHASH_SIZE];
/*
* We put the lock dependency chains into a hash-table as well, to cache
@@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE];
#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
-static struct list_head chainhash_table[CHAINHASH_SIZE];
+static struct hlist_head chainhash_table[CHAINHASH_SIZE];
/*
* The hash key of the lock dependency chains is a hash itself too:
@@ -666,7 +666,7 @@ static inline struct lock_class *
look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
#ifdef CONFIG_DEBUG_LOCKDEP
@@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
@@ -742,7 +742,7 @@ static inline struct lock_class *
register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* We have to do the hash-walk again, to avoid races
* with another CPU:
*/
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key)
goto out_unlock_set;
}
@@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
*/
- list_add_tail_rcu(&class->hash_entry, hash_head);
+ hlist_add_head_rcu(&class->hash_entry, hash_head);
/*
* Add it to the global list of classes:
*/
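
The hlist conversion halves every bucket head (one pointer instead of two) while keeping the same RCU-safe traversal idioms. A generic sketch of the pattern with hypothetical names:

#include <linux/hash.h>
#include <linux/rculist.h>

#define DEMO_BITS 6

static struct hlist_head demo_table[1 << DEMO_BITS];

struct demo_entry {
	unsigned long key;
	struct hlist_node hash_entry;
};

/* Caller holds rcu_read_lock(). */
static struct demo_entry *demo_lookup(unsigned long key)
{
	struct hlist_head *head = &demo_table[hash_long(key, DEMO_BITS)];
	struct demo_entry *e;

	hlist_for_each_entry_rcu(e, head, hash_entry)
		if (e->key == key)
			return e;
	return NULL;
}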
@@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, int trylock_loop)
+ struct held_lock *next, int distance, int *stack_saved)
{
struct lock_list *entry;
int ret;
@@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
- if (!trylock_loop && !save_trace(&trace))
- return 0;
+ if (!*stack_saved) {
+ if (!save_trace(&trace))
+ return 0;
+ *stack_saved = 1;
+ }
/*
* Ok, all validations passed, add the new lock
@@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Debugging printouts:
*/
if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+ /* We drop graph lock, so another thread can overwrite trace. */
+ *stack_saved = 0;
graph_unlock();
printk("\n new dependency: ");
print_lock_name(hlock_class(prev));
@@ -1929,7 +1934,7 @@ static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
- int trylock_loop = 0;
+ int stack_saved = 0;
struct held_lock *hlock;
/*
@@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
*/
if (hlock->read != 2 && hlock->check) {
if (!check_prev_add(curr, hlock, next,
- distance, trylock_loop))
+ distance, &stack_saved))
return 0;
/*
* Stop after the first non-trylock entry,
@@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
if (curr->held_locks[depth].irq_context !=
curr->held_locks[depth-1].irq_context)
break;
- trylock_loop = 1;
}
return 1;
out_bug:
@@ -2017,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
u64 chain_key)
{
struct lock_class *class = hlock_class(hlock);
- struct list_head *hash_head = chainhashentry(chain_key);
+ struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
struct held_lock *hlock_curr;
int i, j;
@@ -2033,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
* We can walk it lock-free, because entries only get added
* to the hash:
*/
- list_for_each_entry_rcu(chain, hash_head, entry) {
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
cache_hit:
debug_atomic_inc(chain_lookup_hits);
@@ -2057,7 +2061,7 @@ cache_hit:
/*
* We have to walk the chain again locked - to avoid duplicates:
*/
- list_for_each_entry(chain, hash_head, entry) {
+ hlist_for_each_entry(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
graph_unlock();
goto cache_hit;
@@ -2091,7 +2095,7 @@ cache_hit:
}
chain_hlocks[chain->base + j] = class - lock_classes;
}
- list_add_tail_rcu(&chain->entry, hash_head);
+ hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains();
@@ -3875,7 +3879,7 @@ void lockdep_reset(void)
nr_process_chains = 0;
debug_locks = 1;
for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
+ INIT_HLIST_HEAD(chainhash_table + i);
raw_local_irq_restore(flags);
}
@@ -3894,7 +3898,7 @@ static void zap_class(struct lock_class *class)
/*
* Unhash the class and remove it from the all_lock_classes list:
*/
- list_del_rcu(&class->hash_entry);
+ hlist_del_rcu(&class->hash_entry);
list_del_rcu(&class->lock_entry);
RCU_INIT_POINTER(class->key, NULL);
@@ -3917,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size)
void lockdep_free_key_range(void *start, unsigned long size)
{
struct lock_class *class;
- struct list_head *head;
+ struct hlist_head *head;
unsigned long flags;
int i;
int locked;
@@ -3930,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
*/
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
if (within(class->key, start, size))
zap_class(class);
else if (within(class->name, start, size))
@@ -3962,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
void lockdep_reset_lock(struct lockdep_map *lock)
{
struct lock_class *class;
- struct list_head *head;
+ struct hlist_head *head;
unsigned long flags;
int i, j;
int locked;
@@ -3987,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
locked = graph_lock();
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
int match = 0;
for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
@@ -4027,10 +4027,10 @@ void lockdep_init(void)
return;
for (i = 0; i < CLASSHASH_SIZE; i++)
- INIT_LIST_HEAD(classhash_table + i);
+ INIT_HLIST_HEAD(classhash_table + i);
for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
+ INIT_HLIST_HEAD(chainhash_table + i);
lockdep_initialized = 1;
}
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a6..393d1874b 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
* Peter Zijlstra <peterz@infradead.org>
*/
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
struct __qspinlock *l = (void *)lock;
- return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+ /*
+ * Use release semantics to make sure that the MCS node is properly
+ * initialized before changing the tail code.
+ */
+ return (u32)xchg_release(&l->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Use release semantics to make sure that the MCS node is
+ * properly initialized before changing the tail code.
+ */
+ old = atomic_cmpxchg_release(&lock->val, val, new);
if (old == val)
break;
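
The retry loop above is the usual compare-and-swap publication pattern; the _release variant is what orders the MCS node initialization before the new tail becomes visible. In generic form (hypothetical names):

#include <linux/atomic.h>

static int demo_publish(atomic_t *v, int keep_mask, int newbits)
{
	int old, val = atomic_read(v);

	for (;;) {
		/* Release ordering: prior stores visible before the swap. */
		old = atomic_cmpxchg_release(v, val,
					     (val & keep_mask) | newbits);
		if (old == val)
			return old;
		val = old;	/* lost a race: retry with the fresh value */
	}
}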
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
*/
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+ struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
- struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+ struct mcs_spinlock *node)
+ { return 0; }
#define pv_enabled() false
#define pv_init_node __pv_init_node
#define pv_wait_node __pv_wait_node
#define pv_kick_node __pv_kick_node
-#define pv_wait_head __pv_wait_head
+#define pv_wait_head_or_lock __pv_wait_head_or_lock
#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (val == new)
new |= _Q_PENDING_VAL;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Acquire semantic is required here as the function may
+ * return immediately if the lock was free.
+ */
+ old = atomic_cmpxchg_acquire(&lock->val, val, new);
if (old == val)
break;
@@ -382,6 +398,7 @@ queue:
* p,*,* -> n,*,*
*/
old = xchg_tail(lock, tail);
+ next = NULL;
/*
* if there was a previous node; link it and wait until reaching the
@@ -391,8 +408,18 @@ queue:
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);
- pv_wait_node(node);
+ pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
}
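
The optimistic next-pointer load plus prefetchw() above is a latency trick: if a successor already queued itself, its cacheline is warmed for the upcoming unlock store. The bare pattern as a hedged sketch:

#include <linux/compiler.h>
#include <linux/prefetch.h>

struct mcs_like {
	struct mcs_like *next;
	int locked;
};

static void demo_peek_and_prefetch(struct mcs_like *node)
{
	struct mcs_like *next = READ_ONCE(node->next);

	if (next)
		prefetchw(next);	/* warm the line for the later store */
}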
/*
@@ -406,11 +433,22 @@ queue:
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
+ * The PV pv_wait_head_or_lock function, if active, will acquire
+ * the lock and return a non-zero value. So we have to skip the
+ * smp_load_acquire() call. As the next PV queue head hasn't been
+ * designated yet, there is no way for the locked value to become
+ * _Q_SLOW_VAL. So both the set_locked() and the
+ * atomic_cmpxchg_relaxed() calls will be safe.
+ *
+ * If PV isn't active, 0 will be returned instead.
*/
- pv_wait_head(lock, node);
- while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
- cpu_relax();
+ if ((val = pv_wait_head_or_lock(lock, node)))
+ goto locked;
+ smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+
+locked:
/*
* claim the lock:
*
@@ -422,11 +460,17 @@ queue:
* to grab the lock.
*/
for (;;) {
- if (val != tail) {
+ /* In the PV case we might already have _Q_LOCKED_VAL set */
+ if ((val & _Q_TAIL_MASK) != tail) {
set_locked(lock);
break;
}
- old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ /*
+ * The smp_load_acquire() call above has provided the necessary
+ * acquire semantics required for locking. At most two
+ * iterations of this loop may run.
+ */
+ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
goto release; /* No contention */
@@ -434,10 +478,12 @@ queue:
}
/*
- * contended path; wait for next, release.
+ * contended path; wait for next if not observed yet, release.
*/
- while (!(next = READ_ONCE(node->next)))
- cpu_relax();
+ if (!next) {
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+ }
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
#undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f0450ff48..87bb235c3 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,20 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guests with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK 0xff
+
+/*
* Queue node uses: vcpu_running & vcpu_halted.
* Queue head uses: vcpu_running & vcpu_hashed.
*/
@@ -41,6 +55,94 @@ struct pv_node {
};
/*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enters the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !READ_ONCE(l->locked) &&
+ (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+ == _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ int val = atomic_read(&lock->val);
+
+ for (;;) {
+ int old, new;
+
+ if (val & _Q_LOCKED_MASK)
+ break;
+
+ /*
+ * Try to clear pending bit & set locked bit
+ */
+ old = val;
+ new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ val = atomic_cmpxchg(&lock->val, old, new);
+
+ if (val == old)
+ return 1;
+ }
+ return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
* Lock and MCS node addresses hash table for fast lookup
*
* Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he;
+ int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node);
+ qstat_hop(hopcnt);
return &he->lock;
}
}
@@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
}
/*
+ * Return true when it is time to check the previous node and it is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+ if ((loop & PV_PREV_CHECK_MASK) != 0)
+ return false;
+
+ return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
* Initialize the PV part of the mcs_spinlock node.
*/
static void pv_init_node(struct mcs_spinlock *node)
@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf.
*/
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct pv_node *pp = (struct pv_node *)prev;
+ int waitcnt = 0;
int loop;
+ bool wait_early;
- for (;;) {
- for (loop = SPIN_THRESHOLD; loop; loop--) {
+ /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+ for (;; waitcnt++) {
+ for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
+ if (pv_wait_early(pp, loop)) {
+ wait_early = true;
+ break;
+ }
cpu_relax();
}
@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
*/
smp_store_mb(pn->state, vcpu_halted);
- if (!READ_ONCE(node->locked))
+ if (!READ_ONCE(node->locked)) {
+ qstat_inc(qstat_pv_wait_node, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
+ qstat_inc(qstat_pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
+ }
/*
- * If pv_kick_node() changed us to vcpu_hashed, retain that value
- * so that pv_wait_head() knows to not also try to hash this lock.
+ * If pv_kick_node() changed us to vcpu_hashed, retain that
+ * value so that pv_wait_head_or_lock() knows to not also try
+ * to hash this lock.
*/
cmpxchg(&pn->state, vcpu_halted, vcpu_running);
@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
}
/*
@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
/*
* Called after setting next->locked = 1 when we're the lock owner.
*
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
*/
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
}
/*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
* __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
*/
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
+ int waitcnt = 0;
int loop;
/*
@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
if (READ_ONCE(pn->state) == vcpu_hashed)
lp = (struct qspinlock **)1;
- for (;;) {
+ for (;; waitcnt++) {
+ /*
+ * Set correct vCPU state to be used by queue node wait-early
+ * mechanism.
+ */
+ WRITE_ONCE(pn->state, vcpu_running);
+
+ /*
+ * Set the pending bit in the active lock spinning loop to
+ * disable lock stealing before attempting to acquire the lock.
+ */
+ set_pending(lock);
for (loop = SPIN_THRESHOLD; loop; loop--) {
- if (!READ_ONCE(l->locked))
- return;
+ if (trylock_clear_pending(lock))
+ goto gotlock;
cpu_relax();
}
+ clear_pending(lock);
+
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
- if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+ if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
/*
- * The lock is free and _Q_SLOW_VAL has never
- * been set. Therefore we need to unhash before
- * getting the lock.
+ * The lock was free and now we own the lock.
+ * Change the lock value back to _Q_LOCKED_VAL
+ * and unhash the table.
*/
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
- return;
+ goto gotlock;
}
}
+ WRITE_ONCE(pn->state, vcpu_halted);
+ qstat_inc(qstat_pv_wait_head, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
/*
* The unlocker should have freed the lock before kicking the
* CPU. So if the lock is still not free, it is a spurious
- * wakeup and so the vCPU should wait again after spinning for
- * a while.
+ * wakeup or another vCPU has stolen the lock. The current
+ * vCPU should spin again.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
}
/*
- * Lock is unlocked now; the caller will acquire it without waiting.
- * As with pv_wait_node() we rely on the caller to do a load-acquire
- * for us.
+ * The cmpxchg() or xchg() call before coming here provides the
+ * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+ * here is to indicate to the compiler that the value will always
+ * be nonzero to enable better code optimization.
*/
+gotlock:
+ return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
}
/*
- * PV version of the unlock function to be used in stead of
- * queued_spin_unlock().
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
*/
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
- u8 locked;
-
- /*
- * We must not unlock if SLOW, because in that case we must first
- * unhash. Otherwise it would be possible to have multiple @lock
- * entries, which would be BAD.
- */
- locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
- return;
if (unlikely(locked != _Q_SLOW_VAL)) {
WARN(!debug_locks_silent,
@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* so we need a barrier to order the read of the node data in
* pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
*
- * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
*/
smp_rmb();
@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* vCPU is harmless other than the additional latency in completing
* the unlock.
*/
+ qstat_inc(qstat_pv_kick_unlock, true);
pv_kick(node->cpu);
}
+
/*
* Include the architecture specific callee-save thunk of the
* __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
- * that the callee-save thunk and the real unlock function are close
- * to each other sharing consecutive instruction cachelines.
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
*/
#include <asm/qspinlock_paravirt.h>
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ u8 locked;
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
+ return;
+
+ __pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 000000000..640dcecdd
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,300 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * When queued spinlock statistical counters are enabled, the following
+ * debugfs files will be created for reporting the counter values:
+ *
+ * <debugfs>/qlockstat/
+ * pv_hash_hops - average # of hops per hashing operation
+ * pv_kick_unlock - # of vCPU kicks issued at unlock time
+ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
+ * pv_latency_kick - average latency (ns) of vCPU kick operation
+ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
+ * pv_lock_stealing - # of lock stealing operations
+ * pv_spurious_wakeup - # of spurious wakeups
+ * pv_wait_again - # of vCPU waits that happened after a vCPU kick
+ * pv_wait_early - # of early vCPU waits
+ * pv_wait_head - # of vCPU waits at the queue head
+ * pv_wait_node - # of vCPU waits at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistical counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be a slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+enum qlock_stats {
+ qstat_pv_hash_hops,
+ qstat_pv_kick_unlock,
+ qstat_pv_kick_wake,
+ qstat_pv_latency_kick,
+ qstat_pv_latency_wake,
+ qstat_pv_lock_stealing,
+ qstat_pv_spurious_wakeup,
+ qstat_pv_wait_again,
+ qstat_pv_wait_early,
+ qstat_pv_wait_head,
+ qstat_pv_wait_node,
+ qstat_num, /* Total number of statistical counters */
+ qstat_reset_cnts = qstat_num,
+};
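
A hypothetical userspace sketch of how these counters would be consumed, assuming debugfs is mounted at /sys/kernel/debug and the kernel was built with CONFIG_QUEUED_LOCK_STAT=y. Per init_qspinlock_stat() below, the stat files are mode 0400 and reset_counters is 0200, so this must run as root:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	fd = open("/sys/kernel/debug/qlockstat/pv_latency_kick", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n > 0) {
		buf[n] = '\0';
		printf("avg kick latency: %s", buf);	/* value in ns */
	}

	fd = open("/sys/kernel/debug/qlockstat/reset_counters", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1", 1);	/* any write resets all counters */
		close(fd);
	}
	return 0;
}
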
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+
+static const char * const qstat_names[qstat_num + 1] = {
+ [qstat_pv_hash_hops] = "pv_hash_hops",
+ [qstat_pv_kick_unlock] = "pv_kick_unlock",
+ [qstat_pv_kick_wake] = "pv_kick_wake",
+ [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+ [qstat_pv_latency_kick] = "pv_latency_kick",
+ [qstat_pv_latency_wake] = "pv_latency_wake",
+ [qstat_pv_lock_stealing] = "pv_lock_stealing",
+ [qstat_pv_wait_again] = "pv_wait_again",
+ [qstat_pv_wait_early] = "pv_wait_early",
+ [qstat_pv_wait_head] = "pv_wait_head",
+ [qstat_pv_wait_node] = "pv_wait_node",
+ [qstat_reset_cnts] = "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Function to read and return the qlock statistical counter values
+ *
+ * The following counters are handled specially:
+ * 1. qstat_pv_latency_kick
+ * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. qstat_pv_latency_wake
+ * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. qstat_pv_hash_hops
+ * Average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t qstat_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[64];
+ int cpu, counter, len;
+ u64 stat = 0, kicks = 0;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if (!file->f_inode) {
+ WARN_ON_ONCE(1);
+ return -EBADF;
+ }
+ counter = (long)(file->f_inode->i_private);
+
+ if (counter >= qstat_num)
+ return -EBADF;
+
+ for_each_possible_cpu(cpu) {
+ stat += per_cpu(qstats[counter], cpu);
+ /*
+ * Need to sum additional counter for some of them
+ */
+ switch (counter) {
+
+ case qstat_pv_latency_kick:
+ case qstat_pv_hash_hops:
+ kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+ break;
+
+ case qstat_pv_latency_wake:
+ kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+ break;
+ }
+ }
+
+ if (counter == qstat_pv_hash_hops) {
+ u64 frac = 0;
+
+ if (kicks) {
+ frac = 100ULL * do_div(stat, kicks);
+ frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+ }
+
+ /*
+ * Return an X.XX decimal number
+ */
+ len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+ } else {
+ /*
+ * Round to the nearest ns
+ */
+ if ((counter == qstat_pv_latency_kick) ||
+ (counter == qstat_pv_latency_wake)) {
+ if (kicks)
+ stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+ else
+ stat = 0;
+ }
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+ }
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
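
To make the X.XX arithmetic concrete: do_div(stat, kicks) leaves the quotient in stat and returns the remainder, while DIV_ROUND_CLOSEST_ULL rounds to the nearest integer. A userspace sketch of the same computation, with illustrative values:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t stat = 257, kicks = 100;	/* 257 hops over 100 unlock kicks */
	uint64_t rem = stat % kicks;		/* what do_div() would return */
	uint64_t frac;

	stat /= kicks;				/* what do_div() leaves in stat */
	frac = (100 * rem + kicks / 2) / kicks;	/* DIV_ROUND_CLOSEST_ULL */
	printf("%" PRIu64 ".%02" PRIu64 "\n", stat, frac);	/* prints 2.57 */
	return 0;
}
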
+
+/*
+ * Function to handle write request
+ *
+ * When counter = reset_cnts, reset all the counter values.
+ * Since the counter updates aren't atomic, the resetting is done twice
+ * to make sure that the counters are very likely to be all cleared.
+ */
+static ssize_t qstat_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int cpu;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if (!file->f_inode) {
+ WARN_ON_ONCE(1);
+ return -EBADF;
+ }
+ if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
+ return count;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ unsigned long *ptr = per_cpu_ptr(qstats, cpu);
+
+ for (i = 0 ; i < qstat_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ for (i = 0 ; i < qstat_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ }
+ return count;
+}
+
+/*
+ * Debugfs data structures
+ */
+static const struct file_operations fops_qstat = {
+ .read = qstat_read,
+ .write = qstat_write,
+ .llseek = default_llseek,
+};
+
+/*
+ * Initialize debugfs for the qspinlock statistical counters
+ */
+static int __init init_qspinlock_stat(void)
+{
+ struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
+ int i;
+
+ if (!d_qstat) {
+ pr_warn("Could not create 'qlockstat' debugfs directory\n");
+ return 0;
+ }
+
+ /*
+ * Create the debugfs files
+ *
+ * As reading from and writing to the stat files can be slow, only
+ * root is allowed to do the read/write to limit impact to system
+ * performance.
+ */
+ for (i = 0; i < qstat_num; i++)
+ debugfs_create_file(qstat_names[i], 0400, d_qstat,
+ (void *)(long)i, &fops_qstat);
+
+ debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
+ (void *)(long)qstat_reset_cnts, &fops_qstat);
+ return 0;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Increment the PV qspinlock statistical counters
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+ if (cond)
+ this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void qstat_hop(int hopcnt)
+{
+ this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+ u64 start = sched_clock();
+
+ per_cpu(pv_kick_time, cpu) = start;
+ pv_kick(cpu);
+ this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+ u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+ *pkick_time = 0;
+ pv_wait(ptr, val);
+ if (*pkick_time) {
+ this_cpu_add(qstats[qstat_pv_latency_wake],
+ sched_clock() - *pkick_time);
+ qstat_inc(qstat_pv_kick_wake, true);
+ }
+}
+
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v) __pv_wait(p, v)
+
+/*
+ * PV unfair trylock count tracking function
+ */
+static inline int qstat_spin_steal_lock(struct qspinlock *lock)
+{
+ int ret = pv_queued_spin_steal_lock(lock);
+
+ qstat_inc(qstat_pv_lock_stealing, ret);
+ return ret;
+}
+#undef queued_spin_trylock
+#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
+static inline void qstat_hop(int hopcnt) { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8251e75dd..3e746607a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
clear_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/*
* If a new waiter comes in between the unlock and the cmpxchg
* we have two situations:
@@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true;
}
#endif
@@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int ret = 0, depth = 0;
struct rt_mutex *lock;
bool detect_deadlock;
- unsigned long flags;
bool requeue = true;
detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
@@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* [1] Task cannot go away as we did a get_task() before !
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock_irq(&task->pi_lock);
/*
* [2] Get the waiter on which @task is blocked on.
@@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* operations.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
cpu_relax();
goto retry;
}
@@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* No requeue[7] here. Just release @task [8]
*/
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* If there is no owner of the lock, end of chain.
*/
if (!rt_mutex_owner(lock)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/*
* No requeue [11] here. We just do deadlock detection.
@@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/* If owner is not blocked, end of chain. */
if (!next_lock)
@@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
rt_mutex_enqueue(lock, waiter);
/* [8] Release the task */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
wake_up_process(rt_mutex_top_waiter(lock)->task);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. the owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
@@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop the locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/*
* Make the actual exit decisions [12], based on the stored
@@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto again;
out_unlock_pi:
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
out_put_task:
put_task_struct(task);
@@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* Try to take an rt-mutex
*
- * Must be called with lock->wait_lock held.
+ * Must be called with lock->wait_lock held and interrupts disabled
*
* @lock: The lock to be acquired.
* @task: The task which wants to acquire the lock
@@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
- unsigned long flags;
-
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* case, but conditionals are more expensive than a redundant
* store.
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
task->pi_blocked_on = NULL;
/*
* Finish the lock acquisition. @task is the new owner. If
@@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
takeit:
/* We got the lock. */
@@ -883,7 +882,7 @@ takeit:
*
* Prepare waiter and propagate pi chain
*
- * This must be called with lock->wait_lock held.
+ * This must be called with lock->wait_lock held and interrupts disabled
*/
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *top_waiter = waiter;
struct rt_mutex *next_lock;
int chain_walk = 0, res;
- unsigned long flags;
/*
* Early deadlock detection. We really don't want the task to
@@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (owner == task)
return -EDEADLK;
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
@@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
task->pi_blocked_on = waiter;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
if (!owner)
return 0;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
if (waiter == rt_mutex_top_waiter(lock)) {
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
@@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Even if full deadlock detection is on, if the owner is not
* blocked itself, we can avoid finding this out in the chain
@@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*/
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
return res;
}
@@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* Remove the top waiter from the current tasks pi waiter tree and
* queue it up.
*
- * Called with lock->wait_lock held.
+ * Called with lock->wait_lock held and interrupts disabled.
*/
static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
wake_q_add(wake_q, waiter->task);
}
@@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
/*
* Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held and
+ * Must be called with lock->wait_lock held and interrupts disabled. The caller must
* have just failed to try_to_take_rt_mutex().
*/
static void remove_waiter(struct rt_mutex *lock,
@@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock,
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
/*
* Only update priority if the waiter was the highest priority
@@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (!owner || !is_top_waiter)
return;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
rt_mutex_dequeue_pi(owner, waiter);
@@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Don't walk the chain, if the owner task is not blocked
@@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock,
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
next_lock, NULL, current);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
}
/*
@@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task)
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
* @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
+ * or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
*
- * lock->wait_lock must be held by the caller.
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
@@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
schedule();
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
}
@@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk)
{
struct rt_mutex_waiter waiter;
+ unsigned long flags;
int ret = 0;
debug_rt_mutex_init_waiter(&waiter);
RB_CLEAR_NODE(&waiter.pi_tree_entry);
RB_CLEAR_NODE(&waiter.tree_entry);
- raw_spin_lock(&lock->wait_lock);
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return 0;
}
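
The distinction the comment above draws can be sketched as follows: the _irq variants re-enable interrupts unconditionally on unlock, while the irqsave/irqrestore pair puts back whatever state the caller entered with. A sketch of the pattern, not a complete function:

static void slowpath_pattern(struct rt_mutex *lock)
{
	unsigned long flags;

	/* Remembers whether IRQs were already disabled at entry ... */
	raw_spin_lock_irqsave(&lock->wait_lock, flags);
	/* ... slowpath work ... */
	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
	/*
	 * raw_spin_unlock_irq() here would instead force interrupts on,
	 * which is wrong for an early-boot caller that entered with
	 * interrupts off and expects them to stay off.
	 */
}
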
@@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* Remove pending timer: */
if (unlikely(timeout))
@@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
{
+ unsigned long flags;
int ret;
/*
@@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
return 0;
/*
- * The mutex has currently no owner. Lock the wait lock and
- * try to acquire the lock.
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
*/
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = try_to_take_rt_mutex(lock, current, NULL);
@@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
@@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
{
- raw_spin_lock(&lock->wait_lock);
+ unsigned long flags;
+
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
debug_rt_mutex_unlock(lock);
@@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
- if (unlock_rt_mutex_safe(lock) == true)
+ if (unlock_rt_mutex_safe(lock, flags) == true)
return false;
/* Relock the rtmutex and try again */
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
/*
@@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
mark_wakeup_next_waiter(wake_q, lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* check PI boosting */
return true;
@@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 1;
}
@@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (unlikely(ret))
remove_waiter(lock, waiter);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
@@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(TASK_INTERRUPTIBLE);
@@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return ret;
}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 25ced161e..6cf54615a 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -10,8 +10,11 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
+#include <linux/radix-tree.h>
+#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/types.h>
+#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
@@ -26,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
static void *try_ram_remap(resource_size_t offset, size_t size)
{
- struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
+ unsigned long pfn = PHYS_PFN(offset);
/* In the simple case just return the existing linear address */
- if (!PageHighMem(page))
+ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
return __va(offset);
return NULL; /* fallback to ioremap_cache */
}
@@ -149,25 +152,134 @@ void devm_memunmap(struct device *dev, void *addr)
}
EXPORT_SYMBOL(devm_memunmap);
+pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
+{
+ return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
+}
+EXPORT_SYMBOL(phys_to_pfn_t);
+
#ifdef CONFIG_ZONE_DEVICE
+static DEFINE_MUTEX(pgmap_lock);
+static RADIX_TREE(pgmap_radix, GFP_KERNEL);
+#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
+#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
struct page_map {
struct resource res;
+ struct percpu_ref *ref;
+ struct dev_pagemap pgmap;
+ struct vmem_altmap altmap;
};
-static void devm_memremap_pages_release(struct device *dev, void *res)
+void get_zone_device_page(struct page *page)
+{
+ percpu_ref_get(page->pgmap->ref);
+}
+EXPORT_SYMBOL(get_zone_device_page);
+
+void put_zone_device_page(struct page *page)
+{
+ put_dev_pagemap(page->pgmap);
+}
+EXPORT_SYMBOL(put_zone_device_page);
+
+static void pgmap_radix_release(struct resource *res)
+{
+ resource_size_t key, align_start, align_size, align_end;
+
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ align_end = align_start + align_size - 1;
+
+ mutex_lock(&pgmap_lock);
+ for (key = res->start; key <= res->end; key += SECTION_SIZE)
+ radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+ mutex_unlock(&pgmap_lock);
+}
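
The radix tree is keyed by memory section, so every section a resource spans gets one slot. A userspace sketch of the keying and alignment math, assuming x86_64's PA_SECTION_SHIFT of 27 (128 MiB sections); the addresses are made up:

#include <inttypes.h>
#include <stdio.h>

#define PA_SECTION_SHIFT 27
#define SECTION_SIZE (1ULL << PA_SECTION_SHIFT)

int main(void)
{
	uint64_t start = 0x24a000000ULL;	/* hypothetical res->start */
	uint64_t size = 0x40000000ULL;		/* hypothetical 1 GiB range */
	uint64_t align_start = start & ~(SECTION_SIZE - 1);
	uint64_t align_size = ((start + size + SECTION_SIZE - 1)
			       & ~(SECTION_SIZE - 1)) - align_start;
	uint64_t key;

	/* One radix-tree slot per 128 MiB section the range touches. */
	for (key = align_start; key < align_start + align_size; key += SECTION_SIZE)
		printf("slot %" PRIu64 "\n", key >> PA_SECTION_SHIFT);
	return 0;
}
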
+
+static unsigned long pfn_first(struct page_map *page_map)
+{
+ struct dev_pagemap *pgmap = &page_map->pgmap;
+ const struct resource *res = &page_map->res;
+ struct vmem_altmap *altmap = pgmap->altmap;
+ unsigned long pfn;
+
+ pfn = res->start >> PAGE_SHIFT;
+ if (altmap)
+ pfn += vmem_altmap_offset(altmap);
+ return pfn;
+}
+
+static unsigned long pfn_end(struct page_map *page_map)
{
- struct page_map *page_map = res;
+ const struct resource *res = &page_map->res;
+
+ return (res->start + resource_size(res)) >> PAGE_SHIFT;
+}
+
+#define for_each_device_pfn(pfn, map) \
+ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
+
+static void devm_memremap_pages_release(struct device *dev, void *data)
+{
+ struct page_map *page_map = data;
+ struct resource *res = &page_map->res;
+ resource_size_t align_start, align_size;
+ struct dev_pagemap *pgmap = &page_map->pgmap;
+
+ if (percpu_ref_tryget_live(pgmap->ref)) {
+ dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+ percpu_ref_put(pgmap->ref);
+ }
/* pages are dead and unused, undo the arch mapping */
- arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ arch_remove_memory(align_start, align_size);
+ pgmap_radix_release(res);
+ dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
+ "%s: failed to free all reserved pages\n", __func__);
}
-void *devm_memremap_pages(struct device *dev, struct resource *res)
+/* assumes rcu_read_lock() held at entry */
+struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
{
- int is_ram = region_intersects(res->start, resource_size(res),
- "System RAM");
struct page_map *page_map;
- int error, nid;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+ return page_map ? &page_map->pgmap : NULL;
+}
+
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @res: "host memory" address range
+ * @ref: a live per-cpu reference count
+ * @altmap: optional descriptor for allocating the memmap from @res
+ *
+ * Notes:
+ * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
+ * (or devm release event).
+ *
+ * 2/ @res is expected to be a host memory range that could feasibly be
+ * treated as a "System RAM" range, i.e. not a device mmio range, but
+ * this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct resource *res,
+ struct percpu_ref *ref, struct vmem_altmap *altmap)
+{
+ resource_size_t key, align_start, align_size, align_end;
+ struct dev_pagemap *pgmap;
+ struct page_map *page_map;
+ int error, nid, is_ram;
+ unsigned long pfn;
+
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
+ - align_start;
+ is_ram = region_intersects(align_start, align_size, "System RAM");
if (is_ram == REGION_MIXED) {
WARN_ONCE(1, "%s attempted on mixed region %pr\n",
@@ -178,25 +290,124 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
+ if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
+ dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
+ __func__);
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (!ref)
+ return ERR_PTR(-EINVAL);
+
page_map = devres_alloc_node(devm_memremap_pages_release,
sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
if (!page_map)
return ERR_PTR(-ENOMEM);
+ pgmap = &page_map->pgmap;
memcpy(&page_map->res, res, sizeof(*res));
+ pgmap->dev = dev;
+ if (altmap) {
+ memcpy(&page_map->altmap, altmap, sizeof(*altmap));
+ pgmap->altmap = &page_map->altmap;
+ }
+ pgmap->ref = ref;
+ pgmap->res = &page_map->res;
+
+ mutex_lock(&pgmap_lock);
+ error = 0;
+ align_end = align_start + align_size - 1;
+ for (key = align_start; key <= align_end; key += SECTION_SIZE) {
+ struct dev_pagemap *dup;
+
+ rcu_read_lock();
+ dup = find_dev_pagemap(key);
+ rcu_read_unlock();
+ if (dup) {
+ dev_err(dev, "%s: %pr collides with mapping for %s\n",
+ __func__, res, dev_name(dup->dev));
+ error = -EBUSY;
+ break;
+ }
+ error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
+ page_map);
+ if (error) {
+ dev_err(dev, "%s: failed: %d\n", __func__, error);
+ break;
+ }
+ }
+ mutex_unlock(&pgmap_lock);
+ if (error)
+ goto err_radix;
+
nid = dev_to_node(dev);
if (nid < 0)
nid = numa_mem_id();
- error = arch_add_memory(nid, res->start, resource_size(res), true);
- if (error) {
- devres_free(page_map);
- return ERR_PTR(error);
- }
+ error = arch_add_memory(nid, align_start, align_size, true);
+ if (error)
+ goto err_add_memory;
+ for_each_device_pfn(pfn, page_map) {
+ struct page *page = pfn_to_page(pfn);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back
+ * pointer. It is a bug if a ZONE_DEVICE page is ever
+ * freed or placed on a driver-private list. Seed the
+ * storage with LIST_POISON* values.
+ */
+ list_del(&page->lru);
+ page->pgmap = pgmap;
+ }
devres_add(dev, page_map);
return __va(res->start);
+
+ err_add_memory:
+ err_radix:
+ pgmap_radix_release(res);
+ devres_free(page_map);
+ return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
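
For orientation, a hypothetical caller of the new signature might look like this. It is a driver-side sketch only: 'struct my_dev' and the release callback are made-up names, and killing the percpu_ref before the devres release fires is elided:

#include <linux/percpu-refcount.h>
#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/err.h>

struct my_dev {
	struct percpu_ref ref;
	struct resource res;	/* the "host memory" range to map */
	void *base;
};

static void my_ref_release(struct percpu_ref *ref)
{
	/* Last reference dropped: the mapping may now be torn down. */
}

static int my_dev_map(struct device *dev, struct my_dev *d)
{
	int rc;

	rc = percpu_ref_init(&d->ref, my_ref_release, 0, GFP_KERNEL);
	if (rc)
		return rc;

	/* @ref must be live here; NULL altmap, so the memmap comes from RAM */
	d->base = devm_memremap_pages(dev, &d->res, &d->ref, NULL);
	if (IS_ERR(d->base))
		return PTR_ERR(d->base);
	return 0;
}
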
+
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ /* number of pfns from base where pfn_to_page() is valid */
+ return altmap->reserve + altmap->free;
+}
+
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
+{
+ altmap->alloc -= nr_pfns;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
+{
+ /*
+ * 'memmap_start' is the virtual address for the first "struct
+ * page" in this range of the vmemmap array. In the case of
+ * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
+ * pointer arithmetic, so we can perform this to_vmem_altmap()
+ * conversion without concern for the initialization state of
+ * the struct page fields.
+ */
+ struct page *page = (struct page *) memmap_start;
+ struct dev_pagemap *pgmap;
+
+ /*
+ * Unconditionally retrieve a dev_pagemap associated with the
+ * given physical address, this is only for use in the
+ * arch_{add|remove}_memory() for setting up and tearing down
+ * the memmap.
+ */
+ rcu_read_lock();
+ pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page)));
+ rcu_read_unlock();
+
+ return pgmap ? pgmap->altmap : NULL;
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#endif /* CONFIG_ZONE_DEVICE */
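
The pointer-arithmetic claim in to_vmem_altmap() rests on the vmemmap page<->pfn identity; a sketch of the relevant definitions, in the form they take in include/asm-generic/memory_model.h for vmemmap-based sparsemem:

/*
 * With CONFIG_SPARSEMEM_VMEMMAP the whole memmap is one virtually
 * contiguous array based at 'vmemmap':
 */
#define __pfn_to_page(pfn)	(vmemmap + (pfn))
#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)

/*
 * page_to_pfn() is therefore plain pointer subtraction and never
 * dereferences the struct page -- which is why to_vmem_altmap() may run
 * against memmap entries that are not yet initialized.
 */
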
diff --git a/kernel/module.c b/kernel/module.c
index 0e5c71195..794ebe8e8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -80,15 +80,6 @@
# define debug_align(X) (X)
#endif
-/*
- * Given BASE and SIZE this macro calculates the number of pages the
- * memory regions occupies
- */
-#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
- (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
- PFN_DOWN((unsigned long)BASE) + 1) \
- : (0UL))
-
/* If this is set, the section belongs in the init part of the module */
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
@@ -108,13 +99,6 @@ static LIST_HEAD(modules);
* Use a latched RB-tree for __module_address(); this allows us to use
* RCU-sched lookups of the address from any context.
*
- * Because modules have two address ranges: init and core, we need two
- * latch_tree_nodes entries. Therefore we need the back-pointer from
- * mod_tree_node.
- *
- * Because init ranges are short lived we mark them unlikely and have placed
- * them outside the critical cacheline in struct module.
- *
* This is conditional on PERF_EVENTS || TRACING because those can really hit
* __module_address() hard by doing a lot of stack unwinding; potentially from
* NMI context.
@@ -122,24 +106,16 @@ static LIST_HEAD(modules);
static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
{
- struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
- struct module *mod = mtn->mod;
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
- if (unlikely(mtn == &mod->mtn_init))
- return (unsigned long)mod->module_init;
-
- return (unsigned long)mod->module_core;
+ return (unsigned long)layout->base;
}
static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
{
- struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
- struct module *mod = mtn->mod;
-
- if (unlikely(mtn == &mod->mtn_init))
- return (unsigned long)mod->init_size;
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
- return (unsigned long)mod->core_size;
+ return (unsigned long)layout->size;
}
static __always_inline bool
@@ -197,23 +173,23 @@ static void __mod_tree_remove(struct mod_tree_node *node)
*/
static void mod_tree_insert(struct module *mod)
{
- mod->mtn_core.mod = mod;
- mod->mtn_init.mod = mod;
+ mod->core_layout.mtn.mod = mod;
+ mod->init_layout.mtn.mod = mod;
- __mod_tree_insert(&mod->mtn_core);
- if (mod->init_size)
- __mod_tree_insert(&mod->mtn_init);
+ __mod_tree_insert(&mod->core_layout.mtn);
+ if (mod->init_layout.size)
+ __mod_tree_insert(&mod->init_layout.mtn);
}
static void mod_tree_remove_init(struct module *mod)
{
- if (mod->init_size)
- __mod_tree_remove(&mod->mtn_init);
+ if (mod->init_layout.size)
+ __mod_tree_remove(&mod->init_layout.mtn);
}
static void mod_tree_remove(struct module *mod)
{
- __mod_tree_remove(&mod->mtn_core);
+ __mod_tree_remove(&mod->core_layout.mtn);
mod_tree_remove_init(mod);
}
@@ -267,9 +243,9 @@ static void __mod_update_bounds(void *base, unsigned int size)
static void mod_update_bounds(struct module *mod)
{
- __mod_update_bounds(mod->module_core, mod->core_size);
- if (mod->init_size)
- __mod_update_bounds(mod->module_init, mod->init_size);
+ __mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
+ if (mod->init_layout.size)
+ __mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
}
#ifdef CONFIG_KGDB_KDB
@@ -1008,6 +984,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
mod->exit();
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
+ ftrace_release_mod(mod);
+
async_synchronize_full();
/* Store the name of the last unloaded module for diagnostic purposes */
@@ -1217,7 +1195,7 @@ struct module_attribute module_uevent =
static ssize_t show_coresize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->core_size);
+ return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
}
static struct module_attribute modinfo_coresize =
@@ -1226,7 +1204,7 @@ static struct module_attribute modinfo_coresize =
static ssize_t show_initsize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->init_size);
+ return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
}
static struct module_attribute modinfo_initsize =
@@ -1876,64 +1854,75 @@ static void mod_sysfs_teardown(struct module *mod)
/*
* LKM RO/NX protection: protect module's text/ro-data
* from modification and any data from execution.
+ *
+ * General layout of module is:
+ *         [text] [read-only-data] [writable data]
+ *         text_size -----^                ^               ^
+ *         ro_size ------------------------|               |
+ *         size -------------------------------------------|
+ *
+ * These values are always page-aligned (as is base)
*/
-void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
+static void frob_text(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
{
- unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
- unsigned long end_pfn = PFN_DOWN((unsigned long)end);
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base,
+ layout->text_size >> PAGE_SHIFT);
+}
- if (end_pfn > begin_pfn)
- set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+static void frob_rodata(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->text_size,
+ (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
}
-static void set_section_ro_nx(void *base,
- unsigned long text_size,
- unsigned long ro_size,
- unsigned long total_size)
+static void frob_writable_data(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
{
- /* begin and end PFNs of the current subsection */
- unsigned long begin_pfn;
- unsigned long end_pfn;
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->ro_size,
+ (layout->size - layout->ro_size) >> PAGE_SHIFT);
+}
- /*
- * Set RO for module text and RO-data:
- * - Always protect first page.
- * - Do not protect last partial page.
- */
- if (ro_size > 0)
- set_page_attributes(base, base + ro_size, set_memory_ro);
+/* livepatching wants to disable read-only so it can frob module. */
+void module_disable_ro(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_rw);
+ frob_text(&mod->init_layout, set_memory_rw);
+ frob_rodata(&mod->init_layout, set_memory_rw);
+}
- /*
- * Set NX permissions for module data:
- * - Do not protect first partial page.
- * - Always protect last page.
- */
- if (total_size > text_size) {
- begin_pfn = PFN_UP((unsigned long)base + text_size);
- end_pfn = PFN_UP((unsigned long)base + total_size);
- if (end_pfn > begin_pfn)
- set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
- }
+void module_enable_ro(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_ro);
+ frob_rodata(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_ro);
+ frob_rodata(&mod->init_layout, set_memory_ro);
}
-static void unset_module_core_ro_nx(struct module *mod)
+static void module_enable_nx(const struct module *mod)
{
- set_page_attributes(mod->module_core + mod->core_text_size,
- mod->module_core + mod->core_size,
- set_memory_x);
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_ro_size,
- set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_nx);
+ frob_writable_data(&mod->core_layout, set_memory_nx);
+ frob_rodata(&mod->init_layout, set_memory_nx);
+ frob_writable_data(&mod->init_layout, set_memory_nx);
}
-static void unset_module_init_ro_nx(struct module *mod)
+static void module_disable_nx(const struct module *mod)
{
- set_page_attributes(mod->module_init + mod->init_text_size,
- mod->module_init + mod->init_size,
- set_memory_x);
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_ro_size,
- set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_x);
+ frob_writable_data(&mod->core_layout, set_memory_x);
+ frob_rodata(&mod->init_layout, set_memory_x);
+ frob_writable_data(&mod->init_layout, set_memory_x);
}
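
A worked example of how the three helpers tile the layout, assuming 4 KiB pages and hypothetical sizes text_size = 0x3000, ro_size = 0x5000, size = 0x8000:

frob_text():          [base,          base + 0x3000)  ->  pages 0-2, text
frob_rodata():        [base + 0x3000, base + 0x5000)  ->  pages 3-4, ro-data
frob_writable_data(): [base + 0x5000, base + 0x8000)  ->  pages 5-7, data

Each boundary is page-aligned and the three regions cover the layout exactly once, which is why module_enable_ro()/module_enable_nx() can be composed from these helpers with no overlap.
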
/* Iterate through all modules and set each module's text as RW */
@@ -1945,16 +1934,9 @@ void set_all_modules_text_rw(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_rw);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_rw);
- }
+
+ frob_text(&mod->core_layout, set_memory_rw);
+ frob_text(&mod->init_layout, set_memory_rw);
}
mutex_unlock(&module_mutex);
}
@@ -1968,23 +1950,25 @@ void set_all_modules_text_ro(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_ro);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_ro);
- }
+
+ frob_text(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_ro);
}
mutex_unlock(&module_mutex);
}
+
+static void disable_ro_nx(const struct module_layout *layout)
+{
+ frob_text(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_x);
+ frob_writable_data(layout, set_memory_x);
+}
+
#else
-static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
-static void unset_module_core_ro_nx(struct module *mod) { }
-static void unset_module_init_ro_nx(struct module *mod) { }
+static void disable_ro_nx(const struct module_layout *layout) { }
+static void module_enable_nx(const struct module *mod) { }
+static void module_disable_nx(const struct module *mod) { }
#endif
void __weak module_memfree(void *module_region)
@@ -2036,19 +2020,19 @@ static void free_module(struct module *mod)
synchronize_sched();
mutex_unlock(&module_mutex);
- /* This may be NULL, but that's OK */
- unset_module_init_ro_nx(mod);
+ /* This may be empty, but that's OK */
+ disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
+ module_memfree(mod->init_layout.base);
kfree(mod->args);
percpu_modfree(mod);
/* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->module_core, mod->core_size);
+ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
/* Finally, free the core (containing the module structure) */
- unset_module_core_ro_nx(mod);
- module_memfree(mod->module_core);
+ disable_ro_nx(&mod->core_layout);
+ module_memfree(mod->core_layout.base);
#ifdef CONFIG_MPU
update_protections(current->mm);
@@ -2251,20 +2235,20 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| s->sh_entsize != ~0UL
|| strstarts(sname, ".init"))
continue;
- s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
+ s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->core_size = debug_align(mod->core_size);
- mod->core_text_size = mod->core_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.text_size = mod->core_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->core_size = debug_align(mod->core_size);
- mod->core_ro_size = mod->core_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.ro_size = mod->core_layout.size;
break;
case 3: /* whole core */
- mod->core_size = debug_align(mod->core_size);
+ mod->core_layout.size = debug_align(mod->core_layout.size);
break;
}
}
@@ -2280,21 +2264,21 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| s->sh_entsize != ~0UL
|| !strstarts(sname, ".init"))
continue;
- s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
+ s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
| INIT_OFFSET_MASK);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->init_size = debug_align(mod->init_size);
- mod->init_text_size = mod->init_size;
+ mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.text_size = mod->init_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->init_size = debug_align(mod->init_size);
- mod->init_ro_size = mod->init_size;
+ mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.ro_size = mod->init_layout.size;
break;
case 3: /* whole init */
- mod->init_size = debug_align(mod->init_size);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
break;
}
}
@@ -2404,7 +2388,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
if (sym->st_shndx == SHN_UNDEF)
return 'U';
- if (sym->st_shndx == SHN_ABS)
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
return 'a';
if (sym->st_shndx >= SHN_LORESERVE)
return '?';
@@ -2433,7 +2417,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
+ unsigned int shnum, unsigned int pcpundx)
{
const Elf_Shdr *sec;
@@ -2442,6 +2426,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
|| !src->st_name)
return false;
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
sec = sechdrs + src->st_shndx;
if (!(sec->sh_flags & SHF_ALLOC)
#ifndef CONFIG_KALLSYMS_ALL
@@ -2469,7 +2458,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
+ symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
info->index.sym) | INIT_OFFSET_MASK;
pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
@@ -2479,30 +2468,31 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Compute total space required for the core symbols' strtab. */
for (ndst = i = 0; i < nsrc; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
strtab_size += strlen(&info->strtab[src[i].st_name])+1;
ndst++;
}
}
/* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->core_size += strtab_size;
- mod->core_size = debug_align(mod->core_size);
+ info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod->core_layout.size += strtab_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
+ strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
info->index.str) | INIT_OFFSET_MASK;
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
/* We'll tack temporary mod_kallsyms on the end. */
- mod->init_size = ALIGN(mod->init_size,
- __alignof__(struct mod_kallsyms));
- info->mod_kallsyms_init_off = mod->init_size;
- mod->init_size += sizeof(struct mod_kallsyms);
- mod->init_size = debug_align(mod->init_size);
+ mod->init_layout.size = ALIGN(mod->init_layout.size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod->init_layout.size;
+ mod->init_layout.size += sizeof(struct mod_kallsyms);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
}
/*
@@ -2519,7 +2509,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
/* Set up to point into init section. */
- mod->kallsyms = mod->module_init + info->mod_kallsyms_init_off;
+ mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
mod->kallsyms->symtab = (void *)symsec->sh_addr;
mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
@@ -2532,12 +2522,13 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
= elf_type(&mod->kallsyms->symtab[i], info);
/* Now populate the cut down core kallsyms for after init. */
- mod->core_kallsyms.symtab = dst = mod->module_core + info->symoffs;
- mod->core_kallsyms.strtab = s = mod->module_core + info->stroffs;
+ mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
+ mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
src = mod->kallsyms->symtab;
for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
dst[ndst] = src[i];
dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
@@ -2983,7 +2974,7 @@ static int move_module(struct module *mod, struct load_info *info)
void *ptr;
/* Do the allocs. */
- ptr = module_alloc(mod->core_size);
+ ptr = module_alloc(mod->core_layout.size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. Just mark it as not being a
@@ -2993,11 +2984,11 @@ static int move_module(struct module *mod, struct load_info *info)
if (!ptr)
return -ENOMEM;
- memset(ptr, 0, mod->core_size);
- mod->module_core = ptr;
+ memset(ptr, 0, mod->core_layout.size);
+ mod->core_layout.base = ptr;
- if (mod->init_size) {
- ptr = module_alloc(mod->init_size);
+ if (mod->init_layout.size) {
+ ptr = module_alloc(mod->init_layout.size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. This block doesn't need to be
@@ -3006,13 +2997,13 @@ static int move_module(struct module *mod, struct load_info *info)
*/
kmemleak_ignore(ptr);
if (!ptr) {
- module_memfree(mod->module_core);
+ module_memfree(mod->core_layout.base);
return -ENOMEM;
}
- memset(ptr, 0, mod->init_size);
- mod->module_init = ptr;
+ memset(ptr, 0, mod->init_layout.size);
+ mod->init_layout.base = ptr;
} else
- mod->module_init = NULL;
+ mod->init_layout.base = NULL;
/* Transfer each section which specifies SHF_ALLOC */
pr_debug("final section addresses:\n");
@@ -3024,10 +3015,10 @@ static int move_module(struct module *mod, struct load_info *info)
continue;
if (shdr->sh_entsize & INIT_OFFSET_MASK)
- dest = mod->module_init
+ dest = mod->init_layout.base
+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
else
- dest = mod->module_core + shdr->sh_entsize;
+ dest = mod->core_layout.base + shdr->sh_entsize;
if (shdr->sh_type != SHT_NOBITS)
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
@@ -3089,12 +3080,12 @@ static void flush_module_icache(const struct module *mod)
* Do it before processing of module parameters, so the module
* can provide parameter accessor functions of its own.
*/
- if (mod->module_init)
- flush_icache_range((unsigned long)mod->module_init,
- (unsigned long)mod->module_init
- + mod->init_size);
- flush_icache_range((unsigned long)mod->module_core,
- (unsigned long)mod->module_core + mod->core_size);
+ if (mod->init_layout.base)
+ flush_icache_range((unsigned long)mod->init_layout.base,
+ (unsigned long)mod->init_layout.base
+ + mod->init_layout.size);
+ flush_icache_range((unsigned long)mod->core_layout.base,
+ (unsigned long)mod->core_layout.base + mod->core_layout.size);
set_fs(old_fs);
}
@@ -3152,8 +3143,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
- module_memfree(mod->module_core);
+ module_memfree(mod->init_layout.base);
+ module_memfree(mod->core_layout.base);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3240,7 +3231,7 @@ static noinline int do_init_module(struct module *mod)
ret = -ENOMEM;
goto fail;
}
- freeinit->module_init = mod->module_init;
+ freeinit->module_init = mod->init_layout.base;
/*
* We want to find out whether @mod uses async during init. Clear
@@ -3297,12 +3288,12 @@ static noinline int do_init_module(struct module *mod)
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
mod_tree_remove_init(mod);
- unset_module_init_ro_nx(mod);
+ disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
- mod->module_init = NULL;
- mod->init_size = 0;
- mod->init_ro_size = 0;
- mod->init_text_size = 0;
+ mod->init_layout.base = NULL;
+ mod->init_layout.size = 0;
+ mod->init_layout.ro_size = 0;
+ mod->init_layout.text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
@@ -3324,6 +3315,7 @@ fail:
module_put(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
+ ftrace_release_mod(mod);
free_module(mod);
wake_up_all(&module_wq);
return ret;
@@ -3391,23 +3383,16 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
- /* Set RO and NX regions for core */
- set_section_ro_nx(mod->module_core,
- mod->core_text_size,
- mod->core_ro_size,
- mod->core_size);
-
- /* Set RO and NX regions for init */
- set_section_ro_nx(mod->module_init,
- mod->init_text_size,
- mod->init_ro_size,
- mod->init_size);
+ /* Set RO and NX regions */
+ module_enable_ro(mod);
+ module_enable_nx(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
mod->state = MODULE_STATE_COMING;
mutex_unlock(&module_mutex);
+ ftrace_module_enable(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
return 0;
@@ -3566,8 +3551,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
MODULE_STATE_GOING, mod);
/* we can't deallocate the module until we clear memory protection */
- unset_module_init_ro_nx(mod);
- unset_module_core_ro_nx(mod);
+ module_disable_ro(mod);
+ module_disable_nx(mod);
ddebug_cleanup:
dynamic_debug_remove(info->debug);
@@ -3596,7 +3581,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
*/
ftrace_release_mod(mod);
/* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->module_core, mod->core_size);
+ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
module_deallocate(mod, info);
free_copy:
@@ -3680,9 +3665,9 @@ static const char *get_ksymbol(struct module *mod,
/* At worse, next value is at end of module */
if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->module_init+mod->init_text_size;
+ nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
else
- nextval = (unsigned long)mod->module_core+mod->core_text_size;
+ nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
/* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
@@ -3935,7 +3920,7 @@ static int m_show(struct seq_file *m, void *p)
return 0;
seq_printf(m, "%s %u",
- mod->name, mod->init_size + mod->core_size);
+ mod->name, mod->init_layout.size + mod->core_layout.size);
print_unload_info(m, mod);
/* Informative for users. */
@@ -3944,7 +3929,7 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%pK", mod->module_core);
+ seq_printf(m, " 0x%pK", mod->core_layout.base);
/* Taints info */
if (mod->taints)
@@ -4087,8 +4072,8 @@ struct module *__module_text_address(unsigned long addr)
struct module *mod = __module_address(addr);
if (mod) {
/* Make sure it's within the text section. */
- if (!within(addr, mod->module_init, mod->init_text_size)
- && !within(addr, mod->module_core, mod->core_text_size))
+ if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
+ && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
mod = NULL;
}
return mod;
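
For reference, the within() helper these text-section checks rely on is the long-standing module.c one, roughly:

  static inline int within(unsigned long addr, void *start, unsigned long size)
  {
          return ((void *)addr >= start && (void *)addr < start + size);
  }
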
diff --git a/kernel/panic.c b/kernel/panic.c
index 41e2b54f3..d96469de7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -61,6 +61,17 @@ void __weak panic_smp_self_stop(void)
cpu_relax();
}
+/*
+ * Stop ourselves in NMI context if another CPU has already panicked. Arch code
+ * may override this to prepare for crash dumping, e.g. save regs info.
+ */
+void __weak nmi_panic_self_stop(struct pt_regs *regs)
+{
+ panic_smp_self_stop();
+}
+
+atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -71,17 +82,17 @@ void __weak panic_smp_self_stop(void)
*/
void panic(const char *fmt, ...)
{
- static DEFINE_SPINLOCK(panic_lock);
static char buf[1024];
va_list args;
long i, i_next = 0;
int state = 0;
+ int old_cpu, this_cpu;
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
* there is nothing to prevent an interrupt handler (that runs
- * after the panic_lock is acquired) from invoking panic again.
+ * after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
@@ -94,8 +105,16 @@ void panic(const char *fmt, ...)
* multiple parallel invocations of panic, all other CPUs either
* stop themselves or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
+ *
+ * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
+ * comes here, so go ahead.
+ * `old_cpu == this_cpu' means we came from nmi_panic() which sets
+ * panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
- if (!spin_trylock(&panic_lock))
+ this_cpu = raw_smp_processor_id();
+ old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
+
+ if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
panic_smp_self_stop();
console_verbose();
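
The atomic_cmpxchg() on panic_cpu is what makes panic() safe to reach from NMI context: PANIC_CPU_INVALID (defined as -1 on the header side of this series, not shown here) means no CPU has claimed the panic yet. A sketch of the companion nmi_panic() helper that the same series builds on this variable:

  /* Sketch of the nmi_panic() side of this mechanism, per the series. */
  #define nmi_panic(regs, fmt, ...)                                       \
  do {                                                                    \
          int old_cpu, cpu;                                               \
                                                                          \
          cpu = raw_smp_processor_id();                                   \
          old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);   \
                                                                          \
          if (old_cpu == PANIC_CPU_INVALID)                               \
                  panic(fmt, ##__VA_ARGS__);                              \
          else if (old_cpu != cpu)                                        \
                  nmi_panic_self_stop(regs);                              \
  } while (0)
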
@@ -117,9 +136,11 @@ void panic(const char *fmt, ...)
* everything else.
* If we want to run this after calling panic_notifiers, pass
* the "crash_kexec_post_notifiers" option to the kernel.
+ *
+ * Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!crash_kexec_post_notifiers)
- crash_kexec(NULL);
+ __crash_kexec(NULL);
/*
* Note smp_send_stop is the usual smp shutdown function, which
@@ -142,9 +163,11 @@ void panic(const char *fmt, ...)
* panic_notifiers and dumping kmsg before kdump.
* Note: since some panic_notifiers can make crashed kernel
* more unstable, it can increase risks of the kdump failure too.
+ *
+ * Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (crash_kexec_post_notifiers)
- crash_kexec(NULL);
+ __crash_kexec(NULL);
bust_spinlocks(0);
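
__crash_kexec() is the raw, unguarded entry point. The crash_kexec() wrapper (see the kernel/kexec_core.c portion of this series) takes the panic_cpu ticket itself, so panic(), which already holds it, must bypass the wrapper to avoid stopping itself. Roughly, as a sketch rather than the verbatim code:

  void crash_kexec(struct pt_regs *regs)
  {
          int old_cpu, this_cpu;

          /*
           * Only one CPU may execute the crash_kexec() path, as with
           * panic(); otherwise parallel panic()/crash_kexec() calls
           * could stop each other.
           */
          this_cpu = raw_smp_processor_id();
          old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
          if (old_cpu == PANIC_CPU_INVALID) {
                  /* This is the 1st CPU to come here, so go ahead. */
                  __crash_kexec(regs);

                  /* Allow a later panic()/crash_kexec() to proceed. */
                  atomic_set(&panic_cpu, PANIC_CPU_INVALID);
          }
  }
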
diff --git a/kernel/pid.c b/kernel/pid.c
index 78b3d9f80..4d73a834c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -588,7 +588,7 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
- /* Veryify no one has done anything silly */
+ /* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
/* bump default and minimum pid_max based on number of cpus */
@@ -604,5 +604,5 @@ void __init pidmap_init(void)
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
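
Adding SLAB_ACCOUNT here charges every pid allocation to the allocating task's memory cgroup, closing a hole where user-triggered kernel objects escaped memcg limits. Opting a cache in is a one-flag change at creation time; a minimal, purely illustrative sketch ('struct foo' is not from this patch):

  /* Hypothetical cache whose objects are created on user demand. */
  struct foo {
          int bar;
  };

  static struct kmem_cache *foo_cachep;

  static int __init foo_cache_init(void)
  {
          foo_cachep = KMEM_CACHE(foo, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
          return foo_cachep ? 0 : -ENOMEM;
  }
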
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9e2ee0cb1..68d3ebc12 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -101,284 +101,6 @@ config PM_STD_PARTITION
suspended image to. It will simply pick the first available swap
device.
-menuconfig TOI_CORE
- bool "Enhanced Hibernation (TuxOnIce)"
- depends on HIBERNATION
- default y
- ---help---
- TuxOnIce is the 'new and improved' suspend support.
-
- See the TuxOnIce home page (tuxonice.net)
- for FAQs, HOWTOs and other documentation.
-
- comment "Image Storage (you need at least one allocator)"
- depends on TOI_CORE
-
- config TOI_FILE
- bool "File Allocator"
- depends on TOI_CORE
- default y
- ---help---
- This option enables support for storing an image in a
- simple file. You might want this if your swap is
- sometimes full enough that you don't have enough spare
- space to store an image.
-
- config TOI_SWAP
- bool "Swap Allocator"
- depends on TOI_CORE && SWAP
- default y
- ---help---
- This option enables support for storing an image in your
- swap space.
-
- comment "General Options"
- depends on TOI_CORE
-
- config TOI_PRUNE
- bool "Image pruning support"
- depends on TOI_CORE && CRYPTO && BROKEN
- default y
- ---help---
- This option adds support for using cryptoapi hashing
- algorithms to identify pages with the same content. We
- then write a much smaller pointer to the first copy of
- the data instead of a complete (perhaps compressed)
- additional copy.
-
- You probably want this, so say Y here.
-
- comment "No image pruning support available without Cryptoapi support."
- depends on TOI_CORE && !CRYPTO
-
- config TOI_CRYPTO
- bool "Compression support"
- depends on TOI_CORE && CRYPTO
- default y
- ---help---
- This option adds support for using cryptoapi compression
- algorithms. Compression is particularly useful as it can
- more than double your suspend and resume speed (depending
- upon how well your image compresses).
-
- You probably want this, so say Y here.
-
- comment "No compression support available without Cryptoapi support."
- depends on TOI_CORE && !CRYPTO
-
- config TOI_USERUI
- bool "Userspace User Interface support"
- depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE)
- default y
- ---help---
- This option enables support for a userspace-based user interface
- to TuxOnIce, which allows you to have a nice display while suspending
- and resuming, and also enables features such as pressing escape to
- cancel a cycle or interactive debugging.
-
- config TOI_USERUI_DEFAULT_PATH
- string "Default userui program location"
- default "/usr/local/sbin/tuxoniceui_text"
- depends on TOI_USERUI
- ---help---
- This entry allows you to specify a default path to the userui binary.
-
- config TOI_DEFAULT_IMAGE_SIZE_LIMIT
- int "Default image size limit"
- range -2 65536
- default "-2"
- depends on TOI_CORE
- ---help---
- This entry allows you to specify a default image size limit. It can
- be overridden at run-time using /sys/power/tuxonice/image_size_limit.
-
- config TOI_KEEP_IMAGE
- bool "Allow Keep Image Mode"
- depends on TOI_CORE
- ---help---
- This option allows you to keep an image and reuse it. It is intended
- __ONLY__ for use with systems where all filesystems are mounted read-
- only (kiosks, for example). To use it, compile this option in and boot
- normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend.
- When you resume, the image will not be removed. You will be unable to turn
- off swap partitions (assuming you are using the swap allocator), but future
- suspends simply do a power-down. The image can be updated using the
- kernel command line parameter suspend_act= to turn off the keep image
- bit. Keep image mode is a little less user friendly on purpose - it
- should not be used without thought!
-
- config TOI_INCREMENTAL
- bool "Incremental Image Support"
- depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE
- default n
- ---help---
- This option enables work in progress toward using dirty page
- tracking to record changes to pages. It is hoped that
- this will be an initial step toward implementing storing just
- the differences between consecutive images, which will
- increase the amount of storage needed for the image, but also
- increase the speed at which writing an image occurs and
- reduce the wear and tear on drives.
-
- At the moment, all that is implemented is the first step of keeping
- an existing image and then comparing it to the contents in memory
- (by setting /sys/power/tuxonice/verify_image to 1 and triggering a
- (fake) resume) to see what the page change tracking should find to be
- different. If you have verify_image set to 1, TuxOnIce will automatically
- invalidate the old image when you next try to hibernate, so there's no
- greater chance of disk corruption than normal.
-
- comment "No incremental image support available without Keep Image support."
- depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT
-
- config TOI_REPLACE_SWSUSP
- bool "Replace swsusp by default"
- default y
- depends on TOI_CORE
- ---help---
- TuxOnIce can replace swsusp. This option makes that the default state,
- requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want
- to use the vanilla kernel functionality. Note that your initrd/ramfs will
- need to do this before trying to resume, too.
- With overriding swsusp enabled, echoing disk to /sys/power/state will
- start a TuxOnIce cycle. If resume= doesn't specify an allocator and both
- the swap and file allocators are compiled in, the swap allocator will be
- used by default.
-
- config TOI_IGNORE_LATE_INITCALL
- bool "Wait for initrd/ramfs to run, by default"
- default n
- depends on TOI_CORE
- ---help---
- When booting, TuxOnIce can check for an image and start to resume prior
- to any initrd/ramfs running (via a late initcall).
-
- If you don't have an initrd/ramfs, this is what you want to happen -
- otherwise you won't be able to safely resume. You should set this option
- to 'No'.
-
- If, however, you want your initrd/ramfs to run anyway before resuming,
- you need to tell TuxOnIce to ignore that earlier opportunity to resume.
- This can be done either by using this compile time option, or by
- overriding this option with the boot-time parameter toi_initramfs_resume_only=1.
-
- Note that if TuxOnIce can't resume at the earlier opportunity, the
- value of this option won't matter - the initramfs/initrd (if any) will
- run anyway.
-
- menuconfig TOI_CLUSTER
- bool "Cluster support"
- default n
- depends on TOI_CORE && NET && BROKEN
- ---help---
- Support for linking multiple machines in a cluster so that they suspend
- and resume together.
-
- config TOI_DEFAULT_CLUSTER_INTERFACE
- string "Default cluster interface"
- depends on TOI_CLUSTER
- ---help---
- The default interface on which to communicate with other nodes in
- the cluster.
-
- If no value is set here, cluster support will be disabled by default.
-
- config TOI_DEFAULT_CLUSTER_KEY
- string "Default cluster key"
- default "Default"
- depends on TOI_CLUSTER
- ---help---
- The default key used by this node. All nodes in the same cluster
- have the same key. Multiple clusters may coexist on the same lan
- by using different values for this key.
-
- config TOI_CLUSTER_IMAGE_TIMEOUT
- int "Timeout when checking for image"
- default 15
- depends on TOI_CLUSTER
- ---help---
- Timeout (seconds) before continuing to boot when waiting to see
- whether other nodes might have an image. Set to -1 to wait
- indefinitely. If WAIT_UNTIL_NODES is non-zero, we might continue
- booting sooner than this timeout.
-
- config TOI_CLUSTER_WAIT_UNTIL_NODES
- int "Nodes without image before continuing"
- default 0
- depends on TOI_CLUSTER
- ---help---
- When booting and no image is found, we wait to see if other nodes
- have an image before continuing to boot. This value lets us
- continue after seeing a certain number of nodes without an image,
- instead of continuing to wait for the timeout. Set to 0 to only
- use the timeout.
-
- config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
- string "Default pre-hibernate script"
- depends on TOI_CLUSTER
- ---help---
- The default script to be called when starting to hibernate.
-
- config TOI_DEFAULT_CLUSTER_POST_HIBERNATE
- string "Default post-hibernate script"
- depends on TOI_CLUSTER
- ---help---
- The default script to be called after resuming from hibernation.
-
- config TOI_DEFAULT_WAIT
- int "Default waiting time for emergency boot messages"
- default "25"
- range -1 32768
- depends on TOI_CORE
- help
- TuxOnIce can display warnings very early in the process of resuming,
- if (for example) it appears that you have booted a kernel that doesn't
- match an image on disk. It can then give you the opportunity to either
- continue booting that kernel, or reboot the machine. This option can be
- used to control how long to wait in such circumstances. -1 means wait
- forever. 0 means don't wait at all (do the default action, which will
- generally be to continue booting and remove the image). Values of 1 or
- more indicate a number of seconds (up to 255) to wait before doing the
- default.
-
- config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE
- int "Default extra pages allowance"
- default "2000"
- range 500 32768
- depends on TOI_CORE
- help
- This value controls the default for the allowance TuxOnIce makes for
- drivers to allocate extra memory during the atomic copy. The default
- value of 2000 will be okay in most cases. If you are using
- DRI, the easiest way to find what value to use is to try to hibernate
- and look at how many pages were actually needed in the sysfs entry
- /sys/power/tuxonice/debug_info (first number on the last line), adding
- a little extra because the value is not always the same.
-
- config TOI_CHECKSUM
- bool "Checksum pageset2"
- default n
- depends on TOI_CORE
- select CRYPTO
- select CRYPTO_ALGAPI
- select CRYPTO_MD4
- ---help---
- Adds support for checksumming pageset2 pages, to ensure you really get an
- atomic copy. Since some filesystems (XFS especially) change metadata even
- when there's no other activity, we need this to check for pages that have
- been changed while we were saving the page cache. If your debugging output
- always says no pages were resaved, you may be able to safely disable this
- option.
-
-config TOI
- bool
- depends on TOI_CORE!=n
- default y
-
-config TOI_ZRAM_SUPPORT
- def_bool y
- depends on TOI && ZRAM!=n
-
config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
@@ -513,7 +235,7 @@ config PM_TRACE_RTC
config APM_EMULATION
tristate "Advanced Power Management Emulation"
- depends on PM && SYS_SUPPORTS_APM_EMULATION
+ depends on SYS_SUPPORTS_APM_EMULATION
help
APM is a BIOS specification for saving power using several different
techniques. This is mostly useful for battery powered laptops with
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 82c4795e8..cb880a14c 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,38 +1,6 @@
ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
-tuxonice_core-y := tuxonice_modules.o
-
-obj-$(CONFIG_TOI) += tuxonice_builtin.o
-obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o \
- tuxonice_copy_before_write.o
-
-tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o
-
-# Compile these in after allocation debugging, if used.
-
-tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \
- tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \
- tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \
- tuxonice_power_off.o tuxonice_atomic_copy.o
-
-tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o
-
-tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o
-
-obj-$(CONFIG_TOI_CORE) += tuxonice_core.o
-obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o
-obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o
-
-tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \
- tuxonice_bio_signature.o
-
-obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o
-obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o
-obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o
-
-obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o
-
obj-y += qos.o
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 153e51db5..b7342a24f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -31,7 +31,7 @@
#include <linux/ktime.h>
#include <trace/events/power.h>
-#include "tuxonice.h"
+#include "power.h"
static int nocompress;
@@ -39,7 +39,7 @@ static int noresume;
static int nohibernate;
static int resume_wait;
static unsigned int resume_delay;
-char resume_file[256] = CONFIG_PM_STD_PARTITION;
+static char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
__visible int in_suspend __nosavedata;
@@ -123,7 +123,7 @@ static int hibernation_test(int level) { return 0; }
* platform_begin - Call platform to start hibernation.
* @platform_mode: Whether or not to use the platform driver.
*/
-int platform_begin(int platform_mode)
+static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->begin() : 0;
@@ -133,7 +133,7 @@ int platform_begin(int platform_mode)
* platform_end - Call platform to finish transition to the working state.
* @platform_mode: Whether or not to use the platform driver.
*/
-void platform_end(int platform_mode)
+static void platform_end(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->end();
@@ -147,7 +147,7 @@ void platform_end(int platform_mode)
* if so configured, and return an error code if that fails.
*/
-int platform_pre_snapshot(int platform_mode)
+static int platform_pre_snapshot(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_snapshot() : 0;
@@ -162,7 +162,7 @@ int platform_pre_snapshot(int platform_mode)
*
* This routine is called on one CPU with interrupts disabled.
*/
-void platform_leave(int platform_mode)
+static void platform_leave(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->leave();
@@ -177,7 +177,7 @@ void platform_leave(int platform_mode)
*
* This routine must be called after platform_prepare().
*/
-void platform_finish(int platform_mode)
+static void platform_finish(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->finish();
@@ -193,7 +193,7 @@ void platform_finish(int platform_mode)
* If the restore fails after this function has been called,
* platform_restore_cleanup() must be called.
*/
-int platform_pre_restore(int platform_mode)
+static int platform_pre_restore(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_restore() : 0;
@@ -210,7 +210,7 @@ int platform_pre_restore(int platform_mode)
* function must be called too, regardless of the result of
* platform_pre_restore().
*/
-void platform_restore_cleanup(int platform_mode)
+static void platform_restore_cleanup(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->restore_cleanup();
@@ -220,7 +220,7 @@ void platform_restore_cleanup(int platform_mode)
* platform_recover - Recover from a failure to suspend devices.
* @platform_mode: Whether or not to use the platform driver.
*/
-void platform_recover(int platform_mode)
+static void platform_recover(int platform_mode)
{
if (platform_mode && hibernation_ops && hibernation_ops->recover)
hibernation_ops->recover();
@@ -648,9 +648,6 @@ int hibernate(void)
{
int error;
- if (test_action_state(TOI_REPLACE_SWSUSP))
- return try_tuxonice_hibernate();
-
if (!hibernation_available()) {
pr_debug("PM: Hibernation not available.\n");
return -EPERM;
@@ -740,19 +737,11 @@ int hibernate(void)
* attempts to recover gracefully and make the kernel return to the normal mode
* of operation.
*/
-int software_resume(void)
+static int software_resume(void)
{
int error;
unsigned int flags;
- resume_attempted = 1;
-
- /*
- * We can't know (until an image header - if any - is loaded), whether
- * we did override swsusp. We therefore ensure that both are tried.
- */
- try_tuxonice_resume();
-
/*
* If the user said "noresume".. bail out early.
*/
@@ -1139,7 +1128,6 @@ static int __init hibernate_setup(char *str)
static int __init noresume_setup(char *str)
{
noresume = 1;
- set_toi_state(TOI_NORESUME_SPECIFIED);
return 1;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b2dd4d999..27946975e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
}
-static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-power_attr(pm_wakeup_irq);
+power_attr_ro(pm_wakeup_irq);
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
@@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
return show_trace_dev_match(buf, PAGE_SIZE);
}
-static ssize_t
-pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-
-power_attr(pm_trace_dev_match);
+power_attr_ro(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b5c9efb36..efe1b3b17 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -36,12 +36,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
return arch_hibernation_header_restore(info) ?
"architecture specific data" : NULL;
}
-#else
-extern char *check_image_kernel(struct swsusp_info *info);
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-extern int init_header(struct swsusp_info *info);
-extern char resume_file[256];
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -81,7 +77,14 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
-extern struct pbe *restore_pblist;
+#define power_attr_ro(_name) \
+static struct kobj_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = S_IRUGO, \
+ }, \
+ .show = _name##_show, \
+}
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
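
With power_attr_ro() available, a read-only sysfs attribute needs only its show() method; the S_IRUGO mode means the write path never exists, so the stub store() methods returning -EINVAL (removed from main.c above) become unnecessary. A hypothetical attribute ('foo' is illustrative) would now be declared like this:

  static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
                          char *buf)
  {
          return sprintf(buf, "%d\n", 42);
  }

  power_attr_ro(foo);
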
@@ -266,31 +269,6 @@ static inline void suspend_thaw_processes(void)
}
#endif
-extern struct page *saveable_page(struct zone *z, unsigned long p);
-#ifdef CONFIG_HIGHMEM
-struct page *saveable_highmem_page(struct zone *z, unsigned long p);
-#else
-static
-inline void *saveable_highmem_page(struct zone *z, unsigned long p)
-{
- return NULL;
-}
-#endif
-
-#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe))
-extern struct list_head nosave_regions;
-
-/**
- * This structure represents a range of page frames the contents of which
- * should not be saved during the suspend.
- */
-
-struct nosave_region {
- struct list_head list;
- unsigned long start_pfn;
- unsigned long end_pfn;
-};
-
#ifdef CONFIG_PM_AUTOSLEEP
/* kernel/power/autosleep.c */
@@ -317,10 +295,3 @@ extern int pm_wake_lock(const char *buf);
extern int pm_wake_unlock(const char *buf);
#endif /* !CONFIG_PM_WAKELOCKS */
-
-#ifdef CONFIG_TOI
-unsigned long toi_get_nonconflicting_page(void);
-#define BM_END_OF_MAP (~0UL)
-#else
-#define toi_get_nonconflicting_page() (0)
-#endif
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 542163a01..3a9706043 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -36,9 +36,6 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_alloc.h"
#include "power.h"
static int swsusp_page_is_free(struct page *);
@@ -101,9 +98,6 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
{
void *res;
- if (toi_running)
- return (void *) toi_get_nonconflicting_page();
-
res = (void *)get_zeroed_page(gfp_mask);
if (safe_needed)
while (res && swsusp_page_is_free(virt_to_page(res))) {
@@ -149,11 +143,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
page = virt_to_page(addr);
- if (toi_running) {
- toi__free_page(29, page);
- return;
- }
-
swsusp_unset_page_forbidden(page);
if (clear_nosave_free)
swsusp_unset_page_free(page);
@@ -313,15 +302,13 @@ struct bm_position {
int node_bit;
};
-#define BM_POSITION_SLOTS (NR_CPUS * 2)
-
struct memory_bitmap {
struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
* bitmap objects and bitmap block
* objects
*/
- struct bm_position cur[BM_POSITION_SLOTS]; /* most recently used bit position */
+ struct bm_position cur; /* most recently used bit position */
};
/* Functions that operate on memory bitmaps */
@@ -486,39 +473,16 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
free_image_page(node->data, clear_nosave_free);
}
-void memory_bm_position_reset(struct memory_bitmap *bm)
+static void memory_bm_position_reset(struct memory_bitmap *bm)
{
- int index;
-
- for (index = 0; index < BM_POSITION_SLOTS; index++) {
- bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+ bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
list);
- bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
- bm->cur[index].node_pfn = 0;
- bm->cur[index].node_bit = 0;
- }
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
}
-static void memory_bm_clear_current(struct memory_bitmap *bm, int index);
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
-
-/**
- * memory_bm_clear
- * @param bm - The bitmap to clear
- *
- * Only run while single threaded - locking not needed
- */
-void memory_bm_clear(struct memory_bitmap *bm)
-{
- memory_bm_position_reset(bm);
-
- while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) {
- memory_bm_clear_current(bm, 0);
- }
-
- memory_bm_position_reset(bm);
-}
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
struct mem_extent {
@@ -631,8 +595,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
}
bm->p_list = ca.chain;
-
- memory_bm_position_reset(bm);
+ memory_bm_position_reset(bm);
Exit:
free_mem_extents(&mem_extents);
return error;
@@ -668,24 +631,14 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
* It walks the radix tree to find the page which contains the bit for
* pfn and returns the bit position in **addr and *bit_nr.
*/
-int memory_bm_find_bit(struct memory_bitmap *bm, int index,
- unsigned long pfn, void **addr, unsigned int *bit_nr)
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+ void **addr, unsigned int *bit_nr)
{
struct mem_zone_bm_rtree *curr, *zone;
struct rtree_node *node;
int i, block_nr;
- if (!bm->cur[index].zone) {
- // Reset
- bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
- list);
- bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
- struct rtree_node, list);
- bm->cur[index].node_pfn = 0;
- bm->cur[index].node_bit = 0;
- }
-
- zone = bm->cur[index].zone;
+ zone = bm->cur.zone;
if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
goto zone_found;
@@ -709,8 +662,8 @@ zone_found:
* node for our pfn.
*/
- node = bm->cur[index].node;
- if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn)
+ node = bm->cur.node;
+ if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
node = zone->rtree;
@@ -727,9 +680,9 @@ zone_found:
node_found:
/* Update last position */
- bm->cur[index].zone = zone;
- bm->cur[index].node = node;
- bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+ bm->cur.zone = zone;
+ bm->cur.node = node;
+ bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
/* Set return values */
*addr = node->data;
@@ -738,66 +691,66 @@ node_found:
return 0;
}
-void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
BUG_ON(error);
set_bit(bit, addr);
}
-int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn)
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
if (!error)
set_bit(bit, addr);
return error;
}
-void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
BUG_ON(error);
clear_bit(bit, addr);
}
-static void memory_bm_clear_current(struct memory_bitmap *bm, int index)
+static void memory_bm_clear_current(struct memory_bitmap *bm)
{
int bit;
- bit = max(bm->cur[index].node_bit - 1, 0);
- clear_bit(bit, bm->cur[index].node->data);
+ bit = max(bm->cur.node_bit - 1, 0);
+ clear_bit(bit, bm->cur.node->data);
}
-int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
BUG_ON(error);
return test_bit(bit, addr);
}
-static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn)
+static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
- return !memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ return !memory_bm_find_bit(bm, pfn, &addr, &bit);
}
/*
@@ -810,25 +763,25 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned
*
* Returns true if there is a next node, false otherwise.
*/
-static bool rtree_next_node(struct memory_bitmap *bm, int index)
+static bool rtree_next_node(struct memory_bitmap *bm)
{
- bm->cur[index].node = list_entry(bm->cur[index].node->list.next,
+ bm->cur.node = list_entry(bm->cur.node->list.next,
struct rtree_node, list);
- if (&bm->cur[index].node->list != &bm->cur[index].zone->leaves) {
- bm->cur[index].node_pfn += BM_BITS_PER_BLOCK;
- bm->cur[index].node_bit = 0;
+ if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ bm->cur.node_pfn += BM_BITS_PER_BLOCK;
+ bm->cur.node_bit = 0;
touch_softlockup_watchdog();
return true;
}
/* No more nodes, goto next zone */
- bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next,
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- if (&bm->cur[index].zone->list != &bm->zones) {
- bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+ if (&bm->cur.zone->list != &bm->zones) {
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
- bm->cur[index].node_pfn = 0;
- bm->cur[index].node_bit = 0;
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
return true;
}
@@ -846,29 +799,38 @@ static bool rtree_next_node(struct memory_bitmap *bm, int index)
* It is required to run memory_bm_position_reset() before the
* first call to this function.
*/
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index)
+static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
unsigned long bits, pfn, pages;
int bit;
- index += NR_CPUS; /* Iteration state is separated from get/set/test */
-
do {
- pages = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn;
- bits = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK);
- bit = find_next_bit(bm->cur[index].node->data, bits,
- bm->cur[index].node_bit);
+ pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
+ bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
+ bit = find_next_bit(bm->cur.node->data, bits,
+ bm->cur.node_bit);
if (bit < bits) {
- pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit;
- bm->cur[index].node_bit = bit + 1;
+ pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
+ bm->cur.node_bit = bit + 1;
return pfn;
}
- } while (rtree_next_node(bm, index));
+ } while (rtree_next_node(bm));
return BM_END_OF_MAP;
}
-LIST_HEAD(nosave_regions);
+/**
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during the suspend.
+ */
+
+struct nosave_region {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+static LIST_HEAD(nosave_regions);
/**
* register_nosave_region - register a range of page frames the contents
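
With the per-CPU slot array gone, all callers share the one cached cur position, and the iteration contract is unchanged: reset the position, then pull PFNs until BM_END_OF_MAP. The canonical loop, as copy_data_pages() and swsusp_free() use below (for_each_set_pfn is an illustrative name, not a real helper):

  static void for_each_set_pfn(struct memory_bitmap *bm)
  {
          unsigned long pfn;

          memory_bm_position_reset(bm);
          for (;;) {
                  pfn = memory_bm_next_pfn(bm);
                  if (unlikely(pfn == BM_END_OF_MAP))
                          break;
                  /* ... operate on pfn ... */
          }
  }
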
@@ -927,37 +889,37 @@ static struct memory_bitmap *free_pages_map;
void swsusp_set_page_free(struct page *page)
{
if (free_pages_map)
- memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page));
+ memory_bm_set_bit(free_pages_map, page_to_pfn(page));
}
static int swsusp_page_is_free(struct page *page)
{
return free_pages_map ?
- memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0;
+ memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
}
void swsusp_unset_page_free(struct page *page)
{
if (free_pages_map)
- memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page));
+ memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
}
static void swsusp_set_page_forbidden(struct page *page)
{
if (forbidden_pages_map)
- memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page));
+ memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
}
int swsusp_page_is_forbidden(struct page *page)
{
return forbidden_pages_map ?
- memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0;
+ memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
}
static void swsusp_unset_page_forbidden(struct page *page)
{
if (forbidden_pages_map)
- memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page));
+ memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
}
/**
@@ -988,7 +950,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
* touch the PFNs for which the error is
* returned anyway.
*/
- mem_bm_set_bit_check(bm, 0, pfn);
+ mem_bm_set_bit_check(bm, pfn);
}
}
}
@@ -1116,7 +1078,7 @@ static unsigned int count_free_highmem_pages(void)
* We should save the page if it isn't Nosave or NosaveFree, or Reserved,
* and it isn't a part of a free chunk of pages.
*/
-struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
+static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -1163,6 +1125,11 @@ static unsigned int count_highmem_pages(void)
}
return n;
}
+#else
+static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
+{
+ return NULL;
+}
#endif /* CONFIG_HIGHMEM */
/**
@@ -1173,7 +1140,7 @@ static unsigned int count_highmem_pages(void)
* of pages statically defined as 'unsaveable', and it isn't a part of
* a free chunk of pages.
*/
-struct page *saveable_page(struct zone *zone, unsigned long pfn)
+static struct page *saveable_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -1311,15 +1278,15 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (page_is_saveable(zone, pfn))
- memory_bm_set_bit(orig_bm, 0, pfn);
+ memory_bm_set_bit(orig_bm, pfn);
}
memory_bm_position_reset(orig_bm);
memory_bm_position_reset(copy_bm);
for(;;) {
- pfn = memory_bm_next_pfn(orig_bm, 0);
+ pfn = memory_bm_next_pfn(orig_bm);
if (unlikely(pfn == BM_END_OF_MAP))
break;
- copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn);
+ copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
}
}
@@ -1365,8 +1332,8 @@ void swsusp_free(void)
memory_bm_position_reset(free_pages_map);
loop:
- fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
- fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
/*
* Find the next bit set in both bitmaps. This is guaranteed to
@@ -1374,16 +1341,16 @@ loop:
*/
do {
if (fb_pfn < fr_pfn)
- fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
if (fr_pfn < fb_pfn)
- fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
} while (fb_pfn != fr_pfn);
if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
struct page *page = pfn_to_page(fr_pfn);
- memory_bm_clear_current(forbidden_pages_map, 0);
- memory_bm_clear_current(free_pages_map, 0);
+ memory_bm_clear_current(forbidden_pages_map);
+ memory_bm_clear_current(free_pages_map);
__free_page(page);
goto loop;
}
@@ -1418,7 +1385,7 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
page = alloc_image_page(mask);
if (!page)
break;
- memory_bm_set_bit(&copy_bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(&copy_bm, page_to_pfn(page));
if (PageHighMem(page))
alloc_highmem++;
else
@@ -1514,7 +1481,7 @@ static unsigned long free_unnecessary_pages(void)
memory_bm_position_reset(&copy_bm);
while (to_free_normal > 0 || to_free_highmem > 0) {
- unsigned long pfn = memory_bm_next_pfn(&copy_bm, 0);
+ unsigned long pfn = memory_bm_next_pfn(&copy_bm);
struct page *page = pfn_to_page(pfn);
if (PageHighMem(page)) {
@@ -1528,7 +1495,7 @@ static unsigned long free_unnecessary_pages(void)
to_free_normal--;
alloc_normal--;
}
- memory_bm_clear_bit(&copy_bm, 0, pfn);
+ memory_bm_clear_bit(&copy_bm, pfn);
swsusp_unset_page_forbidden(page);
swsusp_unset_page_free(page);
__free_page(page);
@@ -1813,7 +1780,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
struct page *page;
page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
- memory_bm_set_bit(bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(bm, page_to_pfn(page));
}
return nr_highmem;
}
@@ -1856,7 +1823,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
if (!page)
goto err_out;
- memory_bm_set_bit(copy_bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(copy_bm, page_to_pfn(page));
}
}
@@ -1871,9 +1838,6 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- if (toi_running)
- return toi_post_context_save();
-
printk(KERN_INFO "PM: Creating hibernation image:\n");
drain_local_pages(NULL);
@@ -1921,7 +1885,7 @@ static int init_header_complete(struct swsusp_info *info)
return 0;
}
-char *check_image_kernel(struct swsusp_info *info)
+static char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
@@ -1942,7 +1906,7 @@ unsigned long snapshot_get_image_size(void)
return nr_copy_pages + nr_meta_pages + 1;
}
-int init_header(struct swsusp_info *info)
+static int init_header(struct swsusp_info *info)
{
memset(info, 0, sizeof(struct swsusp_info));
info->num_physpages = get_num_physpages();
@@ -1964,7 +1928,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
- buf[j] = memory_bm_next_pfn(bm, 0);
+ buf[j] = memory_bm_next_pfn(bm);
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
/* Save page key for data page (s390 only). */
@@ -2015,7 +1979,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
} else {
struct page *page;
- page = pfn_to_page(memory_bm_next_pfn(&copy_bm, 0));
+ page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
if (PageHighMem(page)) {
/* Highmem pages are copied to the buffer,
* because we can't return with a kmapped
@@ -2057,7 +2021,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
/* Mark pages that correspond to the "original" pfns as "unsafe" */
memory_bm_position_reset(bm);
do {
- pfn = memory_bm_next_pfn(bm, 0);
+ pfn = memory_bm_next_pfn(bm);
if (likely(pfn != BM_END_OF_MAP)) {
if (likely(pfn_valid(pfn)))
swsusp_set_page_free(pfn_to_page(pfn));
@@ -2077,10 +2041,10 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
unsigned long pfn;
memory_bm_position_reset(src);
- pfn = memory_bm_next_pfn(src, 0);
+ pfn = memory_bm_next_pfn(src);
while (pfn != BM_END_OF_MAP) {
- memory_bm_set_bit(dst, 0, pfn);
- pfn = memory_bm_next_pfn(src, 0);
+ memory_bm_set_bit(dst, pfn);
+ pfn = memory_bm_next_pfn(src);
}
}
@@ -2131,8 +2095,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
/* Extract and buffer page key for data page (s390 only). */
page_key_memorize(buf + j);
- if (memory_bm_pfn_present(bm, 0, buf[j]))
- memory_bm_set_bit(bm, 0, buf[j]);
+ if (memory_bm_pfn_present(bm, buf[j]))
+ memory_bm_set_bit(bm, buf[j]);
else
return -EFAULT;
}
@@ -2175,12 +2139,12 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
unsigned int cnt = 0;
memory_bm_position_reset(bm);
- pfn = memory_bm_next_pfn(bm, 0);
+ pfn = memory_bm_next_pfn(bm);
while (pfn != BM_END_OF_MAP) {
if (PageHighMem(pfn_to_page(pfn)))
cnt++;
- pfn = memory_bm_next_pfn(bm, 0);
+ pfn = memory_bm_next_pfn(bm);
}
return cnt;
}
@@ -2225,7 +2189,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
page = alloc_page(__GFP_HIGHMEM);
if (!swsusp_page_is_free(page)) {
/* The page is "safe", set its bit the bitmap */
- memory_bm_set_bit(bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(bm, page_to_pfn(page));
safe_highmem_pages++;
}
/* Mark the page as allocated */
@@ -2283,7 +2247,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
/* Copy of the page will be stored in high memory */
kaddr = buffer;
- tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0));
+ tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
safe_highmem_pages--;
last_highmem_page = tmp;
pbe->copy_page = tmp;
@@ -2454,7 +2418,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
{
struct pbe *pbe;
struct page *page;
- unsigned long pfn = memory_bm_next_pfn(bm, 0);
+ unsigned long pfn = memory_bm_next_pfn(bm);
if (pfn == BM_END_OF_MAP)
return ERR_PTR(-EFAULT);
@@ -2641,82 +2605,3 @@ int restore_highmem(void)
return 0;
}
#endif /* CONFIG_HIGHMEM */
-
-struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map,
- *pageset1_copy_map, *io_map, *page_resave_map, *compare_map;
-
-int resume_attempted;
-
-int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
-{
- int result;
-
- memory_bm_position_reset(bm);
-
- do {
- result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
-
- if (result)
- return result;
- } while (rtree_next_node(bm, 0));
- return 0;
-}
-
-int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
-{
- int result;
-
- memory_bm_position_reset(bm);
-
- do {
- result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
-
- if (result)
- return result;
-
- } while (rtree_next_node(bm, 0));
- return 0;
-}
-
-int memory_bm_space_needed(struct memory_bitmap *bm)
-{
- unsigned long bytes = 0;
-
- memory_bm_position_reset(bm);
- do {
- bytes += PAGE_SIZE;
- } while (rtree_next_node(bm, 0));
- return bytes;
-}
-
-int toi_alloc_bitmap(struct memory_bitmap **bm)
-{
- int error;
- struct memory_bitmap *bm1;
-
- bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
- if (!bm1)
- return -ENOMEM;
-
- error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
- if (error) {
- printk("Error returned - %d.\n", error);
- kfree(bm1);
- return -ENOMEM;
- }
-
- *bm = bm1;
- return 0;
-}
-
-void toi_free_bitmap(struct memory_bitmap **bm)
-{
- if (!*bm)
- return;
-
- memory_bm_free(*bm, 0);
- kfree(*bm);
- *bm = NULL;
-}
diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
deleted file mode 100644
index 10b65633f..000000000
--- a/kernel/power/tuxonice.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * kernel/power/tuxonice.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations used throughout swsusp.
- *
- */
-
-#ifndef KERNEL_POWER_TOI_H
-#define KERNEL_POWER_TOI_H
-
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/suspend.h>
-#include <linux/fs.h>
-#include <asm/setup.h>
-#include "tuxonice_pageflags.h"
-#include "power.h"
-
-#define TOI_CORE_VERSION "3.3"
-#define TOI_HEADER_VERSION 3
-#define MY_BOOT_KERNEL_DATA_VERSION 4
-
-struct toi_boot_kernel_data {
- int version;
- int size;
- unsigned long toi_action;
- unsigned long toi_debug_state;
- u32 toi_default_console_level;
- int toi_io_time[2][2];
- char toi_nosave_commandline[COMMAND_LINE_SIZE];
- unsigned long pages_used[33];
- unsigned long incremental_bytes_in;
- unsigned long incremental_bytes_out;
- unsigned long compress_bytes_in;
- unsigned long compress_bytes_out;
- unsigned long pruned_pages;
-};
-
-extern struct toi_boot_kernel_data toi_bkd;
-
-/* Location of boot kernel data struct in kernel being resumed */
-extern unsigned long boot_kernel_data_buffer;
-
-/* == Action states == */
-
-enum {
- TOI_REBOOT,
- TOI_PAUSE,
- TOI_LOGALL,
- TOI_CAN_CANCEL,
- TOI_KEEP_IMAGE,
- TOI_FREEZER_TEST,
- TOI_SINGLESTEP,
- TOI_PAUSE_NEAR_PAGESET_END,
- TOI_TEST_FILTER_SPEED,
- TOI_TEST_BIO,
- TOI_NO_PAGESET2,
- TOI_IGNORE_ROOTFS,
- TOI_REPLACE_SWSUSP,
- TOI_PAGESET2_FULL,
- TOI_ABORT_ON_RESAVE_NEEDED,
- TOI_NO_MULTITHREADED_IO,
- TOI_NO_DIRECT_LOAD, /* Obsolete */
- TOI_LATE_CPU_HOTPLUG, /* Obsolete */
- TOI_GET_MAX_MEM_ALLOCD,
- TOI_NO_FLUSHER_THREAD,
- TOI_NO_PS2_IF_UNNEEDED,
- TOI_POST_RESUME_BREAKPOINT,
- TOI_NO_READAHEAD,
- TOI_TRACE_DEBUG_ON,
- TOI_INCREMENTAL_IMAGE,
-};
-
-extern unsigned long toi_bootflags_mask;
-
-#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action))
-
-/* == Result states == */
-
-enum {
- TOI_ABORTED,
- TOI_ABORT_REQUESTED,
- TOI_NOSTORAGE_AVAILABLE,
- TOI_INSUFFICIENT_STORAGE,
- TOI_FREEZING_FAILED,
- TOI_KEPT_IMAGE,
- TOI_WOULD_EAT_MEMORY,
- TOI_UNABLE_TO_FREE_ENOUGH_MEMORY,
- TOI_PM_SEM,
- TOI_DEVICE_REFUSED,
- TOI_SYSDEV_REFUSED,
- TOI_EXTRA_PAGES_ALLOW_TOO_SMALL,
- TOI_UNABLE_TO_PREPARE_IMAGE,
- TOI_FAILED_MODULE_INIT,
- TOI_FAILED_MODULE_CLEANUP,
- TOI_FAILED_IO,
- TOI_OUT_OF_MEMORY,
- TOI_IMAGE_ERROR,
- TOI_PLATFORM_PREP_FAILED,
- TOI_CPU_HOTPLUG_FAILED,
- TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */
- TOI_RESAVE_NEEDED,
- TOI_CANT_SUSPEND,
- TOI_NOTIFIERS_PREPARE_FAILED,
- TOI_PRE_SNAPSHOT_FAILED,
- TOI_PRE_RESTORE_FAILED,
- TOI_USERMODE_HELPERS_ERR,
- TOI_CANT_USE_ALT_RESUME,
- TOI_HEADER_TOO_BIG,
- TOI_WAKEUP_EVENT,
- TOI_SYSCORE_REFUSED,
- TOI_DPM_PREPARE_FAILED,
- TOI_DPM_SUSPEND_FAILED,
- TOI_NUM_RESULT_STATES /* Used in printing debug info only */
-};
-
-extern unsigned long toi_result;
-
-#define set_result_state(bit) (test_and_set_bit(bit, &toi_result))
-#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \
- test_and_set_bit(bit, &toi_result))
-#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result))
-#define test_result_state(bit) (test_bit(bit, &toi_result))
-
-/* == Debug sections and levels == */
-
-/* debugging levels. */
-enum {
- TOI_STATUS = 0,
- TOI_ERROR = 2,
- TOI_LOW,
- TOI_MEDIUM,
- TOI_HIGH,
- TOI_VERBOSE,
-};
-
-enum {
- TOI_ANY_SECTION,
- TOI_EAT_MEMORY,
- TOI_IO,
- TOI_HEADER,
- TOI_WRITER,
- TOI_MEMORY,
- TOI_PAGEDIR,
- TOI_COMPRESS,
- TOI_BIO,
-};
-
-#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state))
-#define clear_debug_state(bit) \
- (test_and_clear_bit(bit, &toi_bkd.toi_debug_state))
-#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state))
-
-/* == Steps in hibernating == */
-
-enum {
- STEP_HIBERNATE_PREPARE_IMAGE,
- STEP_HIBERNATE_SAVE_IMAGE,
- STEP_HIBERNATE_POWERDOWN,
- STEP_RESUME_CAN_RESUME,
- STEP_RESUME_LOAD_PS1,
- STEP_RESUME_DO_RESTORE,
- STEP_RESUME_READ_PS2,
- STEP_RESUME_GO,
- STEP_RESUME_ALT_IMAGE,
- STEP_CLEANUP,
- STEP_QUIET_CLEANUP
-};
-
-/* == TuxOnIce states ==
- (see also include/linux/suspend.h) */
-
-#define get_toi_state() (toi_state)
-#define restore_toi_state(saved_state) \
- do { toi_state = saved_state; } while (0)
-
-/* == Module support == */
-
-struct toi_core_fns {
- int (*post_context_save)(void);
- unsigned long (*get_nonconflicting_page)(void);
- int (*try_hibernate)(void);
- void (*try_resume)(void);
-};
-
-extern struct toi_core_fns *toi_core_fns;
-
-/* == All else == */
-#define KB(x) ((x) << (PAGE_SHIFT - 10))
-#define MB(x) ((x) >> (20 - PAGE_SHIFT))
-
-extern int toi_start_anything(int toi_or_resume);
-extern void toi_finish_anything(int toi_or_resume);
-
-extern int save_image_part1(void);
-extern int toi_atomic_restore(void);
-
-extern int toi_try_hibernate(void);
-extern void toi_try_resume(void);
-
-extern int __toi_post_context_save(void);
-
-extern unsigned int nr_hibernates;
-extern char alt_resume_param[256];
-
-extern void copyback_post(void);
-extern int toi_hibernate(void);
-extern unsigned long extra_pd1_pages_used;
-
-#define SECTOR_SIZE 512
-
-extern void toi_early_boot_message(int can_erase_image, int default_answer,
- char *warning_reason, ...);
-
-extern int do_check_can_resume(void);
-extern int do_toi_step(int step);
-extern int toi_launch_userspace_program(char *command, int channel_no,
- int wait, int debug);
-
-extern char tuxonice_signature[9];
-
-extern int toi_start_other_threads(void);
-extern void toi_stop_other_threads(void);
-
-extern int toi_trace_index;
-#define TOI_TRACE_DEBUG(PFN, DESC, ...) \
- do { \
- if (test_action_state(TOI_TRACE_DEBUG_ON)) { \
- printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \
- } \
- } while(0)
-
-#ifdef CONFIG_TOI_KEEP_IMAGE
-#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE))
-#else
-#define toi_keeping_image (0)
-#endif
-
-#ifdef CONFIG_TOI_INCREMENTAL
-extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose);
-extern int toi_reset_dirtiness(int verbose);
-extern void toi_cbw_write(void);
-extern void toi_cbw_restore(void);
-extern int toi_allocate_cbw_data(void);
-extern void toi_free_cbw_data(void);
-extern int toi_cbw_init(void);
-extern void toi_mark_tasks_cbw(void);
-#else
-static inline int toi_reset_dirtiness(int verbose) { return 0; }
-#define toi_cbw_write() do { } while(0)
-#define toi_cbw_restore() do { } while(0)
-#define toi_allocate_cbw_data() do { } while(0)
-#define toi_free_cbw_data() do { } while(0)
-static inline int toi_cbw_init(void) { return 0; }
-#endif
-#endif
diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c
deleted file mode 100644
index 1d8b1cbda..000000000
--- a/kernel/power/tuxonice_alloc.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * kernel/power/tuxonice_alloc.c
- *
- * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <linux/export.h>
-#include <linux/slab.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice.h"
-
-#define TOI_ALLOC_PATHS 41
-
-static DEFINE_MUTEX(toi_alloc_mutex);
-
-static struct toi_module_ops toi_alloc_ops;
-
-static int toi_fail_num;
-
-static atomic_t toi_alloc_count[TOI_ALLOC_PATHS],
- toi_free_count[TOI_ALLOC_PATHS],
- toi_test_count[TOI_ALLOC_PATHS],
- toi_fail_count[TOI_ALLOC_PATHS];
-static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS];
-static int cur_allocd, max_allocd;
-
-static char *toi_alloc_desc[TOI_ALLOC_PATHS] = {
- "", /* 0 */
- "get_io_info_struct",
- "extent",
- "extent (loading chain)",
- "userui channel",
- "userui arg", /* 5 */
- "attention list metadata",
- "extra pagedir memory metadata",
- "bdev metadata",
- "extra pagedir memory",
- "header_locations_read", /* 10 */
- "bio queue",
- "prepare_readahead",
- "i/o buffer",
- "writer buffer in bio_init",
- "checksum buffer", /* 15 */
- "compression buffer",
- "filewriter signature op",
- "set resume param alloc1",
- "set resume param alloc2",
- "debugging info buffer", /* 20 */
- "check can resume buffer",
- "write module config buffer",
- "read module config buffer",
- "write image header buffer",
- "read pageset1 buffer", /* 25 */
- "get_have_image_data buffer",
- "checksum page",
- "worker rw loop",
- "get nonconflicting page",
- "ps1 load addresses", /* 30 */
- "remove swap image",
- "swap image exists",
- "swap parse sig location",
- "sysfs kobj",
- "swap mark resume attempted buffer", /* 35 */
- "cluster member",
- "boot kernel data buffer",
- "setting swap signature",
- "block i/o bdev struct",
- "copy before write", /* 40 */
-};
-
-#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \
- do { \
- BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \
- \
- if (FAIL_NUM == toi_fail_num) { \
- atomic_inc(&toi_test_count[FAIL_NUM]); \
- toi_fail_num = 0; \
- return FAIL_VAL; \
- } \
- } while (0)
-
-static void alloc_update_stats(int fail_num, void *result, int size)
-{
- if (!result) {
- atomic_inc(&toi_fail_count[fail_num]);
- return;
- }
-
- atomic_inc(&toi_alloc_count[fail_num]);
- if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
- mutex_lock(&toi_alloc_mutex);
- toi_cur_allocd[fail_num]++;
- cur_allocd += size;
- if (unlikely(cur_allocd > max_allocd)) {
- int i;
-
- for (i = 0; i < TOI_ALLOC_PATHS; i++)
- toi_max_allocd[i] = toi_cur_allocd[i];
- max_allocd = cur_allocd;
- }
- mutex_unlock(&toi_alloc_mutex);
- }
-}
-
-static void free_update_stats(int fail_num, int size)
-{
- BUG_ON(fail_num >= TOI_ALLOC_PATHS);
- atomic_inc(&toi_free_count[fail_num]);
- if (unlikely(atomic_read(&toi_free_count[fail_num]) >
- atomic_read(&toi_alloc_count[fail_num])))
- dump_stack();
- if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
- mutex_lock(&toi_alloc_mutex);
- cur_allocd -= size;
- toi_cur_allocd[fail_num]--;
- mutex_unlock(&toi_alloc_mutex);
- }
-}
-
-void *toi_kzalloc(int fail_num, size_t size, gfp_t flags)
-{
- void *result;
-
- if (toi_alloc_ops.enabled)
- MIGHT_FAIL(fail_num, NULL);
- result = kzalloc(size, flags);
- if (toi_alloc_ops.enabled)
- alloc_update_stats(fail_num, result, size);
- if (fail_num == toi_trace_allocs)
- dump_stack();
- return result;
-}
-
-unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
- unsigned int order)
-{
- unsigned long result;
-
- mask |= ___GFP_TOI_NOTRACK;
- if (toi_alloc_ops.enabled)
- MIGHT_FAIL(fail_num, 0);
- result = __get_free_pages(mask, order);
- if (toi_alloc_ops.enabled)
- alloc_update_stats(fail_num, (void *) result,
- PAGE_SIZE << order);
- if (fail_num == toi_trace_allocs)
- dump_stack();
- return result;
-}
-
-struct page *toi_alloc_page(int fail_num, gfp_t mask)
-{
- struct page *result;
-
- if (toi_alloc_ops.enabled)
- MIGHT_FAIL(fail_num, NULL);
- mask |= ___GFP_TOI_NOTRACK;
- result = alloc_page(mask);
- if (toi_alloc_ops.enabled)
- alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
- if (fail_num == toi_trace_allocs)
- dump_stack();
- return result;
-}
-
-unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask)
-{
- unsigned long result;
-
- if (toi_alloc_ops.enabled)
- MIGHT_FAIL(fail_num, 0);
- mask |= ___GFP_TOI_NOTRACK;
- result = get_zeroed_page(mask);
- if (toi_alloc_ops.enabled)
- alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
- if (fail_num == toi_trace_allocs)
- dump_stack();
- return result;
-}
-
-void toi_kfree(int fail_num, const void *arg, int size)
-{
- if (arg && toi_alloc_ops.enabled)
- free_update_stats(fail_num, size);
-
- if (fail_num == toi_trace_allocs)
- dump_stack();
- kfree(arg);
-}
-
-void toi_free_page(int fail_num, unsigned long virt)
-{
- if (virt && toi_alloc_ops.enabled)
- free_update_stats(fail_num, PAGE_SIZE);
-
- if (fail_num == toi_trace_allocs)
- dump_stack();
- free_page(virt);
-}
-
-void toi__free_page(int fail_num, struct page *page)
-{
- if (page && toi_alloc_ops.enabled)
- free_update_stats(fail_num, PAGE_SIZE);
-
- if (fail_num == toi_trace_allocs)
- dump_stack();
- __free_page(page);
-}
-
-void toi_free_pages(int fail_num, struct page *page, int order)
-{
- if (page && toi_alloc_ops.enabled)
- free_update_stats(fail_num, PAGE_SIZE << order);
-
- if (fail_num == toi_trace_allocs)
- dump_stack();
- __free_pages(page, order);
-}
-
-void toi_alloc_print_debug_stats(void)
-{
- int i, header_done = 0;
-
- if (!toi_alloc_ops.enabled)
- return;
-
- for (i = 0; i < TOI_ALLOC_PATHS; i++)
- if (atomic_read(&toi_alloc_count[i]) !=
- atomic_read(&toi_free_count[i])) {
- if (!header_done) {
- printk(KERN_INFO "Idx Allocs Frees Tests "
- " Fails Max Description\n");
- header_done = 1;
- }
-
- printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i,
- atomic_read(&toi_alloc_count[i]),
- atomic_read(&toi_free_count[i]),
- atomic_read(&toi_test_count[i]),
- atomic_read(&toi_fail_count[i]),
- toi_max_allocd[i],
- toi_alloc_desc[i]);
- }
-}
-
-static int toi_alloc_initialise(int starting_cycle)
-{
- int i;
-
- if (!starting_cycle)
- return 0;
-
- if (toi_trace_allocs)
- dump_stack();
-
- for (i = 0; i < TOI_ALLOC_PATHS; i++) {
- atomic_set(&toi_alloc_count[i], 0);
- atomic_set(&toi_free_count[i], 0);
- atomic_set(&toi_test_count[i], 0);
- atomic_set(&toi_fail_count[i], 0);
- toi_cur_allocd[i] = 0;
- toi_max_allocd[i] = 0;
-	}
-
- max_allocd = 0;
- cur_allocd = 0;
- return 0;
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL),
- SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0,
- NULL),
- SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action,
- TOI_GET_MAX_MEM_ALLOCD, 0),
- SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0,
- NULL)
-};
-
-static struct toi_module_ops toi_alloc_ops = {
- .type = MISC_HIDDEN_MODULE,
- .name = "allocation debugging",
- .directory = "alloc",
- .module = THIS_MODULE,
- .early = 1,
- .initialise = toi_alloc_initialise,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-int toi_alloc_init(void)
-{
- int result = toi_register_module(&toi_alloc_ops);
- return result;
-}
-
-void toi_alloc_exit(void)
-{
- toi_unregister_module(&toi_alloc_ops);
-}
diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h
deleted file mode 100644
index 0cd6b686f..000000000
--- a/kernel/power/tuxonice_alloc.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * kernel/power/tuxonice_alloc.h
- *
- * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <linux/slab.h>
-#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN)
-#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN)
-
-#ifdef CONFIG_PM_DEBUG
-extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags);
-extern void toi_kfree(int fail_num, const void *arg, int size);
-
-extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
- unsigned int order);
-#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0)
-extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask);
-extern void toi_free_page(int fail_num, unsigned long buf);
-extern void toi__free_page(int fail_num, struct page *page);
-extern void toi_free_pages(int fail_num, struct page *page, int order);
-extern struct page *toi_alloc_page(int fail_num, gfp_t mask);
-extern int toi_alloc_init(void);
-extern void toi_alloc_exit(void);
-
-extern void toi_alloc_print_debug_stats(void);
-
-#else /* CONFIG_PM_DEBUG */
-
-#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS))
-#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN))
-
-#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER)
-#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS)
-#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS)
-#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0)
-#define toi__free_page(FAIL, PAGE) __free_page(PAGE)
-#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER)
-#define toi_alloc_page(FAIL, MASK) alloc_page(MASK)
-static inline int toi_alloc_init(void)
-{
- return 0;
-}
-
-static inline void toi_alloc_exit(void) { }
-
-static inline void toi_alloc_print_debug_stats(void) { }
-
-#endif
-
-extern int toi_trace_allocs;
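
For orientation, a minimal usage sketch of the API above (the call-site
index 42 is illustrative; pairing the same fail_num and size on
allocation and free is what the accounting relies on):

	char *buf = toi_kzalloc(42, PAGE_SIZE, TOI_WAIT_GFP);
	if (buf) {
		/* ... use buf ... */
		toi_kfree(42, buf, PAGE_SIZE);
	}

With CONFIG_PM_DEBUG the wrappers feed the per-site counters; without it
they compile down to plain kzalloc()/kfree().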
diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c
deleted file mode 100644
index 5845217f8..000000000
--- a/kernel/power/tuxonice_atomic_copy.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * kernel/power/tuxonice_atomic_copy.c
- *
- * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * Routines for doing the atomic save/restore.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/cpu.h>
-#include <linux/freezer.h>
-#include <linux/console.h>
-#include <linux/syscore_ops.h>
-#include <linux/ftrace.h>
-#include <asm/suspend.h>
-#include "tuxonice.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_io.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_pageflags.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_atomic_copy.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_modules.h"
-
-unsigned long extra_pd1_pages_used;
-
-/**
- * free_pbe_list - free page backup entries used by the atomic copy code.
- * @list: List to free.
- * @highmem: Whether the list is in highmem.
- *
- * Normally, this function isn't used. If, however, we need to abort before
- * doing the atomic copy, we use this to free the pbes previously allocated.
- **/
-static void free_pbe_list(struct pbe **list, int highmem)
-{
- while (*list) {
- int i;
- struct pbe *free_pbe, *next_page = NULL;
- struct page *page;
-
- if (highmem) {
- page = (struct page *) *list;
- free_pbe = (struct pbe *) kmap(page);
- } else {
- page = virt_to_page(*list);
- free_pbe = *list;
- }
-
- for (i = 0; i < PBES_PER_PAGE; i++) {
- if (!free_pbe)
- break;
- if (highmem)
- toi__free_page(29, free_pbe->address);
- else
- toi_free_page(29,
- (unsigned long) free_pbe->address);
- free_pbe = free_pbe->next;
- }
-
- if (highmem) {
- if (free_pbe)
- next_page = free_pbe;
- kunmap(page);
- } else {
- if (free_pbe)
- next_page = free_pbe;
- }
-
- toi__free_page(29, page);
- *list = (struct pbe *) next_page;
-	}
-}
-
-/**
- * copyback_post - post atomic-restore actions
- *
- * After doing the atomic restore, we have a few more things to do:
- * 1) We want to retain some values across the restore, so we now copy
- * these from the nosave variables to the normal ones.
- * 2) Set the status flags.
- * 3) Resume devices.
- * 4) Tell userui so it can redraw & restore settings.
- * 5) Reread the page cache.
- **/
-void copyback_post(void)
-{
- struct toi_boot_kernel_data *bkd =
- (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
-
- if (toi_activate_storage(1))
- panic("Failed to reactivate our storage.");
-
- toi_post_atomic_restore_modules(bkd);
-
- toi_cond_pause(1, "About to reload secondary pagedir.");
-
- if (read_pageset2(0))
- panic("Unable to successfully reread the page cache.");
-
- /*
- * If the user wants to sleep again after resuming from full-off,
- * it's most likely to be in order to suspend to ram, so we'll
- * do this check after loading pageset2, to give them the fastest
- * wakeup when they are ready to use the computer again.
- */
- toi_check_resleep();
-
- if (test_action_state(TOI_INCREMENTAL_IMAGE))
- toi_reset_dirtiness(1);
-}
-
-/**
- * toi_copy_pageset1 - do the atomic copy of pageset1
- *
- * Make the atomic copy of pageset1. We can't use copy_page (as we once did)
- * because we can't be sure what side effects it has. On my old Duron, with
- * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt
- * count at resume time 4 instead of 3.
- *
- * We don't want to call kmap_atomic unconditionally because it has the side
- * effect of incrementing the preempt count, which will leave it one too high
- * post resume (the page containing the preempt count will be copied after
- * it's incremented). This is essentially the same problem.
- **/
-void toi_copy_pageset1(void)
-{
- int i;
- unsigned long source_index, dest_index;
-
- memory_bm_position_reset(pageset1_map);
- memory_bm_position_reset(pageset1_copy_map);
-
- source_index = memory_bm_next_pfn(pageset1_map, 0);
- dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
-
- for (i = 0; i < pagedir1.size; i++) {
- unsigned long *origvirt, *copyvirt;
- struct page *origpage, *copypage;
- int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1,
- was_present1, was_present2;
-
- origpage = pfn_to_page(source_index);
- copypage = pfn_to_page(dest_index);
-
- origvirt = PageHighMem(origpage) ?
- kmap_atomic(origpage) :
- page_address(origpage);
-
- copyvirt = PageHighMem(copypage) ?
- kmap_atomic(copypage) :
- page_address(copypage);
-
- was_present1 = kernel_page_present(origpage);
- if (!was_present1)
- kernel_map_pages(origpage, 1, 1);
-
- was_present2 = kernel_page_present(copypage);
- if (!was_present2)
- kernel_map_pages(copypage, 1, 1);
-
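-		/*
-		 * Copy word by word: copy_page()/memcpy() may take FPU or
-		 * SIMD paths whose side effects (see the preempt count notes
-		 * above) would survive into the image.
-		 */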
- while (loop >= 0) {
- *(copyvirt + loop) = *(origvirt + loop);
- loop--;
- }
-
- if (!was_present1)
- kernel_map_pages(origpage, 1, 0);
-
- if (!was_present2)
- kernel_map_pages(copypage, 1, 0);
-
- if (PageHighMem(origpage))
- kunmap_atomic(origvirt);
-
- if (PageHighMem(copypage))
- kunmap_atomic(copyvirt);
-
- source_index = memory_bm_next_pfn(pageset1_map, 0);
- dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
- }
-}
-
-/**
- * __toi_post_context_save - steps after saving the cpu context
- *
- * Steps taken after saving the CPU state to make the actual
- * atomic copy.
- *
- * Called from swsusp_save in snapshot.c via toi_post_context_save.
- **/
-int __toi_post_context_save(void)
-{
- unsigned long old_ps1_size = pagedir1.size;
-
- check_checksums();
-
- free_checksum_pages();
-
- toi_recalculate_image_contents(1);
-
- extra_pd1_pages_used = pagedir1.size > old_ps1_size ?
- pagedir1.size - old_ps1_size : 0;
-
- if (extra_pd1_pages_used > extra_pd1_pages_allowance) {
- printk(KERN_INFO "Pageset1 has grown by %lu pages. "
- "extra_pages_allowance is currently only %lu.\n",
- pagedir1.size - old_ps1_size,
- extra_pd1_pages_allowance);
-
- /*
- * Highlevel code will see this, clear the state and
- * retry if we haven't already done so twice.
- */
- if (any_to_free(1)) {
- set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
- return 1;
- }
- if (try_allocate_extra_memory()) {
-			printk(KERN_INFO "Failed to allocate the extra memory"
-					" needed. Restarting the process.\n");
-			set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
-			return 1;
-		}
-		printk(KERN_INFO "However, it looks like there's enough"
-			" free ram and storage to handle this, so"
-			" continuing anyway.\n");
- /*
- * What if try_allocate_extra_memory above calls
- * toi_allocate_extra_pagedir_memory and it allocs a new
- * slab page via toi_kzalloc which should be in ps1? So...
- */
- toi_recalculate_image_contents(1);
- }
-
- if (!test_action_state(TOI_TEST_FILTER_SPEED) &&
- !test_action_state(TOI_TEST_BIO))
- toi_copy_pageset1();
-
- return 0;
-}
-
-/**
- * toi_hibernate - high level code for doing the atomic copy
- *
- * High-level code which prepares to do the atomic copy. Loosely based
- * on the swsusp version, but with the following twists:
- * - We set toi_running so the swsusp code uses our code paths.
- * - We give better feedback regarding what goes wrong if there is a
- * problem.
- * - We use an extra function to call the assembly, just in case this code
- * is in a module (return address).
- **/
-int toi_hibernate(void)
-{
- int error;
-
- error = toi_lowlevel_builtin();
-
- if (!error) {
- struct toi_boot_kernel_data *bkd =
- (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
-
- /*
- * The boot kernel's data may be larger (newer version) or
- * smaller (older version) than ours. Copy the minimum
- * of the two sizes, so that we don't overwrite valid values
- * from pre-atomic copy.
- */
-
- memcpy(&toi_bkd, (char *) boot_kernel_data_buffer,
- min_t(int, sizeof(struct toi_boot_kernel_data),
- bkd->size));
- }
-
- return error;
-}
-
-/**
- * toi_atomic_restore - prepare to do the atomic restore
- *
- * Get ready to do the atomic restore. This part gets us into the same
- * state we were in prior to calling do_toi_lowlevel while hibernating:
- * secondary cpus hot-unplugged and processes frozen, before starting
- * the thread that will do the restore.
- **/
-int toi_atomic_restore(void)
-{
- int error;
-
- toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore.");
-
- memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line,
- strlen(saved_command_line));
-
- toi_pre_atomic_restore_modules(&toi_bkd);
-
- if (add_boot_kernel_data_pbe())
- goto Failed;
-
- toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
-
- if (toi_go_atomic(PMSG_QUIESCE, 0))
- goto Failed;
-
- /* We'll ignore saved state, but this gets preempt count (etc) right */
- save_processor_state();
-
- error = swsusp_arch_resume();
- /*
- * Code below is only ever reached in case of failure. Otherwise
- * execution continues at place where swsusp_arch_suspend was called.
- *
- * We don't know whether it's safe to continue (this shouldn't happen),
-	 * so let's err on the side of caution.
- */
- BUG();
-
-Failed:
- free_pbe_list(&restore_pblist, 0);
-#ifdef CONFIG_HIGHMEM
- free_pbe_list(&restore_highmem_pblist, 1);
-#endif
- return 1;
-}
-
-/**
- * toi_go_atomic - do the actual atomic copy/restore
- * @state: The state to use for dpm_suspend_start & power_down calls.
- * @suspend_time: Whether we're suspending or resuming.
- **/
-int toi_go_atomic(pm_message_t state, int suspend_time)
-{
- if (suspend_time) {
- if (platform_begin(1)) {
- set_abort_result(TOI_PLATFORM_PREP_FAILED);
- toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
- return 1;
- }
-
- if (dpm_prepare(PMSG_FREEZE)) {
- set_abort_result(TOI_DPM_PREPARE_FAILED);
- dpm_complete(PMSG_RECOVER);
- toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
- return 1;
- }
- }
-
- suspend_console();
- pm_restrict_gfp_mask();
-
- if (suspend_time) {
- if (dpm_suspend(state)) {
- set_abort_result(TOI_DPM_SUSPEND_FAILED);
- toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
- return 1;
- }
- } else {
- if (dpm_suspend_start(state)) {
- set_abort_result(TOI_DPM_SUSPEND_FAILED);
- toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
- return 1;
- }
- }
-
- /* At this point, dpm_suspend_start() has been called, but *not*
- * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now.
- * Otherwise, drivers for some devices (e.g. interrupt controllers)
- * become desynchronized with the actual state of the hardware
- * at resume time, and evil weirdness ensues.
- */
-
- if (dpm_suspend_end(state)) {
- set_abort_result(TOI_DEVICE_REFUSED);
- toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1);
- return 1;
- }
-
- if (suspend_time) {
- if (platform_pre_snapshot(1))
- set_abort_result(TOI_PRE_SNAPSHOT_FAILED);
- } else {
- if (platform_pre_restore(1))
- set_abort_result(TOI_PRE_RESTORE_FAILED);
- }
-
- if (test_result_state(TOI_ABORTED)) {
- toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1);
- return 1;
- }
-
- if (disable_nonboot_cpus()) {
- set_abort_result(TOI_CPU_HOTPLUG_FAILED);
- toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG,
- suspend_time, 1);
- return 1;
- }
-
- local_irq_disable();
-
- if (syscore_suspend()) {
- set_abort_result(TOI_SYSCORE_REFUSED);
- toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1);
- return 1;
- }
-
- if (suspend_time && pm_wakeup_pending()) {
- set_abort_result(TOI_WAKEUP_EVENT);
- toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1);
- return 1;
- }
- return 0;
-}
-
-/**
- * toi_end_atomic - post atomic copy/restore routines
- * @stage: What step to start at.
- * @suspend_time: Whether we're suspending or resuming.
- * @error: Whether we're recovering from an error.
- **/
-void toi_end_atomic(int stage, int suspend_time, int error)
-{
- pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) :
- PMSG_RESTORE;
-
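-	/*
-	 * Deliberate fall-through: entering at any stage runs that step
-	 * and all the later ones.
-	 */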
- switch (stage) {
- case ATOMIC_ALL_STEPS:
- if (!suspend_time) {
- events_check_enabled = false;
- }
- platform_leave(1);
- case ATOMIC_STEP_SYSCORE_RESUME:
- syscore_resume();
- case ATOMIC_STEP_IRQS:
- local_irq_enable();
- case ATOMIC_STEP_CPU_HOTPLUG:
- enable_nonboot_cpus();
- case ATOMIC_STEP_PLATFORM_FINISH:
- if (!suspend_time && error & 2)
- platform_restore_cleanup(1);
- else
- platform_finish(1);
- dpm_resume_start(msg);
- case ATOMIC_STEP_DEVICE_RESUME:
- if (suspend_time && (error & 2))
- platform_recover(1);
- dpm_resume(msg);
- if (!toi_in_suspend()) {
- dpm_resume_end(PMSG_RECOVER);
- }
- if (error || !toi_in_suspend()) {
- pm_restore_gfp_mask();
- }
- resume_console();
- case ATOMIC_STEP_DPM_COMPLETE:
- dpm_complete(msg);
- case ATOMIC_STEP_PLATFORM_END:
- platform_end(1);
-
- toi_prepare_status(DONT_CLEAR_BAR, "Post atomic.");
- }
-}
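
A hedged sketch of the calling convention implied above (illustrative
only; the real callers live elsewhere in the TuxOnIce core, and
do_the_atomic_copy here is a made-up stand-in):

	if (toi_go_atomic(PMSG_FREEZE, 1))
		return 1;	/* toi_go_atomic() already unwound. */

	error = do_the_atomic_copy();

	/* Success or failure, unwind every step from the top. */
	toi_end_atomic(ATOMIC_ALL_STEPS, 1, error);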
diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h
deleted file mode 100644
index e2d2b4fb3..000000000
--- a/kernel/power/tuxonice_atomic_copy.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * kernel/power/tuxonice_atomic_copy.h
- *
- * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * Routines for doing the atomic save/restore.
- */
-
-enum {
- ATOMIC_ALL_STEPS,
- ATOMIC_STEP_SYSCORE_RESUME,
- ATOMIC_STEP_IRQS,
- ATOMIC_STEP_CPU_HOTPLUG,
- ATOMIC_STEP_PLATFORM_FINISH,
- ATOMIC_STEP_DEVICE_RESUME,
- ATOMIC_STEP_DPM_COMPLETE,
- ATOMIC_STEP_PLATFORM_END,
-};
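-
-/*
- * toi_end_atomic() falls through from the stage it is handed, so the
- * order of these values defines the tear-down sequence.
- */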
-
-int toi_go_atomic(pm_message_t state, int toi_time);
-void toi_end_atomic(int stage, int toi_time, int error);
-
-extern void platform_recover(int platform_mode);
diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h
deleted file mode 100644
index 9d52a3b69..000000000
--- a/kernel/power/tuxonice_bio.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * kernel/power/tuxonice_bio.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file contains declarations for functions exported from
- * tuxonice_bio.c, which contains low level io functions.
- */
-
-#include <linux/buffer_head.h>
-#include "tuxonice_extent.h"
-
-void toi_put_extent_chain(struct hibernate_extent_chain *chain);
-int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
- unsigned long start, unsigned long end);
-
-struct hibernate_extent_saved_state {
- int extent_num;
- struct hibernate_extent *extent_ptr;
- unsigned long offset;
-};
-
-struct toi_bdev_info {
- struct toi_bdev_info *next;
- struct hibernate_extent_chain blocks;
- struct block_device *bdev;
- struct toi_module_ops *allocator;
- int allocator_index;
- struct hibernate_extent_chain allocations;
- char name[266]; /* "swap on " or "file " + up to 256 chars */
-
- /* Saved in header */
- char uuid[17];
- dev_t dev_t;
- int prio;
- int bmap_shift;
- int blocks_per_page;
- unsigned long pages_used;
- struct hibernate_extent_saved_state saved_state[4];
-};
-
-struct toi_extent_iterate_state {
- struct toi_bdev_info *current_chain;
- int num_chains;
- int saved_chain_number[4];
- struct toi_bdev_info *saved_chain_ptr[4];
-};
-
-/*
- * Our exported interface so the swapwriter and filewriter don't
- * need these functions duplicated.
- */
-struct toi_bio_ops {
- int (*bdev_page_io) (int rw, struct block_device *bdev, long pos,
- struct page *page);
- int (*register_storage)(struct toi_bdev_info *new);
- void (*free_storage)(void);
-};
-
-struct toi_allocator_ops {
- unsigned long (*toi_swap_storage_available) (void);
-};
-
-extern struct toi_bio_ops toi_bio_ops;
-
-extern char *toi_writer_buffer;
-extern int toi_writer_buffer_posn;
-
-struct toi_bio_allocator_ops {
- int (*register_storage) (void);
- unsigned long (*storage_available)(void);
- int (*allocate_storage) (struct toi_bdev_info *, unsigned long);
- int (*bmap) (struct toi_bdev_info *);
- void (*free_storage) (struct toi_bdev_info *);
- unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used);
-};
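
As a sketch of how a backend plugs into this interface (the "null" names
are hypothetical; only the ops structure shapes come from the
declarations above):

	static int null_register_storage(void)
	{
		/* Build a toi_bdev_info and register it with the bio core. */
		return 0;
	}

	static unsigned long null_storage_available(void)
	{
		return 0;	/* Pages of storage on offer. */
	}

	static int null_allocate_storage(struct toi_bdev_info *chain,
					 unsigned long request)
	{
		return 0;	/* Pages actually allocated. */
	}

	static int null_bmap(struct toi_bdev_info *chain)
	{
		return 0;	/* Translate allocations into block extents. */
	}

	static void null_free_storage(struct toi_bdev_info *chain)
	{
	}

	static struct toi_bio_allocator_ops null_bio_allocator_ops = {
		.register_storage  = null_register_storage,
		.storage_available = null_storage_available,
		.allocate_storage  = null_allocate_storage,
		.bmap              = null_bmap,
		.free_storage      = null_free_storage,
	};

free_unused_storage is left unset: the bio core checks the pointer before
calling it, so it is effectively optional.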
diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c
deleted file mode 100644
index 086a5527d..000000000
--- a/kernel/power/tuxonice_bio_chains.c
+++ /dev/null
@@ -1,1126 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_chains.c
- *
- * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- */
-
-#include <linux/mm_types.h>
-#include "tuxonice_bio.h"
-#include "tuxonice_bio_internal.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-#include "tuxonice.h"
-#include "tuxonice_io.h"
-
-static struct toi_bdev_info *prio_chain_head;
-static int num_chains;
-
-/* Pointer to current entry being loaded/saved. */
-struct toi_extent_iterate_state toi_writer_posn;
-
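-/*
- * Size of the per-chain metadata saved in the image header: everything
- * in struct toi_bdev_info from uuid onwards (the "Saved in header"
- * fields in tuxonice_bio.h).
- */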
-#define metadata_size (sizeof(struct toi_bdev_info) - \
- offsetof(struct toi_bdev_info, uuid))
-
-/*
- * After section 0 (header) comes 2 => next_section[0] = 2
- */
-static int next_section[3] = { 2, 3, 1 };
-
-/**
- * dump_block_chains - print the contents of the bdev info array.
- **/
-void dump_block_chains(void)
-{
- int i = 0;
- int j;
- struct toi_bdev_info *cur_chain = prio_chain_head;
-
- while (cur_chain) {
- struct hibernate_extent *this = cur_chain->blocks.first;
-
- printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio);
-
- while (this) {
- printk(KERN_CONT " [%lu-%lu]%s", this->start,
- this->end, this->next ? "," : "");
- this = this->next;
- }
-
-		printk(KERN_CONT "\n");
- cur_chain = cur_chain->next;
- i++;
- }
-
- printk(KERN_DEBUG "Saved states:\n");
- for (i = 0; i < 4; i++) {
- printk(KERN_DEBUG "Slot %d: Chain %d.\n",
- i, toi_writer_posn.saved_chain_number[i]);
-
- cur_chain = prio_chain_head;
- j = 0;
- while (cur_chain) {
- printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n",
- j, cur_chain->saved_state[i].extent_num,
- cur_chain->saved_state[i].offset);
- cur_chain = cur_chain->next;
- j++;
- }
- printk(KERN_CONT "\n");
- }
-}
-
-/**
- * toi_extent_chain_next - advance the current chain's position one block
- *
- * Step to the next block in the current extent, moving to the following
- * extent (or marking the chain exhausted) when the current one ends.
- **/
-static void toi_extent_chain_next(void)
-{
- struct toi_bdev_info *this = toi_writer_posn.current_chain;
-
- if (!this->blocks.current_extent)
- return;
-
- if (this->blocks.current_offset == this->blocks.current_extent->end) {
- if (this->blocks.current_extent->next) {
- this->blocks.current_extent =
- this->blocks.current_extent->next;
- this->blocks.current_offset =
- this->blocks.current_extent->start;
- } else {
- this->blocks.current_extent = NULL;
- this->blocks.current_offset = 0;
- }
- } else
- this->blocks.current_offset++;
-}
-
-/**
- * __find_next_chain_same_prio - get the next usable chain of equal priority
- *
- * Starting from the current chain, cycle through the chains looking for
- * the next one of the same priority that still has blocks available,
- * settling for the original chain if no other qualifies.
- */
-
-static struct toi_bdev_info *__find_next_chain_same_prio(void)
-{
- struct toi_bdev_info *start_chain = toi_writer_posn.current_chain;
- struct toi_bdev_info *this = start_chain;
- int orig_prio = this->prio;
-
- do {
- this = this->next;
-
- if (!this)
- this = prio_chain_head;
-
- /* Back on original chain? Use it again. */
- if (this == start_chain)
- return start_chain;
-
- } while (!this->blocks.current_extent || this->prio != orig_prio);
-
- return this;
-}
-
-static void find_next_chain(void)
-{
- struct toi_bdev_info *this;
-
- this = __find_next_chain_same_prio();
-
- /*
- * If we didn't get another chain of the same priority that we
- * can use, look for the next priority.
- */
- while (this && !this->blocks.current_extent)
- this = this->next;
-
- toi_writer_posn.current_chain = this;
-}
-
-/**
- * toi_extent_state_next - go to the next extent
- * @blocks: The number of blocks to progress.
- * @current_stream: The stream being read/written; the header (stream 0)
- * is not striped across chains.
- *
- * Given a state, progress to the next valid entry. We may begin in an
- * invalid state, as we do when invoked after extent_state_goto_start below.
- *
- * When using compression and expected_compression > 0, we let the image size
- * be larger than storage, so we can validly run out of data to return.
- **/
-static unsigned long toi_extent_state_next(int blocks, int current_stream)
-{
- int i;
-
- if (!toi_writer_posn.current_chain)
- return -ENOSPC;
-
- /* Assume chains always have lengths that are multiples of @blocks */
- for (i = 0; i < blocks; i++)
- toi_extent_chain_next();
-
- /* The header stream is not striped */
- if (current_stream ||
- !toi_writer_posn.current_chain->blocks.current_extent)
- find_next_chain();
-
- return toi_writer_posn.current_chain ? 0 : -ENOSPC;
-}
-
-static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this)
-{
- struct toi_bdev_info **prev_ptr;
- struct toi_bdev_info *cur;
-
- /* Loop through the existing chain, finding where to insert it */
- prev_ptr = &prio_chain_head;
- cur = prio_chain_head;
-
- while (cur && cur->prio >= this->prio) {
- prev_ptr = &cur->next;
- cur = cur->next;
- }
-
- this->next = *prev_ptr;
- *prev_ptr = this;
-
- num_chains++;
-}
-
-/**
- * toi_extent_state_goto_start - reinitialize the extent chain iterator
- **/
-void toi_extent_state_goto_start(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
-
- while (this) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Setting current extent to %p.", this->blocks.first);
- this->blocks.current_extent = this->blocks.first;
- if (this->blocks.current_extent) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Setting current offset to %lu.",
- this->blocks.current_extent->start);
- this->blocks.current_offset =
- this->blocks.current_extent->start;
- }
-
- this = this->next;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.",
- prio_chain_head);
- toi_writer_posn.current_chain = prio_chain_head;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start.");
-}
-
-/**
- * toi_extent_state_save - save state of the iterator
- * @slot: Which of the four saved-state slots to populate.
- *
- * Save the current position of every chain in a format that can be used
- * with relocated chains (at resume time).
- **/
-void toi_extent_state_save(int slot)
-{
- struct toi_bdev_info *cur_chain = prio_chain_head;
- struct hibernate_extent *extent;
- struct hibernate_extent_saved_state *chain_state;
- int i = 0;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.",
- slot);
-
- if (!toi_writer_posn.current_chain) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => "
- "chain_num = -1.");
- toi_writer_posn.saved_chain_number[slot] = -1;
- return;
- }
-
- while (cur_chain) {
- i++;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) "
- "state, slot %d.", i, cur_chain, slot);
-
- chain_state = &cur_chain->saved_state[slot];
-
- chain_state->offset = cur_chain->blocks.current_offset;
-
- if (toi_writer_posn.current_chain == cur_chain) {
- toi_writer_posn.saved_chain_number[slot] = i;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain "
- "we were on => chain_num is %d.", i);
- }
-
- if (!cur_chain->blocks.current_extent) {
- chain_state->extent_num = 0;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent "
- "for this chain => extent_num %d is 0.",
- i);
- cur_chain = cur_chain->next;
- continue;
- }
-
- extent = cur_chain->blocks.first;
- chain_state->extent_num = 1;
-
- while (extent != cur_chain->blocks.current_extent) {
- chain_state->extent_num++;
- extent = extent->next;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i,
- chain_state->extent_num);
-
- cur_chain = cur_chain->next;
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Completed saving extent state slot %d.", slot);
-}
-
-/**
- * toi_extent_state_restore - restore the position saved by extent_state_save
- * @slot: Which of the four saved-state slots to restore from.
- **/
-void toi_extent_state_restore(int slot)
-{
- int i = 0;
- struct toi_bdev_info *cur_chain = prio_chain_head;
- struct hibernate_extent_saved_state *chain_state;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "toi_extent_state_restore - slot %d.", slot);
-
- if (toi_writer_posn.saved_chain_number[slot] == -1) {
- toi_writer_posn.current_chain = NULL;
- return;
- }
-
- while (cur_chain) {
- int posn;
- int j;
- i++;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) "
- "state, slot %d.", i, cur_chain, slot);
-
- chain_state = &cur_chain->saved_state[slot];
-
- posn = chain_state->extent_num;
-
- cur_chain->blocks.current_extent = cur_chain->blocks.first;
- cur_chain->blocks.current_offset = chain_state->offset;
-
- if (i == toi_writer_posn.saved_chain_number[slot]) {
- toi_writer_posn.current_chain = cur_chain;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Found current chain.");
- }
-
- for (j = 0; j < 4; j++)
- if (i == toi_writer_posn.saved_chain_number[j]) {
- toi_writer_posn.saved_chain_ptr[j] = cur_chain;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Found saved chain ptr %d (%p) (offset"
- " %d).", j, cur_chain,
- cur_chain->saved_state[j].offset);
- }
-
- if (posn) {
- while (--posn)
- cur_chain->blocks.current_extent =
- cur_chain->blocks.current_extent->next;
- } else
- cur_chain->blocks.current_extent = NULL;
-
- cur_chain = cur_chain->next;
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done.");
- if (test_action_state(TOI_LOGALL))
- dump_block_chains();
-}
-
-/*
- * Storage needed
- *
- * Returns amount of space in the image header required
- * for the chain data. This ignores the links between
- * pages, which we factor in when allocating the space.
- */
-int toi_bio_devinfo_storage_needed(void)
-{
- int result = sizeof(num_chains);
- struct toi_bdev_info *chain = prio_chain_head;
-
- while (chain) {
- result += metadata_size;
-
- /* Chain size */
- result += sizeof(int);
-
- /* Extents */
- result += (2 * sizeof(unsigned long) *
- chain->blocks.num_extents);
-
- chain = chain->next;
- }
-
- result += 4 * sizeof(int);
- return result;
-}
-
-static unsigned long chain_pages_used(struct toi_bdev_info *chain)
-{
- struct hibernate_extent *this = chain->blocks.first;
- struct hibernate_extent_saved_state *state = &chain->saved_state[3];
- unsigned long size = 0;
- int extent_idx = 1;
-
- if (!state->extent_num) {
- if (!this)
- return 0;
- else
- return chain->blocks.size;
- }
-
- while (extent_idx < state->extent_num) {
- size += (this->end - this->start + 1);
- this = this->next;
- extent_idx++;
- }
-
- /* We didn't use the one we're sitting on, so don't count it */
- return size + state->offset - this->start;
-}
-
-void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain)
-{
- unsigned long used = chain_pages_used(chain);
-
- /* Free the storage */
- unsigned long first_freed = 0;
-
- if (chain->allocator->bio_allocator_ops->free_unused_storage)
- first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used);
-
-	printk(KERN_DEBUG "Used %ld blocks in this chain. First extent freed "
-			"is %lx.\n", used, first_freed);
-
- /* Adjust / free the extents. */
- toi_put_extent_chain_from(&chain->blocks, first_freed);
-
- {
- struct hibernate_extent *this = chain->blocks.first;
- while (this) {
-			printk(KERN_DEBUG "Extent %lx-%lx.\n", this->start, this->end);
- this = this->next;
- }
- }
-}
-
-/**
- * toi_serialise_extent_chain - write a chain in the image
- * @chain: Chain to write.
- **/
-static int toi_serialise_extent_chain(struct toi_bdev_info *chain)
-{
- struct hibernate_extent *this;
- int ret;
- int i = 1;
-
- chain->pages_used = chain_pages_used(chain);
-
- if (test_action_state(TOI_LOGALL))
- dump_block_chains();
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).",
- chain->dev_t);
- /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */
- ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
- (char *) &chain->uuid, metadata_size);
- if (ret)
- return ret;
-
- /* Num extents */
- ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
- (char *) &chain->blocks.num_extents, sizeof(int));
- if (ret)
- return ret;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
- chain->blocks.num_extents);
-
- this = chain->blocks.first;
- while (this) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i);
- ret = toiActiveAllocator->rw_header_chunk(WRITE,
- &toi_blockwriter_ops,
- (char *) this, 2 * sizeof(this->start));
- if (ret)
- return ret;
- this = this->next;
- i++;
- }
-
- return ret;
-}
-
-int toi_serialise_extent_chains(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
- int result;
-
- /* Write the number of chains */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)",
- num_chains);
- result = toiActiveAllocator->rw_header_chunk(WRITE,
- &toi_blockwriter_ops, (char *) &num_chains,
- sizeof(int));
- if (result)
- return result;
-
- /* Then the chains themselves */
- while (this) {
- result = toi_serialise_extent_chain(this);
- if (result)
- return result;
- this = this->next;
- }
-
- /*
- * Finally, the chain we should be on at the start of each
- * section.
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers.");
- result = toiActiveAllocator->rw_header_chunk(WRITE,
- &toi_blockwriter_ops,
- (char *) &toi_writer_posn.saved_chain_number[0],
- 4 * sizeof(int));
-
- return result;
-}
-
-int toi_register_storage_chain(struct toi_bdev_info *new)
-{
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.",
- new);
- toi_insert_chain_in_prio_list(new);
- return 0;
-}
-
-static void free_bdev_info(struct toi_bdev_info *chain)
-{
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain);
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents.");
- toi_put_extent_chain(&chain->blocks);
-
- /*
- * The allocator may need to do more than just free the chains
- * (swap_free, for example). Don't call from boot kernel.
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents.");
- if (chain->allocator)
- chain->allocator->bio_allocator_ops->free_storage(chain);
-
- /*
- * Dropping out of reading atomic copy? Need to undo
- * toi_open_by_devnum.
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev.");
- if (chain->bdev && !IS_ERR(chain->bdev) &&
- chain->bdev != resume_block_device &&
- chain->bdev != header_block_device &&
- test_toi_state(TOI_TRYING_TO_RESUME))
- toi_close_bdev(chain->bdev);
-
- /* Poison */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct.");
- toi_kfree(39, chain, sizeof(*chain));
-
- if (prio_chain_head == chain)
- prio_chain_head = NULL;
-
- num_chains--;
-}
-
-void free_all_bdev_info(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
-
- while (this) {
- struct toi_bdev_info *next = this->next;
- free_bdev_info(this);
- this = next;
- }
-
- memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn));
- prio_chain_head = NULL;
-}
-
-static void set_up_start_position(void)
-{
- toi_writer_posn.current_chain = prio_chain_head;
- go_next_page(0, 0);
-}
-
-/**
- * toi_load_extent_chain - read back a chain saved in the image
- * @chain: Chain to load
- *
- * The linked list of extents is reconstructed from the disk. chain will point
- * to the first entry.
- **/
-int toi_load_extent_chain(int index, int *num_loaded)
-{
- struct toi_bdev_info *chain = toi_kzalloc(39,
- sizeof(struct toi_bdev_info), GFP_ATOMIC);
- struct hibernate_extent *this, *last = NULL;
- int i, ret;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index);
- /* Get dev_t, prio, bmap_shift, blocks per page, positions */
- ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
- (char *) &chain->uuid, metadata_size);
-
- if (ret) {
-		printk(KERN_ERR "Failed to read extent chain metadata.\n");
- toi_kfree(39, chain, sizeof(*chain));
- return 1;
- }
-
- toi_bkd.pages_used[index] = chain->pages_used;
-
- ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
- (char *) &chain->blocks.num_extents, sizeof(int));
- if (ret) {
- printk(KERN_ERR "Failed to read the size of extent chain.\n");
- toi_kfree(39, chain, sizeof(*chain));
- return 1;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
- chain->blocks.num_extents);
-
- for (i = 0; i < chain->blocks.num_extents; i++) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1);
-
- this = toi_kzalloc(2, sizeof(struct hibernate_extent),
- TOI_ATOMIC_GFP);
- if (!this) {
- printk(KERN_INFO "Failed to allocate a new extent.\n");
- free_bdev_info(chain);
- return -ENOMEM;
- }
- this->next = NULL;
- /* Get the next page */
- ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
- NULL, (char *) this, 2 * sizeof(this->start));
- if (ret) {
- printk(KERN_INFO "Failed to read an extent.\n");
- toi_kfree(2, this, sizeof(struct hibernate_extent));
- free_bdev_info(chain);
- return 1;
- }
-
- if (last)
- last->next = this;
- else {
- char b1[32], b2[32], b3[32];
- /*
- * Open the bdev
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Chain dev_t is %s. Resume dev t is %s. Header"
- " bdev_t is %s.\n",
- format_dev_t(b1, chain->dev_t),
- format_dev_t(b2, resume_dev_t),
- format_dev_t(b3, toi_sig_data->header_dev_t));
-
- if (chain->dev_t == resume_dev_t)
- chain->bdev = resume_block_device;
- else if (chain->dev_t == toi_sig_data->header_dev_t)
- chain->bdev = header_block_device;
- else {
- chain->bdev = toi_open_bdev(chain->uuid,
- chain->dev_t, 1);
- if (IS_ERR(chain->bdev)) {
- free_bdev_info(chain);
- return -ENODEV;
- }
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift "
- "is %d and blocks per page is %d.",
- chain->bmap_shift,
- chain->blocks_per_page);
-
- chain->blocks.first = this;
-
- /*
- * Couldn't do this earlier, but can't do
- * goto_start now - we may have already used blocks
- * in the first chain.
- */
- chain->blocks.current_extent = this;
- chain->blocks.current_offset = this->start;
-
- /*
- * Can't wait until we've read the whole chain
- * before we insert it in the list. We might need
- * this chain to read the next page in the header
- */
- toi_insert_chain_in_prio_list(chain);
- }
-
- /*
- * We have to wait until 2 extents are loaded before setting up
- * properly because if the first extent has only one page, we
- * will need to put the position on the second extent. Sounds
- * obvious, but it wasn't!
- */
- (*num_loaded)++;
- if ((*num_loaded) == 2)
- set_up_start_position();
- last = this;
- }
-
- /*
- * Shouldn't get empty chains, but it's not impossible. Link them in so
- * they get freed properly later.
- */
- if (!chain->blocks.num_extents)
- toi_insert_chain_in_prio_list(chain);
-
- if (!chain->blocks.current_extent) {
- chain->blocks.current_extent = chain->blocks.first;
- if (chain->blocks.current_extent)
- chain->blocks.current_offset =
- chain->blocks.current_extent->start;
- }
- return 0;
-}
-
-int toi_load_extent_chains(void)
-{
- int result;
- int to_load;
- int i;
- int extents_loaded = 0;
-
- result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
- (char *) &to_load,
- sizeof(int));
- if (result)
- return result;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load);
-
- for (i = 0; i < to_load; i++) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.",
- i, to_load);
- result = toi_load_extent_chain(i, &extents_loaded);
- if (result)
- return result;
- }
-
- /* If we never got to a second extent, we still need to do this. */
- if (extents_loaded == 1)
- set_up_start_position();
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers.");
- result = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
- &toi_blockwriter_ops,
- (char *) &toi_writer_posn.saved_chain_number[0],
- 4 * sizeof(int));
-
- return result;
-}
-
-static int toi_end_of_stream(int writing, int section_barrier)
-{
- struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
- int compare_to = next_section[current_stream];
- struct toi_bdev_info *compare_chain =
- toi_writer_posn.saved_chain_ptr[compare_to];
- int compare_offset = compare_chain ?
- compare_chain->saved_state[compare_to].offset : 0;
-
- if (!section_barrier)
- return 0;
-
- if (!cur_chain)
- return 1;
-
- if (cur_chain == compare_chain &&
- cur_chain->blocks.current_offset == compare_offset) {
- if (writing) {
- if (!current_stream) {
- debug_broken_header();
- return 1;
- }
- } else {
- more_readahead = 0;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Reached the end of stream %d "
- "(not an error).", current_stream);
- return 1;
- }
- }
-
- return 0;
-}
-
-/**
- * go_next_page - skip blocks to the start of the next page
- * @writing: Whether we're reading or writing the image.
- *
- * Go forward one page.
- **/
-int go_next_page(int writing, int section_barrier)
-{
- struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
- int max = cur_chain ? cur_chain->blocks_per_page : 1;
-
-	/* Go forward a page - or maybe two. Don't stripe the header,
- * so that bad fragmentation doesn't put the extent data containing
- * the location of the second page out of the first header page.
- */
- if (toi_extent_state_next(max, current_stream)) {
- /* Don't complain if readahead falls off the end */
- if (writing && section_barrier) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. "
- "Expected compression ratio too optimistic?");
- if (test_action_state(TOI_LOGALL))
- dump_block_chains();
- }
-		toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to "
-				"read/write. (Not necessarily a fatal error.)");
- return -ENOSPC;
- }
-
- return 0;
-}
-
-int devices_of_same_priority(struct toi_bdev_info *this)
-{
- struct toi_bdev_info *check = prio_chain_head;
- int i = 0;
-
- while (check) {
- if (check->prio == this->prio)
- i++;
- check = check->next;
- }
-
- return i;
-}
-
-/**
- * toi_bio_rw_page - do i/o on the next disk page in the image
- * @writing: Whether reading or writing.
- * @page: Page to do i/o on.
- * @is_readahead: Whether we're doing readahead
- * @free_group: The group used in allocating the page
- *
- * Submit a page for reading or writing, possibly readahead.
- * Pass the group used in allocating the page as well, as it should
- * be freed on completion of the bio if we're writing the page.
- **/
-int toi_bio_rw_page(int writing, struct page *page,
- int is_readahead, int free_group)
-{
- int result = toi_end_of_stream(writing, 1);
- struct toi_bdev_info *dev_info = toi_writer_posn.current_chain;
-
- if (result) {
- if (writing)
- abort_hibernate(TOI_INSUFFICIENT_STORAGE,
- "Insufficient storage for your image.");
- else
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to "
- "read/write another page when stream has "
- "ended.");
- return -ENOSPC;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "%s %lx:%ld",
- writing ? "Write" : "Read",
- dev_info->dev_t, dev_info->blocks.current_offset);
-
- result = toi_do_io(writing, dev_info->bdev,
- dev_info->blocks.current_offset << dev_info->bmap_shift,
- page, is_readahead, 0, free_group);
-
- /* Ignore the result here - will check end of stream if come in again */
- go_next_page(writing, 1);
-
- if (result)
- printk(KERN_ERR "toi_do_io returned %d.\n", result);
- return result;
-}
-
-dev_t get_header_dev_t(void)
-{
- return prio_chain_head->dev_t;
-}
-
-struct block_device *get_header_bdev(void)
-{
- return prio_chain_head->bdev;
-}
-
-unsigned long get_headerblock(void)
-{
- return prio_chain_head->blocks.first->start <<
- prio_chain_head->bmap_shift;
-}
-
-int get_main_pool_phys_params(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
- int result;
-
- while (this) {
- result = this->allocator->bio_allocator_ops->bmap(this);
- if (result)
- return result;
- this = this->next;
- }
-
- return 0;
-}
-
-static int apply_header_reservation(void)
-{
- int i;
-
- if (!header_pages_reserved) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "No header pages reserved at the moment.");
- return 0;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation.");
-
- /* Apply header space reservation */
- toi_extent_state_goto_start();
-
- for (i = 0; i < header_pages_reserved; i++)
- if (go_next_page(1, 0))
- return -ENOSPC;
-
- /* The end of header pages will be the start of pageset 2 */
- toi_extent_state_save(2);
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Finished applying header reservation.");
- return 0;
-}
-
-static int toi_bio_register_storage(void)
-{
- int result = 0;
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- this_module->type != BIO_ALLOCATOR_MODULE)
- continue;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Registering storage from %s.",
- this_module->name);
- result = this_module->bio_allocator_ops->register_storage();
- if (result)
- break;
- }
-
- return result;
-}
-
-void toi_bio_free_unused_storage(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
-
- while (this) {
- toi_bio_free_unused_storage_chain(this);
- this = this->next;
- }
-}
-
-int toi_bio_allocate_storage(unsigned long request)
-{
- struct toi_bdev_info *chain = prio_chain_head;
- unsigned long to_get = request;
- unsigned long extra_pages, needed;
- int no_free = 0;
-
- if (!chain) {
- int result = toi_bio_register_storage();
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
- "Registering storage.");
- if (result)
- return 0;
- chain = prio_chain_head;
- if (!chain) {
-			printk(KERN_WARNING "TuxOnIce: No storage was registered.\n");
- return 0;
- }
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
- "Request is %lu pages.", request);
- extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long)
- + sizeof(int)), PAGE_SIZE);
- needed = request + extra_pages + header_pages_reserved;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu "
- "for header => %lu.",
- extra_pages, header_pages_reserved, needed);
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.",
- raw_pages_allocd);
-
- to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : 0;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get);
-
- if (!to_get)
- return apply_header_reservation();
-
- while (to_get && chain) {
- int num_group = devices_of_same_priority(chain);
- int divisor = num_group - no_free;
- int i;
- unsigned long portion = DIV_ROUND_UP(to_get, divisor);
- unsigned long got = 0;
- unsigned long got_this_round = 0;
- struct toi_bdev_info *top = chain;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- " Start of loop. To get is %lu. Divisor is %d.",
- to_get, divisor);
- no_free = 0;
-
- /*
- * We're aiming to spread the allocated storage as evenly
- * as possible, but we also want to get all the storage we
- * can off this priority.
- */
- for (i = 0; i < num_group; i++) {
- struct toi_bio_allocator_ops *ops =
- chain->allocator->bio_allocator_ops;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- " Asking for %lu pages from chain %p.",
- portion, chain);
- got = ops->allocate_storage(chain, portion);
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- " Got %lu pages from allocator %p.",
- got, chain);
- if (!got)
- no_free++;
- got_this_round += got;
- chain = chain->next;
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a "
- "total of %lu pages from %d allocators.",
- got_this_round, divisor - no_free);
-
- raw_pages_allocd += got_this_round;
- to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd :
- 0;
-
- /*
- * If we got anything from chains of this priority and we
- * still have storage to allocate, go over this priority
- * again.
- */
- if (got_this_round && to_get)
- chain = top;
- else
- no_free = 0;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling "
- "get_main_pool_phys_params");
- /* Now let swap allocator bmap the pages */
- get_main_pool_phys_params();
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. Reserving header.");
- return apply_header_reservation();
-}
-
-void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd)
-{
- int i = 0;
- struct toi_bdev_info *cur_chain = prio_chain_head;
-
- while (cur_chain) {
- cur_chain->pages_used = bkd->pages_used[i];
- cur_chain = cur_chain->next;
- i++;
- }
-}
-
-int toi_bio_chains_debug_info(char *buffer, int size)
-{
- /* Show what we actually used */
- struct toi_bdev_info *cur_chain = prio_chain_head;
- int len = 0;
-
- while (cur_chain) {
- len += scnprintf(buffer + len, size - len, " Used %lu pages "
- "from %s.\n", cur_chain->pages_used,
- cur_chain->name);
- cur_chain = cur_chain->next;
- }
-
- return len;
-}
-
-void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
-{
- struct toi_bdev_info *this = toi_writer_posn.current_chain,
- *cmp = prio_chain_head;
-
- ptr->save.chain = 1;
- while (this != cmp) {
- ptr->save.chain++;
- cmp = cmp->next;
- }
- ptr->save.block = this->blocks.current_offset;
-
- /* Save the raw info internally for quicker access when updating pointers */
- ptr->bdev = this->bdev;
- ptr->block = this->blocks.current_offset << this->bmap_shift;
-}
-
-void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
-{
- int i = ptr->save.chain - 1;
- struct toi_bdev_info *this;
- struct hibernate_extent *hib;
-
- /* Find chain by stored index */
- this = prio_chain_head;
- while (i) {
- this = this->next;
- i--;
- }
- toi_writer_posn.current_chain = this;
-
- /* Restore block */
- this->blocks.current_offset = ptr->save.block;
-
- /* Find current offset from block number */
- hib = this->blocks.first;
-
- while (hib->start > ptr->save.block) {
- hib = hib->next;
- }
-
- this->blocks.last_touched = this->blocks.current_extent = hib;
-}
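
For reference, the header layout implied by toi_serialise_extent_chains()
and toi_load_extent_chains() above (a reading of the code, not a separate
on-disk specification):

	int num_chains;
	/* repeated num_chains times:                                    */
	/*   chain metadata (struct toi_bdev_info from uuid onwards)     */
	/*   int num_extents;                                            */
	/*   num_extents x { unsigned long start; unsigned long end; }   */
	int saved_chain_number[4];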
diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c
deleted file mode 100644
index 87aa4c96e..000000000
--- a/kernel/power/tuxonice_bio_core.c
+++ /dev/null
@@ -1,1932 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_core.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file contains block io functions for TuxOnIce. These are
- * used by the swapwriter and it is planned that they will also
- * be used by the NFSwriter.
- *
- */
-
-#include <linux/blkdev.h>
-#include <linux/syscalls.h>
-#include <linux/suspend.h>
-#include <linux/ctype.h>
-#include <linux/mount.h>
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_io.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_bio_internal.h"
-
-#define MEMORY_ONLY 1
-#define THROTTLE_WAIT 2
-
-/* #define MEASURE_MUTEX_CONTENTION */
-#ifndef MEASURE_MUTEX_CONTENTION
-#define my_mutex_lock(index, the_lock) mutex_lock(the_lock)
-#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock)
-#else
-unsigned long mutex_times[2][2][NR_CPUS];
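-
-/*
- * Note: my_mutex_lock() opens a do { } block that my_mutex_unlock()
- * closes, so the two must always be used as a matched pair within a
- * single scope.
- */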
-#define my_mutex_lock(index, the_lock) do { \
- int have_mutex; \
- have_mutex = mutex_trylock(the_lock); \
- if (!have_mutex) { \
- mutex_lock(the_lock); \
- mutex_times[index][0][smp_processor_id()]++; \
- } else { \
- mutex_times[index][1][smp_processor_id()]++; \
- }
-
-#define my_mutex_unlock(index, the_lock) \
- mutex_unlock(the_lock); \
-} while (0)
-#endif
-
-static int page_idx, reset_idx;
-
-static int target_outstanding_io = 1024;
-static int max_outstanding_writes, max_outstanding_reads;
-
-static struct page *bio_queue_head, *bio_queue_tail;
-static atomic_t toi_bio_queue_size;
-static DEFINE_SPINLOCK(bio_queue_lock);
-
-static int free_mem_throttle, throughput_throttle;
-int more_readahead = 1;
-static struct page *readahead_list_head, *readahead_list_tail;
-
-static struct page *waiting_on;
-
-static atomic_t toi_io_in_progress, toi_io_done;
-static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait);
-
-int current_stream;
-/* Not static, so that the allocators can setup and complete
- * writing the header */
-char *toi_writer_buffer;
-int toi_writer_buffer_posn;
-
-static DEFINE_MUTEX(toi_bio_mutex);
-static DEFINE_MUTEX(toi_bio_readahead_mutex);
-
-static struct task_struct *toi_queue_flusher;
-static int toi_bio_queue_flush_pages(int dedicated_thread);
-
-struct toi_module_ops toi_blockwriter_ops;
-
-struct toi_incremental_image_pointer toi_inc_ptr[2][2];
-
-#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \
- atomic_read(&toi_bio_queue_size))
-
-unsigned long raw_pages_allocd, header_pages_reserved;
-
-static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
- int no_readahead);
-
-/**
- * set_free_mem_throttle - set the point where we pause to avoid oom.
- *
- * Initially, this value is zero, but when we first fail to allocate memory,
- * we set it (plus a buffer) and thereafter throttle i/o once that limit is
- * reached.
- **/
-static void set_free_mem_throttle(void)
-{
- int new_throttle = nr_free_buffer_pages() + 256;
-
- if (new_throttle > free_mem_throttle)
- free_mem_throttle = new_throttle;
-}
-
-#define NUM_REASONS 7
-static atomic_t reasons[NUM_REASONS];
-static char *reason_name[NUM_REASONS] = {
- "readahead not ready",
- "bio allocation",
- "synchronous I/O",
- "toi_bio_get_new_page",
- "memory low",
- "readahead buffer allocation",
- "throughput_throttle",
-};
-
-/* User Specified Parameters. */
-unsigned long resume_firstblock;
-dev_t resume_dev_t;
-struct block_device *resume_block_device;
-static atomic_t resume_bdev_open_count;
-
-struct block_device *header_block_device;
-
-/**
- * toi_open_bdev: Open a bdev at resume time.
- *
- * uuid: The filesystem uuid to resolve, if any (16 bytes; may be NULL).
- * default_device: The dev_t to fall back on when there is no uuid or it
- * can't be resolved (the user can have resume= pointing at a swap
- * partition/file that isn't swapon'd when they hibernate).
- * display_errs: Whether to report errors, or do this quietly.
- *
- * We stored a dev_t in the image header. Open the matching device without
- * requiring /dev/<whatever> in most cases and record the details needed
- * to close it later and avoid duplicating work.
- */
-struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
- int display_errs)
-{
- struct block_device *bdev;
- dev_t device = default_device;
- char buf[32];
- int retried = 0;
-
-retry:
- if (uuid) {
- struct fs_info seek;
- strncpy((char *) &seek.uuid, uuid, 16);
- seek.dev_t = 0;
- seek.last_mount_size = 0;
- device = blk_lookup_fs_info(&seek);
- if (!device) {
- device = default_device;
- printk(KERN_DEBUG "Unable to resolve uuid. Falling back"
- " to dev_t.\n");
- } else
- printk(KERN_DEBUG "Resolved uuid to device %s.\n",
- format_dev_t(buf, device));
- }
-
- if (!device) {
- printk(KERN_ERR "TuxOnIce attempting to open a "
- "blank dev_t!\n");
- dump_stack();
- return NULL;
- }
- bdev = toi_open_by_devnum(device);
-
- if (IS_ERR(bdev) || !bdev) {
- if (!retried) {
- retried = 1;
- wait_for_device_probe();
- goto retry;
- }
- if (display_errs)
-			toi_early_boot_message(1, TOI_CONTINUE_REQ,
-				"Failed to get access to block device "
-				"\"%x\" (error %d).\n Maybe you need "
-				"to run mknod and/or lvmsetup in an "
-				"initrd/ramfs?", device,
-				(int) PTR_ERR(bdev));
- return ERR_PTR(-EINVAL);
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "TuxOnIce got bdev %p for dev_t %x.",
- bdev, device);
-
- return bdev;
-}
-
-static void toi_bio_reserve_header_space(unsigned long request)
-{
- header_pages_reserved = request;
-}
-
-/**
- * do_bio_wait - wait for some TuxOnIce I/O to complete
- * @reason: The array index of the reason we're waiting.
- *
- * Wait for a particular page of I/O if we're after a particular page.
- * If we're not after a particular page, wait instead for all in flight
- * I/O to be completed or for us to have enough free memory to be able
- * to submit more I/O.
- *
- * If we wait, we also update our statistics regarding why we waited.
- **/
-static void do_bio_wait(int reason)
-{
- struct page *was_waiting_on = waiting_on;
-
- /* On SMP, waiting_on can be reset, so we make a copy */
- if (was_waiting_on) {
- wait_on_page_locked(was_waiting_on);
- atomic_inc(&reasons[reason]);
- } else {
- atomic_inc(&reasons[reason]);
-
- wait_event(num_in_progress_wait,
- !atomic_read(&toi_io_in_progress) ||
- nr_free_buffer_pages() > free_mem_throttle);
- }
-}
-
-/**
- * throttle_if_needed - wait for I/O completion if throttle points are reached
- * @flags: What to check and how to act.
- *
- * Check whether we need to wait for some I/O to complete. We always check
- * whether we have enough memory available, but may also (depending upon
- * @reason) check if the throughput throttle limit has been reached.
- **/
-static int throttle_if_needed(int flags)
-{
- int free_pages = nr_free_buffer_pages();
-
- /* Getting low on memory and I/O is in progress? */
- while (unlikely(free_pages < free_mem_throttle) &&
- atomic_read(&toi_io_in_progress) &&
- !test_result_state(TOI_ABORTED)) {
- if (!(flags & THROTTLE_WAIT))
- return -ENOMEM;
- do_bio_wait(4);
- free_pages = nr_free_buffer_pages();
- }
-
- while (!(flags & MEMORY_ONLY) && throughput_throttle &&
- TOTAL_OUTSTANDING_IO >= throughput_throttle &&
- !test_result_state(TOI_ABORTED)) {
- int result = toi_bio_queue_flush_pages(0);
- if (result)
- return result;
- atomic_inc(&reasons[6]);
- wait_event(num_in_progress_wait,
- !atomic_read(&toi_io_in_progress) ||
- TOTAL_OUTSTANDING_IO < throughput_throttle);
- }
-
- return 0;
-}
-
-/**
- * update_throughput_throttle - update the raw throughput throttle
- * @jif_index: The number of times this function has been called.
- *
- * This function is called four times per second by the core, and used to limit
- * the amount of I/O we submit at once, spreading out our waiting through the
- * whole job and letting userui get an opportunity to do its work.
- *
- * We don't start limiting I/O until 1/4s has elapsed, so that we get a
- * decent sample for our initial limit, and we keep updating it because
- * throughput may vary (e.g. on rotating media) with our block number.
- *
- * We throttle to 1/10s worth of I/O.
- **/
-static void update_throughput_throttle(int jif_index)
-{
- int done = atomic_read(&toi_io_done);
- throughput_throttle = done * 2 / 5 / jif_index;
-}
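-
-/*
- * Worked example with made-up numbers: if 8000 pages of I/O have
- * completed by the eighth call (i.e. after two seconds), the measured
- * rate is 1000 pages per quarter second, and we allow
- * 8000 * 2 / 5 / 8 = 400 pages - one tenth of a second's worth - to be
- * outstanding at once.
- */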
-
-/**
- * toi_finish_all_io - wait for all outstanding i/o to complete
- *
- * Flush any queued but unsubmitted I/O and wait for it all to complete.
- **/
-static int toi_finish_all_io(void)
-{
- int result = toi_bio_queue_flush_pages(0);
- toi_bio_queue_flusher_should_finish = 1;
- wake_up(&toi_io_queue_flusher);
- wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO);
- return result;
-}
-
-/**
- * toi_end_bio - bio completion function.
- * @bio: bio that has completed.
- *
- * Function called by the block driver from interrupt context when I/O is
- * completed. If we were writing the page, we want to free it and will have
- * set bio->bi_private to the parameter we should use in telling the page
- * allocation accounting code what the page was allocated for. If we're
- * reading the page, it will be in the singly linked list made from
- * page->private pointers.
- **/
-static void toi_end_bio(struct bio *bio)
-{
- struct page *page = bio->bi_io_vec[0].bv_page;
-
- BUG_ON(bio->bi_error);
-
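-	/*
-	 * submit() took an extra reference with bio_get(), so the bio stays
-	 * valid across this first bio_put(); the second bio_put() below
-	 * drops the final reference.
-	 */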
- unlock_page(page);
- bio_put(bio);
-
- if (waiting_on == page)
- waiting_on = NULL;
-
- put_page(page);
-
- if (bio->bi_private)
-		toi__free_page((int) ((unsigned long) bio->bi_private), page);
-
- bio_put(bio);
-
- atomic_dec(&toi_io_in_progress);
- atomic_inc(&toi_io_done);
-
- wake_up(&num_in_progress_wait);
-}
-
-/**
- * submit - submit BIO request
- * @writing: READ or WRITE.
- * @dev: The block device we're using.
- * @first_block: The first sector we're using.
- * @page: The page being used for I/O.
- * @free_group: If writing, the group that was used in allocating the page
- * and which will be used in freeing the page from the completion
- * routine.
- *
- * Based on Patrick Mochel's pmdisk code from long ago: "Straight from the
- * textbook - allocate and initialize the bio. If we're writing, make sure
- * the page is marked as dirty. Then submit it and carry on."
- *
- * If we're just testing the speed of our own code, we fake having done all
- * the hard work and call toi_end_bio immediately.
- **/
-static int submit(int writing, struct block_device *dev, sector_t first_block,
- struct page *page, int free_group)
-{
- struct bio *bio = NULL;
- int cur_outstanding_io, result;
-
- /*
- * Shouldn't throttle if reading - can deadlock in the single
- * threaded case as pages are only freed when we use the
- * readahead.
- */
- if (writing) {
- result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT);
- if (result)
- return result;
- }
-
- while (!bio) {
- bio = bio_alloc(TOI_ATOMIC_GFP, 1);
- if (!bio) {
- set_free_mem_throttle();
- do_bio_wait(1);
- }
- }
-
- bio->bi_bdev = dev;
- bio->bi_iter.bi_sector = first_block;
- bio->bi_private = (void *) ((unsigned long) free_group);
- bio->bi_end_io = toi_end_bio;
- bio_set_flag(bio, BIO_TOI);
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n",
- (unsigned long long) first_block);
- bio_put(bio);
- return -EFAULT;
- }
-
- bio_get(bio);
-
- cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress);
- if (writing) {
- if (cur_outstanding_io > max_outstanding_writes)
- max_outstanding_writes = cur_outstanding_io;
- } else {
- if (cur_outstanding_io > max_outstanding_reads)
- max_outstanding_reads = cur_outstanding_io;
- }
-
- /* Still read the header! */
- if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) {
- /* Fake having done the hard work */
- bio->bi_error = 0;
- toi_end_bio(bio);
- } else
- submit_bio(writing | REQ_SYNC, bio);
-
- return 0;
-}
-
-/**
- * toi_do_io - prepare to do some i/o on a page and submit or batch it
- *
- * @writing: Whether reading or writing.
- * @bdev: The block device which we're using.
- * @block0: The first sector we're reading or writing.
- * @page: The page on which I/O is being done.
- * @is_readahead: Whether the page is being submitted as readahead.
- * @syncio: Whether the i/o is being done synchronously.
- * @free_group: If writing, the group used in allocating the page, passed
- * to the completion routine when freeing it.
- *
- * Prepare and start a read or write operation.
- *
- * Note that we always work with our own page. If writing, we might be given a
- * compression buffer that will immediately be used to start compressing the
- * next page. For reading, we do readahead and therefore don't know the final
- * address where the data needs to go.
- **/
-int toi_do_io(int writing, struct block_device *bdev, long block0,
- struct page *page, int is_readahead, int syncio, int free_group)
-{
- page->private = 0;
-
- /* Do here so we don't race against toi_bio_get_next_page_read */
- lock_page(page);
-
- if (is_readahead) {
- if (readahead_list_head)
- readahead_list_tail->private = (unsigned long) page;
- else
- readahead_list_head = page;
-
- readahead_list_tail = page;
- }
-
- /* Done before submitting to avoid races. */
- if (syncio)
- waiting_on = page;
-
- /* Submit the page */
- get_page(page);
-
- if (submit(writing, bdev, block0, page, free_group))
- return -EFAULT;
-
- if (syncio)
- do_bio_wait(2);
-
- return 0;
-}
-
-/**
- * toi_bdev_page_io - simpler interface to do directly i/o on a single page
- * @writing: Whether reading or writing.
- * @bdev: Block device on which we're operating.
- * @pos: Sector at which page to read or write starts.
- * @page: Page to be read/written.
- *
- * A simple interface to submit a page of I/O and wait for its completion.
- * The caller must free the page used.
- **/
-static int toi_bdev_page_io(int writing, struct block_device *bdev,
- long pos, struct page *page)
-{
- return toi_do_io(writing, bdev, pos, page, 0, 1, 0);
-}
-
-/**
- * toi_bio_memory_needed - report the amount of memory needed for block i/o
- *
- * We want to have at least enough memory so as to have target_outstanding_io
- * or more transactions on the fly at once. If we can do more, fine.
- **/
-static int toi_bio_memory_needed(void)
-{
- return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) +
- sizeof(struct bio));
-}
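-
-/*
- * For example (hypothetical values): with 4096-byte pages and
- * target_outstanding_io = 256, and supposing sizeof(struct request) and
- * sizeof(struct bio) together come to roughly 400 bytes, we ask for
- * about 256 * 4496 bytes, i.e. a little over 1MB.
- */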
-
-/**
- * toi_bio_print_debug_stats - put out debugging info in the buffer provided
- * @buffer: A buffer of size @size into which text should be placed.
- * @size: The size of @buffer.
- *
- * Fill a buffer with debugging info. This is used both for our debug_info
- * sysfs entry and for recording the same info in dmesg.
- **/
-static int toi_bio_print_debug_stats(char *buffer, int size)
-{
- int len = 0;
-
- if (toiActiveAllocator != &toi_blockwriter_ops) {
- len = scnprintf(buffer, size,
- "- Block I/O inactive.\n");
- return len;
- }
-
- len = scnprintf(buffer, size, "- Block I/O active.\n");
-
- len += toi_bio_chains_debug_info(buffer + len, size - len);
-
- len += scnprintf(buffer + len, size - len,
- "- Max outstanding reads %d. Max writes %d.\n",
- max_outstanding_reads, max_outstanding_writes);
-
- len += scnprintf(buffer + len, size - len,
- " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n",
- target_outstanding_io,
- PAGE_SIZE, (unsigned int) sizeof(struct request),
- (unsigned int) sizeof(struct bio), toi_bio_memory_needed());
-
-#ifdef MEASURE_MUTEX_CONTENTION
- {
- int i;
-
- len += scnprintf(buffer + len, size - len,
- " Mutex contention while reading:\n Contended Free\n");
-
- for_each_online_cpu(i)
- len += scnprintf(buffer + len, size - len,
- " %9lu %9lu\n",
- mutex_times[0][0][i], mutex_times[0][1][i]);
-
- len += scnprintf(buffer + len, size - len,
- " Mutex contention while writing:\n Contended Free\n");
-
- for_each_online_cpu(i)
- len += scnprintf(buffer + len, size - len,
- " %9lu %9lu\n",
- mutex_times[1][0][i], mutex_times[1][1][i]);
-
- }
-#endif
-
- return len + scnprintf(buffer + len, size - len,
- " Free mem throttle point reached %d.\n", free_mem_throttle);
-}
-
-static int total_header_bytes;
-static int unowned;
-
-void debug_broken_header(void)
-{
- printk(KERN_DEBUG "Image header too big for size allocated!\n");
- print_toi_header_storage_for_modules();
- printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed());
- printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header));
- printk(KERN_DEBUG "Total unowned : %d.\n", unowned);
- printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes,
- DIV_ROUND_UP(total_header_bytes, PAGE_SIZE));
- printk(KERN_DEBUG "Space needed now : %ld.\n",
- get_header_storage_needed());
- dump_block_chains();
- abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small.");
-}
-
-static int toi_bio_update_previous_inc_img_ptr(int stream)
-{
- int result;
-	char *buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP);
- struct page *page;
- struct toi_incremental_image_pointer *prev, *this;
-
- prev = &toi_inc_ptr[stream][0];
- this = &toi_inc_ptr[stream][1];
-
- if (!buffer) {
-		/*
-		 * We're at the start of writing a pageset; memory
-		 * should not be that scarce.
-		 */
- return -ENOMEM;
- }
-
- page = virt_to_page(buffer);
- result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0);
-
- if (result)
- goto out;
-
- memcpy(buffer, (char *) this, sizeof(this->save));
-
- result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12);
-
-	/*
-	 * If the I/O is successfully submitted (!result), the page will be
-	 * freed asynchronously on completion.
-	 */
-out:
- if (result)
- toi__free_page(12, virt_to_page(buffer));
- return result;
-}
-
-/**
- * toi_write_init_incremental - incremental image part of setting up to write a new section
- */
-static int toi_write_init_incremental(int stream)
-{
- int result = 0;
-
-	/* Remember the location of this block so we can link to it. */
-	toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]);
-
-	/*
-	 * Update the pointer at the start of the last pageset with the
-	 * same stream number.
-	 */
-	result = toi_bio_update_previous_inc_img_ptr(stream);
-	if (result)
-		return result;
-
-	/* Move the current to the previous slot. */
-	memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]));
-
-	/* Store a blank pointer at the start of this incremental pageset. */
-	memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1]));
-	result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0);
-	if (result)
-		return result;
-
-	/* Serialise extent chains if this is an incremental pageset. */
-	return toi_serialise_extent_chains();
-}
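-
-/*
- * On-disk sketch: each incremental pageset for a stream begins with a
- * toi_incremental_image_pointer, initially blank. When the next pageset
- * for the same stream is written, the blank pointer at the head of the
- * previous one is rewritten to link forward, giving (schematically):
- *
- *	[ptr -> B][pageset A data] ... [blank ptr][pageset B data]
- */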
-
-/**
- * toi_read_init_incremental - incremental image part of setting up to read new section
- */
-static int toi_read_init_incremental(int stream)
-{
- int result;
-
-	/* Set our position to the start of the next pageset. */
-	toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]);
-
-	/* Read the start of the next incremental pageset (if any). */
-	result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0);
-
- if (!result)
- result = toi_load_extent_chains();
-
- return result;
-}
-
-/**
- * toi_rw_init - prepare to read or write a stream in the image
- * @writing: Whether reading or writing.
- * @stream_number: Section of the image being processed.
- *
- * Prepare to read or write a section ('stream') in the image.
- **/
-static int toi_rw_init(int writing, int stream_number)
-{
- if (stream_number)
- toi_extent_state_restore(stream_number);
- else
- toi_extent_state_goto_start();
-
- if (writing) {
- reset_idx = 0;
- if (!current_stream)
- page_idx = 0;
- } else {
- reset_idx = 1;
- }
-
- atomic_set(&toi_io_done, 0);
- if (!toi_writer_buffer)
- toi_writer_buffer = (char *) toi_get_zeroed_page(11,
- TOI_ATOMIC_GFP);
- toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE;
-
- current_stream = stream_number;
-
- more_readahead = 1;
-
- if (test_result_state(TOI_KEPT_IMAGE)) {
- int result;
-
- if (writing) {
- result = toi_write_init_incremental(stream_number);
- } else {
- result = toi_read_init_incremental(stream_number);
- }
-
- if (result)
- return result;
- }
-
- return toi_writer_buffer ? 0 : -ENOMEM;
-}
-
-/**
- * toi_bio_queue_write - queue a page for writing
- * @full_buffer: Pointer to the buffer pointer for the page to be queued;
- * cleared once the page is on the queue.
- *
- * Add a page to the queue to be submitted. If we're the queue flusher,
- * we'll do this once we've dropped toi_bio_mutex, so other threads can
- * continue to submit I/O while we're on the slow path doing the actual
- * submission.
- **/
-static void toi_bio_queue_write(char **full_buffer)
-{
- struct page *page = virt_to_page(*full_buffer);
- unsigned long flags;
-
- *full_buffer = NULL;
- page->private = 0;
-
- spin_lock_irqsave(&bio_queue_lock, flags);
- if (!bio_queue_head)
- bio_queue_head = page;
- else
- bio_queue_tail->private = (unsigned long) page;
-
- bio_queue_tail = page;
- atomic_inc(&toi_bio_queue_size);
-
- spin_unlock_irqrestore(&bio_queue_lock, flags);
- wake_up(&toi_io_queue_flusher);
-}
-
-/**
- * toi_rw_cleanup - Cleanup after i/o.
- * @writing: Whether we were reading or writing.
- *
- * Flush all I/O and clean everything up after reading or writing a
- * section of the image.
- **/
-static int toi_rw_cleanup(int writing)
-{
- int i, result = 0;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup.");
- if (writing) {
- if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED))
- toi_bio_queue_write(&toi_writer_buffer);
-
- while (bio_queue_head && !result)
- result = toi_bio_queue_flush_pages(0);
-
- if (result)
- return result;
-
- if (current_stream == 2)
- toi_extent_state_save(1);
- else if (current_stream == 1)
- toi_extent_state_save(3);
- }
-
- result = toi_finish_all_io();
-
- while (readahead_list_head) {
- void *next = (void *) readahead_list_head->private;
- toi__free_page(12, readahead_list_head);
- readahead_list_head = next;
- }
-
- readahead_list_tail = NULL;
-
- if (!current_stream)
- return result;
-
- for (i = 0; i < NUM_REASONS; i++) {
- if (!atomic_read(&reasons[i]))
- continue;
- printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n",
- reason_name[i], atomic_read(&reasons[i]));
- atomic_set(&reasons[i], 0);
- }
-
- current_stream = 0;
- return result;
-}
-
-/**
- * toi_start_one_readahead - start one page of readahead
- * @dedicated_thread: Is this a thread dedicated to doing readahead?
- *
- * Start one new page of readahead. If this is being called by a thread
- * whose only job is to submit readahead, don't quit because we failed
- * to allocate a page.
- **/
-static int toi_start_one_readahead(int dedicated_thread)
-{
- char *buffer = NULL;
- int oom = 0, result;
-
- result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0);
- if (result) {
- printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result);
- return result;
- }
-
- mutex_lock(&toi_bio_readahead_mutex);
-
- while (!buffer) {
- buffer = (char *) toi_get_zeroed_page(12,
- TOI_ATOMIC_GFP);
- if (!buffer) {
- if (oom && !dedicated_thread) {
- mutex_unlock(&toi_bio_readahead_mutex);
- printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result);
- return -ENOMEM;
- }
-
- oom = 1;
- set_free_mem_throttle();
- do_bio_wait(5);
- }
- }
-
- result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0);
- if (result) {
- printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result);
- }
- if (result == -ENOSPC)
- toi__free_page(12, virt_to_page(buffer));
- mutex_unlock(&toi_bio_readahead_mutex);
- if (result) {
- if (result == -ENOSPC)
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Last readahead page submitted.");
- else
- printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n",
- result);
- }
- return result;
-}
-
-/**
- * toi_start_new_readahead - start new readahead
- * @dedicated_thread: Are we dedicated to this task?
- *
- * Start readahead of image pages.
- *
- * We can be called as a thread dedicated to this task (may be helpful on
- * systems with lots of CPUs), in which case we don't exit until there's no
- * more readahead.
- *
- * If this is not called by a dedicated thread, we top up our queue until
- * there's no more readahead to submit, until we've submitted the number
- * given in target_outstanding_io, or until the number in progress exceeds
- * the target outstanding I/O value.
- *
- * No mutex needed because this is only ever called by the first cpu.
- **/
-static int toi_start_new_readahead(int dedicated_thread)
-{
- int last_result, num_submitted = 0;
-
- /* Start a new readahead? */
- if (!more_readahead)
- return 0;
-
- do {
- last_result = toi_start_one_readahead(dedicated_thread);
-
- if (last_result) {
- if (last_result == -ENOMEM || last_result == -ENOSPC)
- return 0;
-
- printk(KERN_DEBUG
- "Begin read chunk returned %d.\n",
- last_result);
- } else
- num_submitted++;
-
- } while (more_readahead && !last_result &&
- (dedicated_thread ||
- (num_submitted < target_outstanding_io &&
- atomic_read(&toi_io_in_progress) < target_outstanding_io)));
-
- return last_result;
-}
-
-/**
- * bio_io_flusher - start the dedicated I/O flushing routine
- * @writing: Whether we're writing the image.
- **/
-static int bio_io_flusher(int writing)
-{
-
- if (writing)
- return toi_bio_queue_flush_pages(1);
- else
- return toi_start_new_readahead(1);
-}
-
-/**
- * toi_bio_get_next_page_read - read a disk page, perhaps with readahead
- * @no_readahead: Whether we can use readahead
- *
- * Read a page from disk, submitting readahead and cleaning up finished i/o
- * while we wait for the page we're after.
- **/
-static int toi_bio_get_next_page_read(int no_readahead)
-{
- char *virt;
- struct page *old_readahead_list_head;
-
- /*
- * When reading the second page of the header, we have to
- * delay submitting the read until after we've gotten the
- * extents out of the first page.
- */
- if (unlikely(no_readahead)) {
- int result = toi_start_one_readahead(0);
- if (result) {
- printk(KERN_EMERG "No readahead and toi_start_one_readahead "
- "returned non-zero.\n");
- return -EIO;
- }
- }
-
- if (unlikely(!readahead_list_head)) {
- /*
- * If the last page finishes exactly on the page
- * boundary, we will be called one extra time and
- * have no data to return. In this case, we should
- * not BUG(), like we used to!
- */
- if (!more_readahead) {
- printk(KERN_EMERG "No more readahead.\n");
- return -ENOSPC;
- }
- if (unlikely(toi_start_one_readahead(0))) {
- printk(KERN_EMERG "No readahead and "
- "toi_start_one_readahead returned non-zero.\n");
- return -EIO;
- }
- }
-
- if (PageLocked(readahead_list_head)) {
- waiting_on = readahead_list_head;
- do_bio_wait(0);
- }
-
- virt = page_address(readahead_list_head);
- memcpy(toi_writer_buffer, virt, PAGE_SIZE);
-
- mutex_lock(&toi_bio_readahead_mutex);
- old_readahead_list_head = readahead_list_head;
- readahead_list_head = (struct page *) readahead_list_head->private;
- mutex_unlock(&toi_bio_readahead_mutex);
- toi__free_page(12, old_readahead_list_head);
- return 0;
-}
-
-/**
- * toi_bio_queue_flush_pages - flush the queue of pages queued for writing
- * @dedicated_thread: Whether we're a dedicated thread
- *
- * Flush the queue of pages ready to be written to disk.
- *
- * If we're a dedicated thread, stay in here until told to leave,
- * sleeping in wait_event.
- *
- * The first thread is normally the only one to come in here. Another
- * thread can enter this routine too, though, via throttle_if_needed.
- * Since that's the case, we must be careful to only have one thread
- * doing this work at a time. Otherwise we have a race and could save
- * pages out of order.
- *
- * If an error occurs, free all remaining pages without submitting them
- * for I/O.
- **/
-
-int toi_bio_queue_flush_pages(int dedicated_thread)
-{
- unsigned long flags;
- int result = 0;
- static DEFINE_MUTEX(busy);
-
- if (!mutex_trylock(&busy))
- return 0;
-
-top:
- spin_lock_irqsave(&bio_queue_lock, flags);
- while (bio_queue_head) {
- struct page *page = bio_queue_head;
- bio_queue_head = (struct page *) page->private;
- if (bio_queue_tail == page)
- bio_queue_tail = NULL;
- atomic_dec(&toi_bio_queue_size);
- spin_unlock_irqrestore(&bio_queue_lock, flags);
-
- /* Don't generate more error messages if already had one */
- if (!result)
- result = toi_bio_rw_page(WRITE, page, 0, 11);
- /*
- * If writing the page failed, don't drop out.
- * Flush the rest of the queue too.
- */
- if (result)
-			toi__free_page(11, page);
- spin_lock_irqsave(&bio_queue_lock, flags);
- }
- spin_unlock_irqrestore(&bio_queue_lock, flags);
-
- if (dedicated_thread) {
- wait_event(toi_io_queue_flusher, bio_queue_head ||
- toi_bio_queue_flusher_should_finish);
- if (likely(!toi_bio_queue_flusher_should_finish))
- goto top;
- toi_bio_queue_flusher_should_finish = 0;
- }
-
- mutex_unlock(&busy);
- return result;
-}
-
-/**
- * toi_bio_get_new_page - get a new page for I/O
- * @full_buffer: Pointer to the buffer pointer to fill with a newly
- * allocated page.
- **/
-static int toi_bio_get_new_page(char **full_buffer)
-{
- int result = throttle_if_needed(THROTTLE_WAIT);
- if (result)
- return result;
-
- while (!*full_buffer) {
- *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
- if (!*full_buffer) {
- set_free_mem_throttle();
- do_bio_wait(3);
- }
- }
-
- return 0;
-}
-
-/**
- * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O
- * @writing: Bool - whether writing (or reading).
- * @buffer: The start of the buffer to write or fill.
- * @buffer_size: The size of the buffer to write or fill.
- * @no_readahead: Don't try to start readahead (when getting extents).
- **/
-static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
- int no_readahead)
-{
- int bytes_left = buffer_size, result = 0;
-
- while (bytes_left) {
- char *source_start = buffer + buffer_size - bytes_left;
- char *dest_start = toi_writer_buffer + toi_writer_buffer_posn;
- int capacity = PAGE_SIZE - toi_writer_buffer_posn;
- char *to = writing ? dest_start : source_start;
- char *from = writing ? source_start : dest_start;
-
- if (bytes_left <= capacity) {
- memcpy(to, from, bytes_left);
- toi_writer_buffer_posn += bytes_left;
- return 0;
- }
-
- /* Complete this page and start a new one */
- memcpy(to, from, capacity);
- bytes_left -= capacity;
-
- if (!writing) {
- /*
- * Perform actual I/O:
- * read readahead_list_head into toi_writer_buffer
- */
- int result = toi_bio_get_next_page_read(no_readahead);
- if (result && bytes_left) {
- printk("toi_bio_get_next_page_read "
- "returned %d. Expecting to read %d bytes.\n", result, bytes_left);
- return result;
- }
- } else {
- toi_bio_queue_write(&toi_writer_buffer);
- result = toi_bio_get_new_page(&toi_writer_buffer);
- if (result) {
- printk(KERN_ERR "toi_bio_get_new_page returned "
- "%d.\n", result);
- return result;
- }
- }
-
- toi_writer_buffer_posn = 0;
- toi_cond_pause(0, NULL);
- }
-
- return 0;
-}
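-
-/*
- * A typical (hypothetical) caller packs several small items into the
- * page-sized toi_writer_buffer before any I/O happens, e.g.:
- *
- *	toi_rw_buffer(WRITE, (char *) &index, sizeof(int), 0);
- *	toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0);
- *	toi_rw_buffer(WRITE, data, len, 0);
- *
- * Only once a full PAGE_SIZE buffer has accumulated is a page queued for
- * writing (or, when reading, a new page fetched).
- */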
-
-/**
- * toi_bio_read_page - read a page of the image
- * @pfn: Returns the pfn where the data belongs.
- * @buf_type: The buffer type, as understood by TOI_MAP/TOI_UNMAP.
- * @buffer_page: The page into which the (possibly compressed) data is read.
- * @buf_size: Returns the number of bytes of @buffer_page used.
- *
- * Read a (possibly compressed) page from the image into @buffer_page,
- * returning its pfn and the buffer size.
- **/
-static int toi_bio_read_page(unsigned long *pfn, int buf_type,
- void *buffer_page, unsigned int *buf_size)
-{
- int result = 0;
- int this_idx;
- char *buffer_virt = TOI_MAP(buf_type, buffer_page);
-
- /*
- * Only call start_new_readahead if we don't have a dedicated thread
- * and we're the queue flusher.
- */
- if (current == toi_queue_flusher && more_readahead &&
- !test_action_state(TOI_NO_READAHEAD)) {
- int result2 = toi_start_new_readahead(0);
- if (result2) {
- printk(KERN_DEBUG "Queue flusher and "
- "toi_start_one_readahead returned non-zero.\n");
- result = -EIO;
- goto out;
- }
- }
-
- my_mutex_lock(0, &toi_bio_mutex);
-
- /*
- * Structure in the image:
- * [destination pfn|page size|page data]
- * buf_size is PAGE_SIZE
- * We can validly find there's nothing to read in a multithreaded
- * situation.
- */
- if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) ||
- toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) ||
- toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) ||
- toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) {
- result = -ENODATA;
- goto out_unlock;
- }
-
- if (reset_idx) {
- page_idx = this_idx;
- reset_idx = 0;
- } else {
- page_idx++;
- if (!this_idx)
- result = -ENODATA;
- else if (page_idx != this_idx)
- printk(KERN_ERR "Got page index %d, expected %d.\n",
- this_idx, page_idx);
- }
-
-out_unlock:
- my_mutex_unlock(0, &toi_bio_mutex);
-out:
- TOI_UNMAP(buf_type, buffer_page);
- return result;
-}
-
-/**
- * toi_bio_write_page - write a page of the image
- * @pfn: The pfn where the data belongs.
- * @buf_type: The buffer type, as understood by TOI_MAP/TOI_UNMAP.
- * @buffer_page: The page containing the (possibly compressed) data.
- * @buf_size: The number of bytes of @buffer_page used.
- *
- * Write a (possibly compressed) page to the image from the buffer, together
- * with its index and buffer size.
- **/
-static int toi_bio_write_page(unsigned long pfn, int buf_type,
- void *buffer_page, unsigned int buf_size)
-{
- char *buffer_virt;
- int result = 0, result2 = 0;
-
- if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED)))
- return 0;
-
- my_mutex_lock(1, &toi_bio_mutex);
-
- if (test_result_state(TOI_ABORTED)) {
- my_mutex_unlock(1, &toi_bio_mutex);
- return 0;
- }
-
- buffer_virt = TOI_MAP(buf_type, buffer_page);
- page_idx++;
-
- /*
- * Structure in the image:
- * [destination pfn|page size|page data]
- * buf_size is PAGE_SIZE
- */
- if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) ||
- toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) ||
- toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) ||
- toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) {
- printk(KERN_DEBUG "toi_rw_buffer returned non-zero to "
- "toi_bio_write_page.\n");
- result = -EIO;
- }
-
- TOI_UNMAP(buf_type, buffer_page);
- my_mutex_unlock(1, &toi_bio_mutex);
-
- if (current == toi_queue_flusher)
- result2 = toi_bio_queue_flush_pages(0);
-
- return result ? result : result2;
-}
-
-/**
- * _toi_rw_header_chunk - read or write a portion of the image header
- * @writing: Whether reading or writing.
- * @owner: The module for which we're writing.
- * Used for confirming that modules
- * don't use more header space than they asked for.
- * @buffer: Address of the data to write.
- * @buffer_size: Size of the data buffer.
- * @no_readahead: Don't try to start readahead (when getting extents).
- *
- * Perform PAGE_SIZE I/O. Start readahead if needed.
- **/
-static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
- char *buffer, int buffer_size, int no_readahead)
-{
- int result = 0;
-
- if (owner) {
- owner->header_used += buffer_size;
- toi_message(TOI_HEADER, TOI_LOW, 1,
- "Header: %s : %d bytes (%d/%d) from offset %d.",
- owner->name,
- buffer_size, owner->header_used,
- owner->header_requested,
- toi_writer_buffer_posn);
- if (owner->header_used > owner->header_requested && writing) {
- printk(KERN_EMERG "TuxOnIce module %s is using more "
- "header space (%u) than it requested (%u).\n",
- owner->name,
- owner->header_used,
- owner->header_requested);
- return buffer_size;
- }
- } else {
- unowned += buffer_size;
- toi_message(TOI_HEADER, TOI_LOW, 1,
- "Header: (No owner): %d bytes (%d total so far) from "
- "offset %d.", buffer_size, unowned,
- toi_writer_buffer_posn);
- }
-
- if (!writing && !no_readahead && more_readahead) {
- result = toi_start_new_readahead(0);
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead "
- "returned %d.", result);
- }
-
- if (!result) {
- result = toi_rw_buffer(writing, buffer, buffer_size,
- no_readahead);
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned "
- "%d.", result);
- }
-
- total_header_bytes += buffer_size;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning "
- "%d.", result);
- return result;
-}
-
-static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
- char *buffer, int size)
-{
-	return _toi_rw_header_chunk(writing, owner, buffer, size, 0);
-}
-
-static int toi_rw_header_chunk_noreadahead(int writing,
- struct toi_module_ops *owner, char *buffer, int size)
-{
- return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
-}
-
-/**
- * toi_bio_storage_needed - get the amount of storage needed for my fns
- **/
-static int toi_bio_storage_needed(void)
-{
- return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed();
-}
-
-/**
- * toi_bio_save_config_info - save block I/O config to image header
- * @buf: PAGE_SIZE'd buffer into which data should be saved.
- **/
-static int toi_bio_save_config_info(char *buf)
-{
- int *ints = (int *) buf;
- ints[0] = target_outstanding_io;
- return sizeof(int);
-}
-
-/**
- * toi_bio_load_config_info - restore block I/O config
- * @buf: Data to be reloaded.
- * @size: Size of the buffer saved.
- **/
-static void toi_bio_load_config_info(char *buf, int size)
-{
- int *ints = (int *) buf;
- target_outstanding_io = ints[0];
-}
-
-void close_resume_dev_t(int force)
-{
- if (!resume_block_device)
- return;
-
- if (force)
- atomic_set(&resume_bdev_open_count, 0);
- else
- atomic_dec(&resume_bdev_open_count);
-
- if (!atomic_read(&resume_bdev_open_count)) {
- toi_close_bdev(resume_block_device);
- resume_block_device = NULL;
- }
-}
-
-int open_resume_dev_t(int force, int quiet)
-{
- if (force) {
- close_resume_dev_t(1);
- atomic_set(&resume_bdev_open_count, 1);
- } else
- atomic_inc(&resume_bdev_open_count);
-
- if (resume_block_device)
- return 0;
-
- resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0);
- if (IS_ERR(resume_block_device)) {
- if (!quiet)
- toi_early_boot_message(1, TOI_CONTINUE_REQ,
- "Failed to open device %x, where"
- " the header should be found.",
- resume_dev_t);
- resume_block_device = NULL;
- atomic_set(&resume_bdev_open_count, 0);
- return 1;
- }
-
- return 0;
-}
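-
-/*
- * Usage sketch: callers normally pair open_resume_dev_t(0, quiet) with a
- * later close_resume_dev_t(0); resume_bdev_open_count keeps the device
- * open across nested users. force == 1 resets the count and opens or
- * closes unconditionally.
- */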
-
-/**
- * toi_bio_initialise - initialise bio code at start of some action
- * @starting_cycle: Whether starting a hibernation cycle, or just reading or
- * writing a sysfs value.
- **/
-static int toi_bio_initialise(int starting_cycle)
-{
- int result;
-
- if (!starting_cycle || !resume_dev_t)
- return 0;
-
- max_outstanding_writes = 0;
- max_outstanding_reads = 0;
- current_stream = 0;
- toi_queue_flusher = current;
-#ifdef MEASURE_MUTEX_CONTENTION
- {
- int i, j, k;
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- for_each_online_cpu(k)
- mutex_times[i][j][k] = 0;
- }
-#endif
- result = open_resume_dev_t(0, 1);
-
- if (result)
- return result;
-
- return get_signature_page();
-}
-
-static unsigned long raw_to_real(unsigned long raw)
-{
- unsigned long extra;
-
- extra = (raw * (sizeof(unsigned long) + sizeof(int)) +
- (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) /
- (PAGE_SIZE + sizeof(unsigned long) + sizeof(int));
-
- return raw > extra ? raw - extra : 0;
-}
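-
-/*
- * Worked example (assuming 4096-byte pages, 8-byte longs and 4-byte
- * ints): each stored page costs 12 bytes of index/pfn metadata, so for
- * raw = 10000 pages, extra = (10000 * 12 + 4109) / 4108 = 30 pages of
- * overhead, leaving 9970 pages of real storage.
- */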
-
-static unsigned long toi_bio_storage_available(void)
-{
- unsigned long sum = 0;
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- this_module->type != BIO_ALLOCATOR_MODULE)
- continue;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage "
- "available from %s.", this_module->name);
- sum += this_module->bio_allocator_ops->storage_available();
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu "
- "pages (%d header pages).", sum, header_pages_reserved);
-
- return sum > header_pages_reserved ?
- raw_to_real(sum - header_pages_reserved) : 0;
-}
-
-static unsigned long toi_bio_storage_allocated(void)
-{
- return raw_pages_allocd > header_pages_reserved ?
- raw_to_real(raw_pages_allocd - header_pages_reserved) : 0;
-}
-
-/*
- * If we have read part of the image, we might have filled memory with
- * data that should be zeroed out.
- */
-static void toi_bio_noresume_reset(void)
-{
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset.");
- toi_rw_cleanup(READ);
- free_all_bdev_info();
-}
-
-/**
- * toi_bio_cleanup - cleanup after some action
- * @finishing_cycle: Whether completing a cycle.
- **/
-static void toi_bio_cleanup(int finishing_cycle)
-{
- if (!finishing_cycle)
- return;
-
- if (toi_writer_buffer) {
- toi_free_page(11, (unsigned long) toi_writer_buffer);
- toi_writer_buffer = NULL;
- }
-
- forget_signature_page();
-
- if (header_block_device && toi_sig_data &&
- toi_sig_data->header_dev_t != resume_dev_t)
- toi_close_bdev(header_block_device);
-
- header_block_device = NULL;
-
- close_resume_dev_t(0);
-}
-
-static int toi_bio_write_header_init(void)
-{
- int result;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init");
- toi_rw_init(WRITE, 0);
- toi_writer_buffer_posn = 0;
-
- /* Info needed to bootstrap goes at the start of the header.
- * First we save the positions and devinfo, including the number
- * of header pages. Then we save the structs containing data needed
- * for reading the header pages back.
- * Note that even if header pages take more than one page, when we
- * read back the info, we will have restored the location of the
- * next header page by the time we go to use it.
- */
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains.");
- result = toi_serialise_extent_chains();
-
- if (result)
- return result;
-
- /*
- * Signature page hasn't been modified at this point. Write it in
- * the header so we can restore it later.
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page.");
- return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops,
- (char *) toi_cur_sig_page,
- PAGE_SIZE);
-}
-
-static int toi_bio_write_header_cleanup(void)
-{
- int result = 0;
-
- if (toi_writer_buffer_posn)
- toi_bio_queue_write(&toi_writer_buffer);
-
- result = toi_finish_all_io();
-
- unowned = 0;
- total_header_bytes = 0;
-
-	/* Set the signature to say we have an image */
- if (!result)
- result = toi_bio_mark_have_image();
-
- return result;
-}
-
-/*
- * toi_bio_read_header_init()
- *
- * Description:
- * 1. Attempt to read the device specified with resume=.
- * 2. Check the contents of the swap header for our signature.
- * 3. Warn, ignore, reset and/or continue as appropriate.
- * 4. If continuing, read the toi_swap configuration section
- * of the header and set up block device info so we can read
- * the rest of the header & image.
- *
- * Returns:
- * May not return if the user chooses to reboot at a warning.
- * -EINVAL if cannot resume at this time. Booting should continue
- * normally.
- */
-
-static int toi_bio_read_header_init(void)
-{
- int result = 0;
- char buf[32];
-
- toi_writer_buffer_posn = 0;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init");
-
- if (!toi_sig_data) {
- printk(KERN_INFO "toi_bio_read_header_init called when we "
- "haven't verified there is an image!\n");
- return -EINVAL;
- }
-
- /*
- * If the header is not on the resume_swap_dev_t, get the resume device
- * first.
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.",
- toi_sig_data->header_dev_t);
- if (toi_sig_data->have_uuid) {
- struct fs_info seek;
- dev_t device;
-
- strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16);
- seek.dev_t = toi_sig_data->header_dev_t;
- seek.last_mount_size = 0;
- device = blk_lookup_fs_info(&seek);
- if (device) {
- printk("Using dev_t %s, returned by blk_lookup_fs_info.\n",
- format_dev_t(buf, device));
- toi_sig_data->header_dev_t = device;
- }
- }
- if (toi_sig_data->header_dev_t != resume_dev_t) {
- header_block_device = toi_open_bdev(NULL,
- toi_sig_data->header_dev_t, 1);
-
- if (IS_ERR(header_block_device))
- return PTR_ERR(header_block_device);
- } else
- header_block_device = resume_block_device;
-
- if (!toi_writer_buffer)
- toi_writer_buffer = (char *) toi_get_zeroed_page(11,
- TOI_ATOMIC_GFP);
- more_readahead = 1;
-
- /*
- * Read toi_swap configuration.
- * Headerblock size taken into account already.
- */
- result = toi_bio_ops.bdev_page_io(READ, header_block_device,
- toi_sig_data->first_header_block,
- virt_to_page((unsigned long) toi_writer_buffer));
- if (result)
- return result;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains.");
-	result = toi_load_extent_chains();
-	if (result)
-		return result;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page.");
- toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
- if (!toi_orig_sig_page) {
- printk(KERN_ERR "Failed to allocate memory for the current"
- " image signature.\n");
- return -ENOMEM;
- }
-
- return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops,
- (char *) toi_orig_sig_page,
- PAGE_SIZE);
-}
-
-static int toi_bio_read_header_cleanup(void)
-{
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup.");
- return toi_rw_cleanup(READ);
-}
-
-/* Works only for digits and letters, but small and fast */
-#define TOLOWER(x) ((x) | 0x20)
-
-/*
- * UUID must be 32 chars long. It may have dashes, but nothing
- * else.
- */
-char *uuid_from_commandline(char *commandline)
-{
- int low = 0;
- char *result = NULL, *output, *ptr;
-
- if (strncmp(commandline, "UUID=", 5))
- return NULL;
-
- result = kzalloc(17, GFP_KERNEL);
- if (!result) {
- printk("Failed to kzalloc UUID text memory.\n");
- return NULL;
- }
-
- ptr = commandline + 5;
- output = result;
-
- while (*ptr && (output - result) < 16) {
- if (isxdigit(*ptr)) {
- int value = isdigit(*ptr) ? *ptr - '0' :
- TOLOWER(*ptr) - 'a' + 10;
- if (low) {
- *output += value;
- output++;
- } else {
- *output = value << 4;
- }
- low = !low;
- } else if (*ptr != '-')
- break;
- ptr++;
- }
-
- if ((output - result) < 16 || *ptr) {
- printk(KERN_DEBUG "Found resume=UUID=, but the value looks "
- "invalid.\n");
- kfree(result);
- result = NULL;
- }
-
- return result;
-}
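-
-/*
- * Example (hypothetical value): "UUID=01234567-89ab-cdef-0123-456789abcdef"
- * parses to the sixteen bytes 0x01 0x23 ... 0xef. Dashes are skipped;
- * any other non-hex character, or fewer than 32 hex digits, makes us
- * return NULL.
- */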
-
-#define retry_if_fails(command) \
-do { \
- command; \
- if (!resume_dev_t && !waited_for_device_probe) { \
- wait_for_device_probe(); \
- command; \
- waited_for_device_probe = 1; \
- } \
-} while(0)
-
-/**
- * try_to_open_resume_device: Try to parse and open resume=
- *
- * Any "swap:" has been stripped away and we just have the path to deal with.
- * We attempt to do name_to_dev_t, open and stat the file. Having opened the
- * file, get the struct block_device * to match.
- */
-static int try_to_open_resume_device(char *commandline, int quiet)
-{
- struct kstat stat;
- int error = 0;
- char *uuid = uuid_from_commandline(commandline);
- int waited_for_device_probe = 0;
-
- resume_dev_t = MKDEV(0, 0);
-
- if (!strlen(commandline))
- retry_if_fails(toi_bio_scan_for_image(quiet));
-
- if (uuid) {
- struct fs_info seek;
- strncpy((char *) &seek.uuid, uuid, 16);
- seek.dev_t = resume_dev_t;
- seek.last_mount_size = 0;
- retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek));
- kfree(uuid);
- }
-
- if (!resume_dev_t)
- retry_if_fails(resume_dev_t = name_to_dev_t(commandline));
-
- if (!resume_dev_t) {
- struct file *file = filp_open(commandline,
- O_RDONLY|O_LARGEFILE, 0);
-
- if (!IS_ERR(file) && file) {
- vfs_getattr(&file->f_path, &stat);
- filp_close(file, NULL);
- } else
- error = vfs_stat(commandline, &stat);
- if (!error)
- resume_dev_t = stat.rdev;
- }
-
- if (!resume_dev_t) {
- if (quiet)
- return 1;
-
- if (test_toi_state(TOI_TRYING_TO_RESUME))
- toi_early_boot_message(1, toi_translate_err_default,
- "Failed to translate \"%s\" into a device id.\n",
- commandline);
- else
- printk("TuxOnIce: Can't translate \"%s\" into a device "
- "id yet.\n", commandline);
- return 1;
- }
-
- return open_resume_dev_t(1, quiet);
-}
-
-/*
- * Parse Image Location
- *
- * Attempt to parse a resume= parameter.
- * Swap Writer accepts:
- * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
- *
- * Where:
- * DEVNAME is convertible to a dev_t by name_to_dev_t
- * FIRSTBLOCK is the location of the first block in the swap file
- * (specifying one for a swap partition is nonsensical but not prohibited).
- * Data is validated by attempting to read a swap header from the
- * location given. Failure will result in toi_swap refusing to
- * save an image, and a reboot with correct parameters will be
- * necessary.
- */
-static int toi_bio_parse_sig_location(char *commandline,
- int only_allocator, int quiet)
-{
- char *thischar, *devstart, *colon = NULL;
- int signature_found, result = -EINVAL, temp_result = 0;
-
- if (strncmp(commandline, "swap:", 5) &&
- strncmp(commandline, "file:", 5)) {
- /*
- * Failing swap:, we'll take a simple resume=/dev/hda2, or a
- * blank value (scan) but fall through to other allocators
- * if /dev/ or UUID= isn't matched.
- */
- if (strncmp(commandline, "/dev/", 5) &&
- strncmp(commandline, "UUID=", 5) &&
- strlen(commandline))
- return 1;
- } else
- commandline += 5;
-
- devstart = commandline;
- thischar = commandline;
- while ((*thischar != ':') && (*thischar != '@') &&
- ((thischar - commandline) < 250) && (*thischar))
- thischar++;
-
- if (*thischar == ':') {
- colon = thischar;
- *colon = 0;
- thischar++;
- }
-
- while ((thischar - commandline) < 250 && *thischar)
- thischar++;
-
- if (colon) {
- unsigned long block;
- temp_result = kstrtoul(colon + 1, 0, &block);
- if (!temp_result)
- resume_firstblock = (int) block;
- } else
- resume_firstblock = 0;
-
- clear_toi_state(TOI_CAN_HIBERNATE);
- clear_toi_state(TOI_CAN_RESUME);
-
- if (!temp_result)
- temp_result = try_to_open_resume_device(devstart, quiet);
-
- if (colon)
- *colon = ':';
-
- /* No error if we only scanned */
- if (temp_result)
- return strlen(commandline) ? -EINVAL : 1;
-
- signature_found = toi_bio_image_exists(quiet);
-
- if (signature_found != -1) {
- result = 0;
- /*
- * TODO: If only file storage, CAN_HIBERNATE should only be
- * set if file allocator's target is valid.
- */
- set_toi_state(TOI_CAN_HIBERNATE);
- set_toi_state(TOI_CAN_RESUME);
- } else
- if (!quiet)
- printk(KERN_ERR "TuxOnIce: Block I/O: No "
- "signature found at %s.\n", devstart);
-
- return result;
-}
-
-static void toi_bio_release_storage(void)
-{
- header_pages_reserved = 0;
- raw_pages_allocd = 0;
-
- free_all_bdev_info();
-}
-
-/*
- * toi_bio_remove_image - restore the original signature and release storage
- */
-static int toi_bio_remove_image(void)
-{
- int result;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image.");
-
- result = toi_bio_restore_original_signature();
-
-	/*
-	 * We don't do a sanity check here: we want to restore the swap
-	 * signature whatever version of the kernel made the hibernate image.
-	 *
-	 * We need to write swap, but swap may not be enabled, so
-	 * we write the device directly.
-	 *
-	 * If we don't have a current signature page, we didn't
-	 * read an image header, so don't change anything.
-	 */
-
- toi_bio_release_storage();
-
- return result;
-}
-
-struct toi_bio_ops toi_bio_ops = {
- .bdev_page_io = toi_bdev_page_io,
- .register_storage = toi_register_storage_chain,
- .free_storage = toi_bio_release_storage,
-};
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io,
- 0, 16384, 0, NULL),
-};
-
-struct toi_module_ops toi_blockwriter_ops = {
- .type = WRITER_MODULE,
- .name = "block i/o",
- .directory = "block_io",
- .module = THIS_MODULE,
- .memory_needed = toi_bio_memory_needed,
- .print_debug_info = toi_bio_print_debug_stats,
- .storage_needed = toi_bio_storage_needed,
- .save_config_info = toi_bio_save_config_info,
- .load_config_info = toi_bio_load_config_info,
- .initialise = toi_bio_initialise,
- .cleanup = toi_bio_cleanup,
- .post_atomic_restore = toi_bio_chains_post_atomic,
-
- .rw_init = toi_rw_init,
- .rw_cleanup = toi_rw_cleanup,
- .read_page = toi_bio_read_page,
- .write_page = toi_bio_write_page,
- .rw_header_chunk = toi_rw_header_chunk,
- .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead,
- .io_flusher = bio_io_flusher,
- .update_throughput_throttle = update_throughput_throttle,
- .finish_all_io = toi_finish_all_io,
-
- .noresume_reset = toi_bio_noresume_reset,
- .storage_available = toi_bio_storage_available,
- .storage_allocated = toi_bio_storage_allocated,
- .reserve_header_space = toi_bio_reserve_header_space,
- .allocate_storage = toi_bio_allocate_storage,
- .free_unused_storage = toi_bio_free_unused_storage,
- .image_exists = toi_bio_image_exists,
- .mark_resume_attempted = toi_bio_mark_resume_attempted,
- .write_header_init = toi_bio_write_header_init,
- .write_header_cleanup = toi_bio_write_header_cleanup,
- .read_header_init = toi_bio_read_header_init,
- .read_header_cleanup = toi_bio_read_header_cleanup,
- .get_header_version = toi_bio_get_header_version,
- .remove_image = toi_bio_remove_image,
- .parse_sig_location = toi_bio_parse_sig_location,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/**
- * toi_block_io_load - load time routine for block I/O module
- *
- * Register block i/o ops and sysfs entries.
- **/
-static __init int toi_block_io_load(void)
-{
- return toi_register_module(&toi_blockwriter_ops);
-}
-
-late_initcall(toi_block_io_load);
diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h
deleted file mode 100644
index 5e1964a61..000000000
--- a/kernel/power/tuxonice_bio_internal.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_internal.h
- *
- * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file contains declarations for functions exported from
- * tuxonice_bio.c, which contains low level io functions.
- */
-
-/* Extent chains */
-void toi_extent_state_goto_start(void);
-void toi_extent_state_save(int slot);
-int go_next_page(int writing, int section_barrier);
-void toi_extent_state_restore(int slot);
-void free_all_bdev_info(void);
-int devices_of_same_priority(struct toi_bdev_info *this);
-int toi_register_storage_chain(struct toi_bdev_info *new);
-int toi_serialise_extent_chains(void);
-int toi_load_extent_chains(void);
-int toi_bio_rw_page(int writing, struct page *page, int is_readahead,
- int free_group);
-int toi_bio_restore_original_signature(void);
-int toi_bio_devinfo_storage_needed(void);
-unsigned long get_headerblock(void);
-dev_t get_header_dev_t(void);
-struct block_device *get_header_bdev(void);
-int toi_bio_allocate_storage(unsigned long request);
-void toi_bio_free_unused_storage(void);
-
-/* Signature functions */
-#define HaveImage "HaveImage"
-#define NoImage "TuxOnIce"
-#define sig_size (sizeof(HaveImage))
-
-struct sig_data {
- char sig[sig_size];
- int have_image;
- int resumed_before;
-
- char have_uuid;
- char header_uuid[17];
- dev_t header_dev_t;
- unsigned long first_header_block;
-
- /* Repeat the signature to be sure we have a header version */
- char sig2[sig_size];
- int header_version;
-};
-
-void forget_signature_page(void);
-int toi_check_for_signature(void);
-int toi_bio_image_exists(int quiet);
-int get_signature_page(void);
-int toi_bio_mark_resume_attempted(int);
-extern char *toi_cur_sig_page;
-extern char *toi_orig_sig_page;
-int toi_bio_mark_have_image(void);
-extern struct sig_data *toi_sig_data;
-extern dev_t resume_dev_t;
-extern struct block_device *resume_block_device;
-extern struct block_device *header_block_device;
-extern unsigned long resume_firstblock;
-
-struct block_device *open_bdev(char *uuid, dev_t default_device,
-		int display_errs);
-extern int current_stream;
-extern int more_readahead;
-int toi_do_io(int writing, struct block_device *bdev, long block0,
- struct page *page, int is_readahead, int syncio, int free_group);
-int get_main_pool_phys_params(void);
-
-void toi_close_bdev(struct block_device *bdev);
-struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
- int display_errs);
-
-extern struct toi_module_ops toi_blockwriter_ops;
-void dump_block_chains(void);
-void debug_broken_header(void);
-extern unsigned long raw_pages_allocd, header_pages_reserved;
-int toi_bio_chains_debug_info(char *buffer, int size);
-void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd);
-int toi_bio_scan_for_image(int quiet);
-int toi_bio_get_header_version(void);
-
-void close_resume_dev_t(int force);
-int open_resume_dev_t(int force, int quiet);
-
-struct toi_incremental_image_pointer_saved_data {
- unsigned long block;
- int chain;
-};
-
-struct toi_incremental_image_pointer {
- struct toi_incremental_image_pointer_saved_data save;
- struct block_device *bdev;
- unsigned long block;
-};
-
-void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
-void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c
deleted file mode 100644
index f5418f092..000000000
--- a/kernel/power/tuxonice_bio_signature.c
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_signature.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- */
-
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_io.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_bio_internal.h"
-
-struct sig_data *toi_sig_data;
-
-/* Struct of swap header pages */
-
-struct old_sig_data {
- dev_t device;
- unsigned long sector;
- int resume_attempted;
- int orig_sig_type;
-};
-
-union diskpage {
- union swap_header swh; /* swh.magic is the only member used */
- struct sig_data sig_data;
- struct old_sig_data old_sig_data;
-};
-
-union p_diskpage {
- union diskpage *pointer;
- char *ptr;
- unsigned long address;
-};
-
-char *toi_cur_sig_page;
-char *toi_orig_sig_page;
-int have_image;
-int have_old_image;
-
-int get_signature_page(void)
-{
- if (!toi_cur_sig_page) {
- toi_message(TOI_IO, TOI_VERBOSE, 0,
- "Allocating current signature page.");
- toi_cur_sig_page = (char *) toi_get_zeroed_page(38,
- TOI_ATOMIC_GFP);
- if (!toi_cur_sig_page) {
- printk(KERN_ERR "Failed to allocate memory for the "
- "current image signature.\n");
- return -ENOMEM;
- }
-
- toi_sig_data = (struct sig_data *) toi_cur_sig_page;
- }
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx,"
- " sector %d.",
- resume_block_device->bd_dev, resume_firstblock);
-
- return toi_bio_ops.bdev_page_io(READ, resume_block_device,
- resume_firstblock, virt_to_page(toi_cur_sig_page));
-}
-
-void forget_signature_page(void)
-{
- if (toi_cur_sig_page) {
- toi_sig_data = NULL;
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page"
- " (%p).", toi_cur_sig_page);
- toi_free_page(38, (unsigned long) toi_cur_sig_page);
- toi_cur_sig_page = NULL;
- }
-
- if (toi_orig_sig_page) {
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page"
- " (%p).", toi_orig_sig_page);
- toi_free_page(38, (unsigned long) toi_orig_sig_page);
- toi_orig_sig_page = NULL;
- }
-}
-
-/*
- * We need to ensure we use the signature page that's currently on disk,
- * so as to not remove the image header. Post-atomic-restore, the orig sig
- * page will be empty, so we can use that as our method of knowing that we
- * need to load the on-disk signature and not use the non-image sig in
- * memory. (We're going to powerdown after writing the change, so it's safe.
- */
-int toi_bio_mark_resume_attempted(int flag)
-{
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.",
- flag);
- if (!toi_orig_sig_page) {
- forget_signature_page();
- get_signature_page();
- }
- toi_sig_data->resumed_before = flag;
- return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
- resume_firstblock, virt_to_page(toi_cur_sig_page));
-}
-
-int toi_bio_mark_have_image(void)
-{
- int result = 0;
- char buf[32];
- struct fs_info *fs_info;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists.");
- memcpy(toi_sig_data->sig, tuxonice_signature,
- sizeof(tuxonice_signature));
- toi_sig_data->have_image = 1;
- toi_sig_data->resumed_before = 0;
- toi_sig_data->header_dev_t = get_header_dev_t();
- toi_sig_data->have_uuid = 0;
-
- fs_info = fs_info_from_block_dev(get_header_bdev());
- if (fs_info && !IS_ERR(fs_info)) {
- memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16);
- free_fs_info(fs_info);
-	} else
-		/* PTR_ERR(NULL) is 0, so treat a NULL fs_info as an error too. */
-		result = fs_info ? (int) PTR_ERR(fs_info) : -ENOENT;
-
- if (!result) {
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.",
- format_dev_t(buf, get_header_dev_t()));
- toi_sig_data->have_uuid = 1;
- } else
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for "
- "dev_t %s.",
- format_dev_t(buf, get_header_dev_t()));
-
- toi_sig_data->first_header_block = get_headerblock();
- have_image = 1;
- toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block "
- "is %d.", toi_sig_data->header_dev_t,
- toi_sig_data->first_header_block);
-
- memcpy(toi_sig_data->sig2, tuxonice_signature,
- sizeof(tuxonice_signature));
- toi_sig_data->header_version = TOI_HEADER_VERSION;
-
- return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
- resume_firstblock, virt_to_page(toi_cur_sig_page));
-}
-
-int remove_old_signature(void)
-{
- union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page;
- char *orig_sig;
- char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
- int result;
- struct block_device *header_bdev;
- struct old_sig_data *old_sig_data =
- &swap_header_page.pointer->old_sig_data;
-
- header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1);
- result = toi_bio_ops.bdev_page_io(READ, header_bdev,
- old_sig_data->sector, virt_to_page(header_start));
-
- if (result)
- goto out;
-
- /*
- * TODO: Get the original contents of the first bytes of the swap
- * header page.
- */
- if (!old_sig_data->orig_sig_type)
- orig_sig = "SWAP-SPACE";
- else
- orig_sig = "SWAPSPACE2";
-
- memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10);
- memcpy(swap_header_page.ptr, header_start, 10);
-
- result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
- resume_firstblock, virt_to_page(swap_header_page.ptr));
-
-out:
- toi_close_bdev(header_bdev);
- have_old_image = 0;
- toi_free_page(38, (unsigned long) header_start);
- return result;
-}
-
-/*
- * toi_bio_restore_original_signature - restore the original signature
- *
- * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used.
- * It will have the original signature page contents, stored in the image
- * header. Post atomic-restore, we use toi_cur_sig_page, which will contain
- * the contents that were loaded when we started the cycle.
- */
-int toi_bio_restore_original_signature(void)
-{
- char *use = toi_orig_sig_page ? toi_orig_sig_page : toi_cur_sig_page;
-
- if (have_old_image)
- return remove_old_signature();
-
- if (!use) {
- printk("toi_bio_restore_original_signature: No signature "
- "page loaded.\n");
- return 0;
- }
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists.");
- have_image = 0;
- toi_sig_data->have_image = 0;
- return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
- resume_firstblock, virt_to_page(use));
-}
-
-/*
- * toi_check_for_signature - see whether we have an image
- *
- * Returns 0 if there is no image, 1 if there is one, 2 for a swsusp or
- * uswsusp image, 3 for an old TuxOnIce implementation's image, and -1 if
- * the result is indeterminate.
- */
-int toi_check_for_signature(void)
-{
- union p_diskpage swap_header_page;
- int type;
- const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" };
- const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" };
- char *swap_header;
-
- if (!toi_cur_sig_page) {
- int result = get_signature_page();
-
- if (result)
- return result;
- }
-
- /*
- * Start by looking for the binary header.
- */
- if (!memcmp(tuxonice_signature, toi_cur_sig_page,
- sizeof(tuxonice_signature))) {
- have_image = toi_sig_data->have_image;
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. "
- "Have image is %d.", have_image);
- if (have_image)
- toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is "
- "%x. First block is %d.",
- toi_sig_data->header_dev_t,
- toi_sig_data->first_header_block);
- return toi_sig_data->have_image;
- }
-
- /*
- * Failing that, try old file allocator headers.
- */
-
- if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) {
- have_image = 1;
- return 1;
- }
-
- have_image = 0;
-
- if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage)))
- return 0;
-
- /*
- * Nope? How about swap?
- */
- swap_header_page = (union p_diskpage) toi_cur_sig_page;
- swap_header = swap_header_page.pointer->swh.magic.magic;
-
- /* Normal swapspace? */
- for (type = 0; type < 2; type++)
- if (!memcmp(normal_sigs[type], swap_header,
- strlen(normal_sigs[type])))
- return 0;
-
- /* Swsusp or uswsusp? */
- for (type = 0; type < 3; type++)
- if (!memcmp(swsusp_sigs[type], swap_header,
- strlen(swsusp_sigs[type])))
- return 2;
-
- /* Old TuxOnIce version? */
- if (!memcmp(tuxonice_signature, swap_header,
- sizeof(tuxonice_signature) - 1)) {
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce "
- "signature.");
- have_old_image = 1;
- return 3;
- }
-
- return -1;
-}
-
-/*
- * toi_bio_image_exists
- *
- * Returns -1 if we don't know, 0 (no image), 1 (TuxOnIce image), 2
- * (swsusp/uswsusp image) or 3 (old implementation's image), matching
- * toi_check_for_signature().
- */
-int toi_bio_image_exists(int quiet)
-{
- int result;
- char *msg = NULL;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists.");
-
- if (!resume_dev_t) {
- if (!quiet)
- printk(KERN_INFO "Not even trying to read header "
- "because resume_dev_t is not set.\n");
- return -1;
- }
-
- if (open_resume_dev_t(0, quiet))
- return -1;
-
- result = toi_check_for_signature();
-
- clear_toi_state(TOI_RESUMED_BEFORE);
- if (toi_sig_data->resumed_before)
- set_toi_state(TOI_RESUMED_BEFORE);
-
- if (quiet || result == -ENOMEM)
- return result;
-
- if (result == -1)
- msg = "TuxOnIce: Unable to find a signature."
- " Could you have moved a swap file?\n";
- else if (!result)
- msg = "TuxOnIce: No image found.\n";
- else if (result == 1)
- msg = "TuxOnIce: Image found.\n";
- else if (result == 2)
- msg = "TuxOnIce: uswsusp or swsusp image found.\n";
- else if (result == 3)
- msg = "TuxOnIce: Old implementation's signature found.\n";
-
- printk(KERN_INFO "%s", msg);
-
- return result;
-}
-
-int toi_bio_scan_for_image(int quiet)
-{
- struct block_device *bdev;
- char default_name[255] = "";
-
- if (!quiet)
- printk(KERN_DEBUG "Scanning swap devices for TuxOnIce "
- "signature...\n");
- for (bdev = next_bdev_of_type(NULL, "swap"); bdev;
- bdev = next_bdev_of_type(bdev, "swap")) {
- int result;
- char name[255] = "";
- sprintf(name, "%u:%u", MAJOR(bdev->bd_dev),
- MINOR(bdev->bd_dev));
- if (!quiet)
- printk(KERN_DEBUG "- Trying %s.\n", name);
- resume_block_device = bdev;
- resume_dev_t = bdev->bd_dev;
-
- result = toi_check_for_signature();
-
- resume_block_device = NULL;
- resume_dev_t = MKDEV(0, 0);
-
- if (!default_name[0])
- strcpy(default_name, name);
-
- if (result == 1) {
- /* Got one! */
- strcpy(resume_file, name);
- next_bdev_of_type(bdev, NULL);
- if (!quiet)
- printk(KERN_DEBUG " ==> Image found on %s.\n",
- resume_file);
- return 1;
- }
- forget_signature_page();
- }
-
- if (!quiet)
- printk(KERN_DEBUG "TuxOnIce scan: No image found.\n");
- strcpy(resume_file, default_name);
- return 0;
-}
-
-int toi_bio_get_header_version(void)
-{
- return (memcmp(toi_sig_data->sig2, tuxonice_signature,
- sizeof(tuxonice_signature))) ?
- 0 : toi_sig_data->header_version;
-}
diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c
deleted file mode 100644
index 22bf07a43..000000000
--- a/kernel/power/tuxonice_builtin.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-#include <linux/kernel.h>
-#include <linux/swap.h>
-#include <linux/syscalls.h>
-#include <linux/bio.h>
-#include <linux/root_dev.h>
-#include <linux/freezer.h>
-#include <linux/reboot.h>
-#include <linux/writeback.h>
-#include <linux/tty.h>
-#include <linux/crypto.h>
-#include <linux/cpu.h>
-#include <linux/ctype.h>
-#include <linux/kthread.h>
-#include "tuxonice_io.h"
-#include "tuxonice.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_pagedir.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_alloc.h"
-
-unsigned long toi_bootflags_mask;
-
-/*
- * Highmem related functions (x86 only).
- */
-
-#ifdef CONFIG_HIGHMEM
-
-/**
- * copyback_high: Restore highmem pages.
- *
- * Highmem data and pbe lists are/can be stored in highmem.
- * The format is slightly different to the lowmem pbe lists
- * used for the assembly code: the last pbe in each page is
- * a struct page * instead of struct pbe *, pointing to the
- * next page where pbes are stored (or NULL if it happens to be
- * the end of the list). Since we don't want to generate
- * unnecessary deltas against swsusp code, we use a cast
- * instead of a union.
- **/
-
-static void copyback_high(void)
-{
- struct page *pbe_page = (struct page *) restore_highmem_pblist;
- struct pbe *this_pbe, *first_pbe;
- unsigned long *origpage, *copypage;
- int pbe_index = 1;
-
- if (!pbe_page)
- return;
-
- this_pbe = (struct pbe *) kmap_atomic(pbe_page);
- first_pbe = this_pbe;
-
- while (this_pbe) {
- int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
-
- origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address));
- copypage = kmap_atomic((struct page *) this_pbe->address);
-
- while (loop >= 0) {
- *(origpage + loop) = *(copypage + loop);
- loop--;
- }
-
- kunmap_atomic(origpage);
- kunmap_atomic(copypage);
-
- if (!this_pbe->next)
- break;
-
- if (pbe_index < PBES_PER_PAGE) {
- this_pbe++;
- pbe_index++;
- } else {
- pbe_page = (struct page *) this_pbe->next;
- kunmap_atomic(first_pbe);
- if (!pbe_page)
- return;
- this_pbe = (struct pbe *) kmap_atomic(pbe_page);
- first_pbe = this_pbe;
- pbe_index = 1;
- }
- }
- kunmap_atomic(first_pbe);
-}
-
-#else /* CONFIG_HIGHMEM */
-static void copyback_high(void) { }
-#endif
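-
-/*
- * Layout sketch (illustrative, not from the original source) of the highmem
- * pbe chain walked above: each page holds PBES_PER_PAGE entries, and the
- * next field of the final entry is cast to a struct page * linking to the
- * following page of pbes (or NULL at the end of the list).
- */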
-
-char toi_wait_for_keypress_dev_console(int timeout)
-{
- int fd, this_timeout = 255, orig_kthread = 0;
- char key = '\0';
- struct termios t, t_backup;
-
- /* We should be guaranteed /dev/console exists after populate_rootfs()
- * in init/main.c.
- */
- fd = sys_open("/dev/console", O_RDONLY, 0);
- if (fd < 0) {
- printk(KERN_INFO "Couldn't open /dev/console.\n");
- return key;
- }
-
- if (sys_ioctl(fd, TCGETS, (long)&t) < 0)
- goto out_close;
-
- memcpy(&t_backup, &t, sizeof(t));
-
- t.c_lflag &= ~(ISIG|ICANON|ECHO);
- t.c_cc[VMIN] = 0;
-
-new_timeout:
- if (timeout > 0) {
- this_timeout = timeout < 26 ? timeout : 25;
- timeout -= this_timeout;
- this_timeout *= 10;
- }
-
- t.c_cc[VTIME] = this_timeout;
-
- if (sys_ioctl(fd, TCSETS, (long)&t) < 0)
- goto out_restore;
-
- if (current->flags & PF_KTHREAD) {
- orig_kthread = (current->flags & PF_KTHREAD);
- current->flags &= ~PF_KTHREAD;
- }
-
- while (1) {
- if (sys_read(fd, &key, 1) <= 0) {
- if (timeout)
- goto new_timeout;
- key = '\0';
- break;
- }
- key = tolower(key);
- if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) {
- if (key == 'c') {
- set_toi_state(TOI_CONTINUE_REQ);
- break;
- } else if (key == ' ')
- break;
- } else
- break;
- }
- if (orig_kthread) {
- current->flags |= PF_KTHREAD;
- }
-
-out_restore:
- sys_ioctl(fd, TCSETS, (long)&t_backup);
-out_close:
- sys_close(fd);
-
- return key;
-}
-
-struct toi_boot_kernel_data toi_bkd __nosavedata
- __attribute__((aligned(PAGE_SIZE))) = {
- MY_BOOT_KERNEL_DATA_VERSION,
- 0,
-#ifdef CONFIG_TOI_REPLACE_SWSUSP
- (1 << TOI_REPLACE_SWSUSP) |
-#endif
- (1 << TOI_NO_FLUSHER_THREAD) |
- (1 << TOI_PAGESET2_FULL),
-};
-
-struct block_device *toi_open_by_devnum(dev_t dev)
-{
- struct block_device *bdev = bdget(dev);
- int err = -ENOMEM;
- if (bdev)
- err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
- return err ? ERR_PTR(err) : bdev;
-}
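-
-/*
- * Usage sketch (illustrative, not from the original source): callers pair
- * toi_open_by_devnum() with toi_close_bdev() below:
- *
- *     struct block_device *bdev = toi_open_by_devnum(dev);
- *
- *     if (!IS_ERR(bdev))
- *             toi_close_bdev(bdev);
- */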
-
-/**
- * toi_close_bdev: Close a swap bdev.
- *
- * @bdev: The block device to close.
- */
-void toi_close_bdev(struct block_device *bdev)
-{
- blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
-}
-
-int toi_wait = CONFIG_TOI_DEFAULT_WAIT;
-struct toi_core_fns *toi_core_fns;
-unsigned long toi_result;
-struct pagedir pagedir1 = {1};
-struct toi_cbw **toi_first_cbw;
-int toi_next_cbw;
-
-unsigned long toi_get_nonconflicting_page(void)
-{
- return toi_core_fns->get_nonconflicting_page();
-}
-
-int toi_post_context_save(void)
-{
- return toi_core_fns->post_context_save();
-}
-
-int try_tuxonice_hibernate(void)
-{
- if (!toi_core_fns)
- return -ENODEV;
-
- return toi_core_fns->try_hibernate();
-}
-
-static int num_resume_calls;
-#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL
-static int ignore_late_initcall = 1;
-#else
-static int ignore_late_initcall;
-#endif
-
-int toi_translate_err_default = TOI_CONTINUE_REQ;
-
-void try_tuxonice_resume(void)
-{
- if (!hibernation_available())
- return;
-
- /* Don't let the counter wrap around */
- if (num_resume_calls < 2)
- num_resume_calls++;
-
- if (num_resume_calls == 1 && ignore_late_initcall) {
- printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n");
- return;
- }
-
- if (toi_core_fns)
- toi_core_fns->try_resume();
- else
- printk(KERN_INFO "TuxOnIce core not loaded yet.\n");
-}
-
-int toi_lowlevel_builtin(void)
-{
- int error = 0;
-
- save_processor_state();
- error = swsusp_arch_suspend();
- if (error)
- printk(KERN_ERR "Error %d hibernating\n", error);
-
- /* Restore control flow appears here */
- if (!toi_in_hibernate) {
- copyback_high();
- set_toi_state(TOI_NOW_RESUMING);
- }
-
- restore_processor_state();
- return error;
-}
-
-unsigned long toi_compress_bytes_in;
-unsigned long toi_compress_bytes_out;
-
-int toi_in_suspend(void)
-{
- return in_suspend;
-}
-
-unsigned long toi_state = ((1 << TOI_BOOT_TIME) |
- (1 << TOI_IGNORE_LOGLEVEL) |
- (1 << TOI_IO_STOPPED));
-
-/* The number of hibernates we have started (some may have been cancelled) */
-unsigned int nr_hibernates;
-int toi_running;
-__nosavedata int toi_in_hibernate;
-__nosavedata struct pbe *restore_highmem_pblist;
-
-int toi_trace_allocs;
-
-void toi_read_lock_tasklist(void)
-{
- read_lock(&tasklist_lock);
-}
-
-void toi_read_unlock_tasklist(void)
-{
- read_unlock(&tasklist_lock);
-}
-
-#ifdef CONFIG_TOI_ZRAM_SUPPORT
-int (*toi_flag_zram_disks) (void);
-
-int toi_do_flag_zram_disks(void)
-{
- return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0;
-}
-
-#endif
-
-/* toi_generate_free_page_map
- *
- * Description: This routine generates a bitmap of free pages from the
- * lists used by the memory manager. We then use the bitmap
- * to quickly calculate which pages to save and in which
- * pagesets.
- */
-void toi_generate_free_page_map(void)
-{
- int order, cpu, t;
- unsigned long flags, i;
- struct zone *zone;
- struct list_head *curr;
- unsigned long pfn;
- struct page *page;
-
- for_each_populated_zone(zone) {
-
- if (!zone->spanned_pages)
- continue;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- for (i = 0; i < zone->spanned_pages; i++) {
- pfn = zone->zone_start_pfn + i;
-
- if (!pfn_valid(pfn))
- continue;
-
- page = pfn_to_page(pfn);
-
- ClearPageNosaveFree(page);
- }
-
- for_each_migratetype_order(order, t) {
- list_for_each(curr,
- &zone->free_area[order].free_list[t]) {
- unsigned long j;
-
- pfn = page_to_pfn(list_entry(curr, struct page,
- lru));
- for (j = 0; j < (1UL << order); j++)
- SetPageNosaveFree(pfn_to_page(pfn + j));
- }
- }
-
- for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pset =
- per_cpu_ptr(zone->pageset, cpu);
- struct per_cpu_pages *pcp = &pset->pcp;
- struct page *page;
- int t;
-
- for (t = 0; t < MIGRATE_PCPTYPES; t++)
- list_for_each_entry(page, &pcp->lists[t], lru)
- SetPageNosaveFree(page);
- }
-
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-}
-
-/* toi_size_of_free_region
- *
- * Description: Return the number of pages that are free, beginning with and
- * including this one.
- */
-int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn)
-{
- unsigned long this_pfn = start_pfn,
- end_pfn = zone_end_pfn(zone);
-
- while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn)))
- this_pfn++;
-
- return this_pfn - start_pfn;
-}
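-
-/*
- * Worked example (illustrative): if pfns 100-103 are marked NosaveFree and
- * pfn 104 is not, toi_size_of_free_region(zone, 100) returns 4, while a
- * start_pfn whose page is in use returns 0.
- */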
-
-static int __init toi_wait_setup(char *str)
-{
- int value;
-
- if (sscanf(str, "=%d", &value)) {
- if (value < -1 || value > 255)
- printk(KERN_INFO "TuxOnIce_wait outside range -1 to "
- "255.\n");
- else
- toi_wait = value;
- }
-
- return 1;
-}
-__setup("toi_wait", toi_wait_setup);
-
-static int __init toi_translate_retry_setup(char *str)
-{
- toi_translate_err_default = 0;
- return 1;
-}
-__setup("toi_translate_retry", toi_translate_retry_setup);
-
-static int __init toi_debug_setup(char *str)
-{
- toi_bkd.toi_action |= (1 << TOI_LOGALL);
- toi_bootflags_mask |= (1 << TOI_LOGALL);
- toi_bkd.toi_debug_state = 255;
- toi_bkd.toi_default_console_level = 7;
- return 1;
-}
-__setup("toi_debug_setup", toi_debug_setup);
-
-static int __init toi_pause_setup(char *str)
-{
- toi_bkd.toi_action |= (1 << TOI_PAUSE);
- toi_bootflags_mask |= (1 << TOI_PAUSE);
- return 1;
-}
-__setup("toi_pause", toi_pause_setup);
-
-#ifdef CONFIG_PM_DEBUG
-static int __init toi_trace_allocs_setup(char *str)
-{
- int value;
-
- if (sscanf(str, "=%d", &value))
- toi_trace_allocs = value;
-
- return 1;
-}
-__setup("toi_trace_allocs", toi_trace_allocs_setup);
-#endif
-
-static int __init toi_ignore_late_initcall_setup(char *str)
-{
- int value;
-
- if (sscanf(str, "=%d", &value))
- ignore_late_initcall = value;
-
- return 1;
-}
-__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup);
-
-static int __init toi_force_no_multithreaded_setup(char *str)
-{
- int value;
-
- toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO);
- toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO);
-
- if (sscanf(str, "=%d", &value) && value)
- toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO);
-
- return 1;
-}
-__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup);
-
-#ifdef CONFIG_KGDB
-static int __init toi_post_resume_breakpoint_setup(char *str)
-{
- int value;
-
- toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT);
- toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT);
- if (sscanf(str, "=%d", &value) && value)
- toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT);
-
- return 1;
-}
-__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup);
-#endif
-
-static int __init toi_disable_readahead_setup(char *str)
-{
- int value;
-
- toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD);
- toi_bootflags_mask |= (1 << TOI_NO_READAHEAD);
- if (sscanf(str, "=%d", &value) && value)
- toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD);
-
- return 1;
-}
-__setup("toi_no_readahead", toi_disable_readahead_setup);
diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h
deleted file mode 100644
index 9539818e0..000000000
--- a/kernel/power/tuxonice_builtin.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-#include <asm/setup.h>
-
-extern struct toi_core_fns *toi_core_fns;
-extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out;
-extern unsigned int nr_hibernates;
-extern int toi_in_hibernate;
-
-extern __nosavedata struct pbe *restore_highmem_pblist;
-
-int toi_lowlevel_builtin(void);
-
-#ifdef CONFIG_HIGHMEM
-extern __nosavedata struct zone_data *toi_nosave_zone_list;
-extern __nosavedata unsigned long toi_nosave_max_pfn;
-#endif
-
-extern unsigned long toi_get_nonconflicting_page(void);
-extern int toi_post_context_save(void);
-
-extern char toi_wait_for_keypress_dev_console(int timeout);
-extern struct block_device *toi_open_by_devnum(dev_t dev);
-extern void toi_close_bdev(struct block_device *bdev);
-extern int toi_wait;
-extern int toi_translate_err_default;
-extern int toi_force_no_multithreaded;
-extern void toi_read_lock_tasklist(void);
-extern void toi_read_unlock_tasklist(void);
-extern int toi_in_suspend(void);
-extern void toi_generate_free_page_map(void);
-extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn);
-
-#ifdef CONFIG_TOI_ZRAM_SUPPORT
-extern int toi_do_flag_zram_disks(void);
-#else
-#define toi_do_flag_zram_disks() (0)
-#endif
diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c
deleted file mode 100644
index 1c4e10c72..000000000
--- a/kernel/power/tuxonice_checksum.c
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * kernel/power/tuxonice_checksum.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains data checksum routines for TuxOnIce,
- * using cryptoapi. They are used to locate any modifications
- * made to pageset 2 while we're saving it.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-#include <linux/crypto.h>
-#include <linux/scatterlist.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_io.h"
-#include "tuxonice_pageflags.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_pagedir.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-
-static struct toi_module_ops toi_checksum_ops;
-
-/* Constant for now, but tuning may be allowed later */
-static char toi_checksum_name[32] = "md4";
-/* Bytes per checksum */
-#define CHECKSUM_SIZE (16)
-
-#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE)
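-
-/*
- * Worked example (illustrative, assuming 4K pages and 64-bit pointers):
- * CHECKSUMS_PER_PAGE = (4096 - 8) / 16 = 255, so each checksum page holds
- * a next-page pointer followed by up to 255 sixteen-byte digests.
- */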
-
-struct cpu_context {
- struct crypto_hash *transform;
- struct hash_desc desc;
- struct scatterlist sg[2];
- char *buf;
-};
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-static int pages_allocated;
-static unsigned long page_list;
-
-static int toi_num_resaved;
-
-static unsigned long this_checksum, next_page;
-static int checksum_count;
-
-static inline int checksum_pages_needed(void)
-{
- return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE);
-}
-
-/* ---- Local buffer management ---- */
-
-/*
- * toi_checksum_cleanup
- *
- * Frees memory allocated for our labours.
- */
-static void toi_checksum_cleanup(int ending_cycle)
-{
- int cpu;
-
- if (ending_cycle) {
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- if (this->transform) {
- crypto_free_hash(this->transform);
- this->transform = NULL;
- this->desc.tfm = NULL;
- }
-
- if (this->buf) {
- toi_free_page(27, (unsigned long) this->buf);
- this->buf = NULL;
- }
- }
- }
-}
-
-/*
- * toi_checksum_initialise
- *
- * Prepare to do some work by allocating buffers and transforms.
- * Returns: Int: Zero on success or when checksumming is disabled; one if
- * a transform or buffer could not be set up.
- */
-static int toi_checksum_initialise(int starting_cycle)
-{
- int cpu;
-
- if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled)
- return 0;
-
- if (!*toi_checksum_name) {
- printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n");
- return 1;
- }
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- struct page *page;
-
- this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0);
- if (IS_ERR(this->transform)) {
- printk(KERN_INFO "TuxOnIce: Failed to initialise the "
- "%s checksum algorithm: %ld.\n",
- toi_checksum_name, (long) this->transform);
- this->transform = NULL;
- return 1;
- }
-
- this->desc.tfm = this->transform;
- this->desc.flags = 0;
-
- page = toi_alloc_page(27, GFP_KERNEL);
- if (!page)
- return 1;
- this->buf = page_address(page);
- sg_init_one(&this->sg[0], this->buf, PAGE_SIZE);
- }
- return 0;
-}
-
-/*
- * toi_checksum_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Print information to be recorded for debugging purposes into a buffer.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_checksum_print_debug_stats(char *buffer, int size)
-{
- int len;
-
- if (!toi_checksum_ops.enabled)
- return scnprintf(buffer, size,
- "- Checksumming disabled.\n");
-
- len = scnprintf(buffer, size, "- Checksum method is '%s'.\n",
- toi_checksum_name);
- len += scnprintf(buffer + len, size - len,
- " %d pages resaved in atomic copy.\n", toi_num_resaved);
- return len;
-}
-
-static int toi_checksum_memory_needed(void)
-{
- return toi_checksum_ops.enabled ?
- checksum_pages_needed() << PAGE_SHIFT : 0;
-}
-
-static int toi_checksum_storage_needed(void)
-{
- if (toi_checksum_ops.enabled)
- return strlen(toi_checksum_name) + sizeof(int) + 1;
- else
- return 0;
-}
-
-/*
- * toi_checksum_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save information needed when reloading the image at resume time.
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_checksum_save_config_info(char *buffer)
-{
- int namelen = strlen(toi_checksum_name) + 1;
- int total_len;
-
- *((unsigned int *) buffer) = namelen;
- strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen);
- total_len = sizeof(unsigned int) + namelen;
- return total_len;
-}
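-
-/*
- * Worked example (illustrative): with the default "md4" algorithm,
- * namelen is 4, so the buffer holds the 4-byte length followed by the
- * bytes "md4\0", 8 bytes in total, matching toi_checksum_storage_needed().
- */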
-
-/* toi_checksum_load_config_info
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * Description: Reload information needed for dechecksumming the image at
- * resume time.
- */
-static void toi_checksum_load_config_info(char *buffer, int size)
-{
- int namelen;
-
- namelen = *((unsigned int *) (buffer));
- strncpy(toi_checksum_name, buffer + sizeof(unsigned int),
- namelen);
-}
-
-/*
- * Free Checksum Memory
- */
-
-void free_checksum_pages(void)
-{
- while (pages_allocated) {
- unsigned long next = *((unsigned long *) page_list);
- ClearPageNosave(virt_to_page(page_list));
- toi_free_page(15, (unsigned long) page_list);
- page_list = next;
- pages_allocated--;
- }
-}
-
-/*
- * Allocate Checksum Memory
- */
-
-int allocate_checksum_pages(void)
-{
- int pages_needed = checksum_pages_needed();
-
- if (!toi_checksum_ops.enabled)
- return 0;
-
- while (pages_allocated < pages_needed) {
- unsigned long *new_page =
- (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP);
- if (!new_page) {
- printk(KERN_ERR "Unable to allocate checksum pages.\n");
- return -ENOMEM;
- }
- SetPageNosave(virt_to_page(new_page));
- (*new_page) = page_list;
- page_list = (unsigned long) new_page;
- pages_allocated++;
- }
-
- next_page = (unsigned long) page_list;
- checksum_count = 0;
-
- return 0;
-}
-
-char *tuxonice_get_next_checksum(void)
-{
- if (!toi_checksum_ops.enabled)
- return NULL;
-
- if (checksum_count % CHECKSUMS_PER_PAGE)
- this_checksum += CHECKSUM_SIZE;
- else {
- this_checksum = next_page + sizeof(void *);
- next_page = *((unsigned long *) next_page);
- }
-
- checksum_count++;
- return (char *) this_checksum;
-}
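-
-/*
- * Layout sketch (illustrative) of the chain walked above: each checksum
- * page starts with the address of the next page, followed by
- * CHECKSUMS_PER_PAGE slots of CHECKSUM_SIZE bytes:
- *
- *     [next page][csum 0][csum 1]...[csum 254]
- *
- * so a fresh page's first checksum lives sizeof(void *) bytes in.
- */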
-
-int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
-{
- char *pa;
- int result, cpu = smp_processor_id();
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
- if (!toi_checksum_ops.enabled)
- return 0;
-
- pa = kmap(page);
- memcpy(ctx->buf, pa, PAGE_SIZE);
- kunmap(page);
- result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
- checksum_locn);
- if (result)
- printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest "
- "returned %d.\n", result);
- return result;
-}
-
-/*
- * Calculate checksums
- */
-
-void check_checksums(void)
-{
- int index = 0, cpu = smp_processor_id();
- char current_checksum[CHECKSUM_SIZE];
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
- unsigned long pfn;
-
- if (!toi_checksum_ops.enabled) {
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled.");
- return;
- }
-
- next_page = (unsigned long) page_list;
-
- toi_num_resaved = 0;
- this_checksum = 0;
-
- toi_trace_index++;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums.");
- memory_bm_position_reset(pageset2_map);
- for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != BM_END_OF_MAP;
- pfn = memory_bm_next_pfn(pageset2_map, 0)) {
- int ret, resave_needed = false;
- char *pa;
- struct page *page = pfn_to_page(pfn);
-
- if (index < checksum_count) {
- if (index % CHECKSUMS_PER_PAGE) {
- this_checksum += CHECKSUM_SIZE;
- } else {
- this_checksum = next_page + sizeof(void *);
- next_page = *((unsigned long *) next_page);
- }
-
- /* Done when IRQs disabled so must be atomic */
- pa = kmap_atomic(page);
- memcpy(ctx->buf, pa, PAGE_SIZE);
- kunmap_atomic(pa);
- ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
- current_checksum);
-
- if (ret) {
- printk(KERN_INFO "Digest failed. Returned %d.\n", ret);
- return;
- }
-
- resave_needed = memcmp(current_checksum, (char *) this_checksum,
- CHECKSUM_SIZE);
- } else {
- resave_needed = true;
- }
-
- if (resave_needed) {
- TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed);
- SetPageResave(pfn_to_page(pfn));
- toi_num_resaved++;
- if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED))
- set_abort_result(TOI_RESAVE_NEEDED);
- }
-
- index++;
- }
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete.");
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0,
- NULL),
- SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action,
- TOI_ABORT_ON_RESAVE_NEEDED, 0)
-};
-
-/*
- * Ops structure.
- */
-static struct toi_module_ops toi_checksum_ops = {
- .type = MISC_MODULE,
- .name = "checksumming",
- .directory = "checksum",
- .module = THIS_MODULE,
- .initialise = toi_checksum_initialise,
- .cleanup = toi_checksum_cleanup,
- .print_debug_info = toi_checksum_print_debug_stats,
- .save_config_info = toi_checksum_save_config_info,
- .load_config_info = toi_checksum_load_config_info,
- .memory_needed = toi_checksum_memory_needed,
- .storage_needed = toi_checksum_storage_needed,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-int toi_checksum_init(void)
-{
- int result = toi_register_module(&toi_checksum_ops);
- return result;
-}
-
-void toi_checksum_exit(void)
-{
- toi_unregister_module(&toi_checksum_ops);
-}
diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h
deleted file mode 100644
index c8196fbb0..000000000
--- a/kernel/power/tuxonice_checksum.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * kernel/power/tuxonice_checksum.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains data checksum routines for TuxOnIce,
- * using cryptoapi. They are used to locate any modifications
- * made to pageset 2 while we're saving it.
- */
-
-#if defined(CONFIG_TOI_CHECKSUM)
-extern int toi_checksum_init(void);
-extern void toi_checksum_exit(void);
-void check_checksums(void);
-int allocate_checksum_pages(void);
-void free_checksum_pages(void);
-char *tuxonice_get_next_checksum(void);
-int tuxonice_calc_checksum(struct page *page, char *checksum_locn);
-#else
-static inline int toi_checksum_init(void) { return 0; }
-static inline void toi_checksum_exit(void) { }
-static inline void check_checksums(void) { }
-static inline int allocate_checksum_pages(void) { return 0; }
-static inline void free_checksum_pages(void) { }
-static inline char *tuxonice_get_next_checksum(void) { return NULL; }
-static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
- { return 0; }
-#endif
-
diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c
deleted file mode 100644
index 2873f93c6..000000000
--- a/kernel/power/tuxonice_cluster.c
+++ /dev/null
@@ -1,1058 +0,0 @@
-/*
- * kernel/power/tuxonice_cluster.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains routines for cluster hibernation support.
- *
- * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
- *
- * How does it work?
- *
- * There is no 'master' node that tells everyone else what to do. All nodes
- * send messages to the broadcast address/port, maintain a list of peers
- * and figure out when to progress to the next step in hibernating or resuming.
- * This makes us more fault tolerant when it comes to nodes coming and going
- * (which may be more of an issue if we're hibernating when power supplies
- * are being unreliable).
- *
- * At boot time, we start a ktuxonice thread that handles communication with
- * other nodes. This node maintains a state machine that controls our progress
- * through hibernating and resuming, keeping us in step with other nodes. Nodes
- * are identified by their hw address.
- *
- * On startup, the node sends CLUSTER_PING on the configured interface's
- * broadcast address, port $toi_cluster_port (see below) and begins to listen
- * for other broadcast messages. CLUSTER_PING messages are repeated at
- * intervals of 5 minutes, with a random offset to spread traffic out.
- *
- * A hibernation cycle is initiated from any node via
- *
- * echo > /sys/power/tuxonice/do_hibernate
- *
- * and (possibly) the hibernate script. At each step of the process, the node
- * completes its work, and waits for all other nodes to signal completion of
- * their work (or timeout) before progressing to the next step.
- *
- * Request/state Action before reply Possible reply Next state
- * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP
- * HIBERNATE|NACK INIT_0
- *
- * PREP prepare_image PREP|ACK IMAGE_WRITE
- * PREP|NACK INIT_0
- * ABORT RUNNING
- *
- * IO write image IO|ACK power off
- * ABORT POST_RESUME
- *
- * (Boot time) check for image IMAGE|ACK RESUME_PREP
- * (Note 1)
- * IMAGE|NACK (Note 2)
- *
- * PREP prepare read image PREP|ACK IMAGE_READ
- * PREP|NACK (As NACK_IMAGE)
- *
- * IO read image IO|ACK POST_RESUME
- *
- * POST_RESUME thaw, post-script RUNNING
- *
- * INIT_0 init 0
- *
- * Other messages:
- *
- * - PING: Request for all other live nodes to send a PONG. Used at startup to
- * announce presence, when a node is suspected dead and periodically, in case
- * segments of the network are [un]plugged.
- *
- * - PONG: Response to a PING.
- *
- * - ABORT: Request to cancel writing an image.
- *
- * - BYE: Notification that this node is shutting down.
- *
- * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
- * nodes which are slower to start up can get state synchronised. If a node
- * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
- * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
- * must invalidate its image (if any) and boot normally.
- *
- * Note 2: May occur when one node lost power or powered off while others
- * hibernated. This node waits for others to complete resuming (ACK_READ)
- * before completing its boot, so that it appears as a failed node restarting.
- *
- * If any node has an image, then it also has a list of nodes that hibernated
- * in synchronisation with it. The node will wait for other nodes to appear
- * or timeout before beginning its restoration.
- *
- * If a node has no image, it needs to wait, in case other nodes which do have
- * an image are going to resume, but are taking longer to announce their
- * presence. For this reason, the user can specify a timeout value and a number
- * of nodes detected before we just continue. (We might want to assume in a
- * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
- * the remaining nodes will too. This might help in situations where some nodes
- * are much slower to boot, or more subject to hardware failures or such like).
- */
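-
-/*
- * Example exchange (illustrative, not from the original source): a node
- * initiating hibernation sends MSG_HIBERNATE; each peer replies with
- * MSG_HIBERNATE | MSG_ACK or MSG_HIBERNATE | MSG_NACK. Once
- * time_to_continue() sees no peer outside the sought state (or the timeout
- * passes), the initiator sends MSG_IO and begins writing the image.
- */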
-
-#include <linux/suspend.h>
-#include <linux/if.h>
-#include <linux/rtnetlink.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/in.h>
-#include <linux/if_arp.h>
-#include <linux/kthread.h>
-#include <linux/wait.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_io.h"
-
-#if 1
-#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
-#else
-#define PRINTK(a, b...) do { } while (0)
-#endif
-
-static int loopback_mode;
-static int num_local_nodes = 1;
-#define MAX_LOCAL_NODES 8
-#define SADDR (loopback_mode ? b->sid : h->saddr)
-
-#define MYNAME "TuxOnIce Clustering"
-
-enum cluster_message {
- MSG_ACK = 1,
- MSG_NACK = 2,
- MSG_PING = 4,
- MSG_ABORT = 8,
- MSG_BYE = 16,
- MSG_HIBERNATE = 32,
- MSG_IMAGE = 64,
- MSG_IO = 128,
- MSG_RUNNING = 256
-};
-
-static char *str_message(int message)
-{
- switch (message) {
- case 4:
- return "Ping";
- case 8:
- return "Abort";
- case 9:
- return "Abort acked";
- case 10:
- return "Abort nacked";
- case 16:
- return "Bye";
- case 17:
- return "Bye acked";
- case 18:
- return "Bye nacked";
- case 32:
- return "Hibernate request";
- case 33:
- return "Hibernate ack";
- case 34:
- return "Hibernate nack";
- case 64:
- return "Image exists?";
- case 65:
- return "Image does exist";
- case 66:
- return "No image here";
- case 128:
- return "I/O";
- case 129:
- return "I/O okay";
- case 130:
- return "I/O failed";
- case 256:
- return "Running";
- default:
- printk(KERN_ERR "Unrecognised message %d.\n", message);
- return "Unrecognised message (see dmesg)";
- }
-}
-
-#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
-#define MSG_STATE_MASK (~MSG_ACK_MASK)
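-
-/*
- * Encoding example (illustrative): a reply ORs an ack bit into the state,
- * so MSG_HIBERNATE | MSG_ACK is 33 and MSG_HIBERNATE | MSG_NACK is 34,
- * which is why str_message() above also decodes those composite values.
- */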
-
-struct node_info {
- struct list_head member_list;
- wait_queue_head_t member_events;
- spinlock_t member_list_lock;
- spinlock_t receive_lock;
- int peer_count, ignored_peer_count;
- struct toi_sysfs_data sysfs_data;
- enum cluster_message current_message;
-};
-
-struct node_info node_array[MAX_LOCAL_NODES];
-
-struct cluster_member {
- __be32 addr;
- enum cluster_message message;
- struct list_head list;
- int ignore;
-};
-
-#define toi_cluster_port_send 3501
-#define toi_cluster_port_recv 3502
-
-static struct net_device *net_dev;
-static struct toi_module_ops toi_cluster_ops;
-
-static int toi_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev);
-
-static struct packet_type toi_cluster_packet_type = {
- .type = __constant_htons(ETH_P_IP),
- .func = toi_recv,
-};
-
-struct toi_pkt { /* BOOTP packet format */
- struct iphdr iph; /* IP header */
- struct udphdr udph; /* UDP header */
- u8 htype; /* HW address type */
- u8 hlen; /* HW address length */
- __be32 xid; /* Transaction ID */
- __be16 secs; /* Seconds since we started */
- __be16 flags; /* Just what it says */
- u8 hw_addr[16]; /* Sender's HW address */
- u16 message; /* Message */
- unsigned long sid; /* Source ID for loopback testing */
-};
-
-static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE;
-
-static int added_pack;
-
-static int others_have_image;
-
-/* Key used to allow multiple clusters on the same lan */
-static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
-static char pre_hibernate_script[255] =
- CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
-static char post_hibernate_script[255] =
- CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;
-
-/* Timeouts used when waiting on other cluster members */
-static unsigned long continue_delay = 5 * HZ;
-static unsigned long cluster_message_timeout = 3 * HZ;
-
-/* === Membership list === */
-
-static void print_member_info(int index)
-{
- struct cluster_member *this;
-
- printk(KERN_INFO "==> Dumping node %d.\n", index);
-
- list_for_each_entry(this, &node_array[index].member_list, list)
- printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n",
- NIPQUAD(this->addr),
- str_message(this->message),
- this->ignore ? "(Ignored)" : "");
- printk(KERN_INFO "== Done ==\n");
-}
-
-static struct cluster_member *__find_member(int index, __be32 addr)
-{
- struct cluster_member *this;
-
- list_for_each_entry(this, &node_array[index].member_list, list) {
- if (this->addr != addr)
- continue;
-
- return this;
- }
-
- return NULL;
-}
-
-static void set_ignore(int index, __be32 addr, struct cluster_member *this)
-{
- if (this->ignore) {
- PRINTK("Node %d already ignoring %d.%d.%d.%d.\n",
- index, NIPQUAD(addr));
- return;
- }
-
- PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n",
- index, NIPQUAD(addr));
- this->ignore = 1;
- node_array[index].ignored_peer_count++;
-}
-
-static int __add_update_member(int index, __be32 addr, int message)
-{
- struct cluster_member *this;
-
- this = __find_member(index, addr);
- if (this) {
- if (this->message != message) {
- this->message = message;
- if ((message & MSG_NACK) &&
- (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
- set_ignore(index, addr, this);
- PRINTK("Node %d sees node %d.%d.%d.%d now sending "
- "%s.\n", index, NIPQUAD(addr),
- str_message(message));
- wake_up(&node_array[index].member_events);
- }
- return 0;
- }
-
- this = (struct cluster_member *) toi_kzalloc(36,
- sizeof(struct cluster_member), GFP_KERNEL);
-
- if (!this)
- return -1;
-
- this->addr = addr;
- this->message = message;
- this->ignore = 0;
- INIT_LIST_HEAD(&this->list);
-
- node_array[index].peer_count++;
-
- PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
- NIPQUAD(addr), str_message(message));
-
- if ((message & MSG_NACK) &&
- (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
- set_ignore(index, addr, this);
- list_add_tail(&this->list, &node_array[index].member_list);
- return 1;
-}
-
-static int add_update_member(int index, __be32 addr, int message)
-{
- int result;
- unsigned long flags;
- spin_lock_irqsave(&node_array[index].member_list_lock, flags);
- result = __add_update_member(index, addr, message);
- spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
-
- print_member_info(index);
-
- wake_up(&node_array[index].member_events);
-
- return result;
-}
-
-static void del_member(int index, __be32 addr)
-{
- struct cluster_member *this;
- unsigned long flags;
-
- spin_lock_irqsave(&node_array[index].member_list_lock, flags);
- this = __find_member(index, addr);
-
- if (this) {
- list_del_init(&this->list);
- toi_kfree(36, this, sizeof(*this));
- node_array[index].peer_count--;
- }
-
- spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
-}
-
-/* === Message transmission === */
-
-static void toi_send_if(int message, unsigned long my_id);
-
-/*
- * Process received TOI packet.
- */
-static int toi_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev)
-{
- struct toi_pkt *b;
- struct iphdr *h;
- int len, result, index;
- unsigned long addr, message, ack;
-
- /* Perform verifications before taking the lock. */
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto drop;
-
- if (dev != net_dev)
- goto drop;
-
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (!skb)
- return NET_RX_DROP;
-
- if (!pskb_may_pull(skb,
- sizeof(struct iphdr) +
- sizeof(struct udphdr)))
- goto drop;
-
- b = (struct toi_pkt *)skb_network_header(skb);
- h = &b->iph;
-
- if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
- goto drop;
-
- /* Fragments are not supported */
- if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
- if (net_ratelimit())
- printk(KERN_ERR "TuxOnIce: Ignoring fragmented "
- "cluster message.\n");
- goto drop;
- }
-
- if (skb->len < ntohs(h->tot_len))
- goto drop;
-
- if (ip_fast_csum((char *) h, h->ihl))
- goto drop;
-
- if (b->udph.source != htons(toi_cluster_port_send) ||
- b->udph.dest != htons(toi_cluster_port_recv))
- goto drop;
-
- if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
- goto drop;
-
- len = ntohs(b->udph.len) - sizeof(struct udphdr);
-
- /* Ok the front looks good, make sure we can get at the rest. */
- if (!pskb_may_pull(skb, skb->len))
- goto drop;
-
- b = (struct toi_pkt *)skb_network_header(skb);
- h = &b->iph;
-
- addr = SADDR;
- PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
- str_message(b->message), NIPQUAD(addr));
-
- message = b->message & MSG_STATE_MASK;
- ack = b->message & MSG_ACK_MASK;
-
- for (index = 0; index < num_local_nodes; index++) {
- int new_message = node_array[index].current_message,
- old_message = new_message;
-
- if (index == SADDR || !old_message) {
- PRINTK("Ignoring node %d (offline or self).\n", index);
- continue;
- }
-
- /* One message at a time, please. */
- spin_lock(&node_array[index].receive_lock);
-
- result = add_update_member(index, SADDR, b->message);
- if (result == -1) {
- printk(KERN_INFO "Failed to add new cluster member "
- NIPQUAD_FMT ".\n",
- NIPQUAD(addr));
- goto drop_unlock;
- }
-
- switch (b->message & MSG_STATE_MASK) {
- case MSG_PING:
- break;
- case MSG_ABORT:
- break;
- case MSG_BYE:
- break;
- case MSG_HIBERNATE:
- /* Can I hibernate? */
- new_message = MSG_HIBERNATE |
- ((index & 1) ? MSG_NACK : MSG_ACK);
- break;
- case MSG_IMAGE:
- /* Can I resume? */
- new_message = MSG_IMAGE |
- ((index & 1) ? MSG_NACK : MSG_ACK);
- if (new_message != old_message)
- printk(KERN_ERR "Setting whether I can resume "
- "to %d.\n", new_message);
- break;
- case MSG_IO:
- new_message = MSG_IO | MSG_ACK;
- break;
- case MSG_RUNNING:
- break;
- default:
- if (net_ratelimit())
- printk(KERN_ERR "Unrecognised TuxOnIce cluster"
- " message %d from " NIPQUAD_FMT ".\n",
- b->message, NIPQUAD(addr));
- }
-
- if (old_message != new_message) {
- node_array[index].current_message = new_message;
- printk(KERN_INFO ">>> Sending new message for node "
- "%d.\n", index);
- toi_send_if(new_message, index);
- } else if (!ack) {
- printk(KERN_INFO ">>> Resending message for node %d.\n",
- index);
- toi_send_if(new_message, index);
- }
-drop_unlock:
- spin_unlock(&node_array[index].receive_lock);
- }
-
-drop:
- /* Throw the packet out. */
- kfree_skb(skb);
-
- return 0;
-}
-
-/*
- * Send cluster message to single interface.
- */
-static void toi_send_if(int message, unsigned long my_id)
-{
- struct sk_buff *skb;
- struct toi_pkt *b;
- int hh_len = LL_RESERVED_SPACE(net_dev);
- struct iphdr *h;
-
- /* Allocate packet */
- skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL);
- if (!skb)
- return;
- skb_reserve(skb, hh_len);
- b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt));
- memset(b, 0, sizeof(struct toi_pkt));
-
- /* Construct IP header */
- skb_reset_network_header(skb);
- h = ip_hdr(skb);
- h->version = 4;
- h->ihl = 5;
- h->tot_len = htons(sizeof(struct toi_pkt));
- h->frag_off = htons(IP_DF);
- h->ttl = 64;
- h->protocol = IPPROTO_UDP;
- h->daddr = htonl(INADDR_BROADCAST);
- h->check = ip_fast_csum((unsigned char *) h, h->ihl);
-
- /* Construct UDP header */
- b->udph.source = htons(toi_cluster_port_send);
- b->udph.dest = htons(toi_cluster_port_recv);
- b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
- /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
-
- /* Construct message */
- b->message = message;
- b->sid = my_id;
- b->htype = net_dev->type; /* can cause undefined behavior */
- b->hlen = net_dev->addr_len;
- memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len);
- b->secs = htons(3); /* 3 seconds */
-
- /* Chain packet down the line... */
- skb->dev = net_dev;
- skb->protocol = htons(ETH_P_IP);
- if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol),
- net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) ||
- dev_queue_xmit(skb) < 0)
- printk(KERN_INFO "E");
-}
-
-/* ========================================= */
-
-/* kTOICluster */
-
-static atomic_t num_cluster_threads;
-static DECLARE_WAIT_QUEUE_HEAD(clusterd_events);
-
-static int kTOICluster(void *data)
-{
- unsigned long my_id;
-
- my_id = atomic_add_return(1, &num_cluster_threads) - 1;
- node_array[my_id].current_message = (unsigned long) data;
-
- PRINTK("kTOICluster daemon %lu starting.\n", my_id);
-
- current->flags |= PF_NOFREEZE;
-
- while (node_array[my_id].current_message) {
- toi_send_if(node_array[my_id].current_message, my_id);
- sleep_on_timeout(&clusterd_events,
- cluster_message_timeout);
- PRINTK("Link state %lu is %d.\n", my_id,
- node_array[my_id].current_message);
- }
-
- toi_send_if(MSG_BYE, my_id);
- atomic_dec(&num_cluster_threads);
- wake_up(&clusterd_events);
-
- PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
- __set_current_state(TASK_RUNNING);
- return 0;
-}
-
-static void kill_clusterd(void)
-{
- int i;
-
- for (i = 0; i < num_local_nodes; i++) {
- if (node_array[i].current_message) {
- PRINTK("Seeking to kill clusterd %d.\n", i);
- node_array[i].current_message = 0;
- }
- }
- wait_event(clusterd_events,
- !atomic_read(&num_cluster_threads));
- PRINTK("All cluster daemons have exited.\n");
-}
-
-static int peers_not_in_message(int index, int message, int precise)
-{
- struct cluster_member *this;
- unsigned long flags;
- int result = 0;
-
- spin_lock_irqsave(&node_array[index].member_list_lock, flags);
- list_for_each_entry(this, &node_array[index].member_list, list) {
- if (this->ignore)
- continue;
-
- PRINTK("Peer %d.%d.%d.%d sending %s. "
- "Seeking %s.\n",
- NIPQUAD(this->addr),
- str_message(this->message), str_message(message));
- if ((precise ? this->message :
- this->message & MSG_STATE_MASK) !=
- message)
- result++;
- }
- spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
- PRINTK("%d peers in sought message.\n", result);
- return result;
-}
-
-static void reset_ignored(int index)
-{
- struct cluster_member *this;
- unsigned long flags;
-
- spin_lock_irqsave(&node_array[index].member_list_lock, flags);
- list_for_each_entry(this, &node_array[index].member_list, list)
- this->ignore = 0;
- node_array[index].ignored_peer_count = 0;
- spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
-}
-
-static int peers_in_message(int index, int message, int precise)
-{
- return node_array[index].peer_count -
- node_array[index].ignored_peer_count -
- peers_not_in_message(index, message, precise);
-}
-
-static int time_to_continue(int index, unsigned long start, int message)
-{
- int first = peers_not_in_message(index, message, 0);
- int second = peers_in_message(index, message, 1);
-
- PRINTK("First part returns %d, second returns %d.\n", first, second);
-
- if (!first && !second) {
- PRINTK("All peers answered message %d.\n",
- message);
- return 1;
- }
-
- if (time_after(jiffies, start + continue_delay)) {
- PRINTK("Timeout reached.\n");
- return 1;
- }
-
- PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies,
- start + continue_delay);
- return 0;
-}
-
-void toi_initiate_cluster_hibernate(void)
-{
- int result;
- unsigned long start;
-
- result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
- if (result)
- return;
-
- toi_send_if(MSG_HIBERNATE, 0);
-
- start = jiffies;
- wait_event(node_array[0].member_events,
- time_to_continue(0, start, MSG_HIBERNATE));
-
- if (test_action_state(TOI_FREEZER_TEST)) {
- toi_send_if(MSG_ABORT, 0);
-
- start = jiffies;
- wait_event(node_array[0].member_events,
- time_to_continue(0, start, MSG_RUNNING));
-
- do_toi_step(STEP_QUIET_CLEANUP);
- return;
- }
-
- toi_send_if(MSG_IO, 0);
-
- result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
- if (result)
- return;
-
- /* This code runs at resume time too! */
- if (toi_in_hibernate)
- result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
-}
-
-/* toi_cluster_print_debug_stats
- *
- * Description: Print information to be recorded for debugging purposes into a
- * buffer.
- * Arguments: buffer: Pointer to a buffer into which the debug info will be
- * printed.
- * size: Size of the buffer.
- * Returns: Number of characters written to the buffer.
- */
-static int toi_cluster_print_debug_stats(char *buffer, int size)
-{
- int len;
-
- if (strlen(toi_cluster_iface))
- len = scnprintf(buffer, size,
- "- Cluster interface is '%s'.\n",
- toi_cluster_iface);
- else
- len = scnprintf(buffer, size,
- "- Cluster support is disabled.\n");
- return len;
-}
-
-/* cluster_memory_needed
- *
- * Description: Tell the caller how much memory we need to operate during
- * hibernate/resume.
- * Returns: Int. Maximum number of bytes of memory required for
- * operation.
- */
-static int toi_cluster_memory_needed(void)
-{
- return 0;
-}
-
-static int toi_cluster_storage_needed(void)
-{
- return 1 + strlen(toi_cluster_iface);
-}
-
-/* toi_cluster_save_config_info
- *
- * Description: Save information needed when reloading the image at resume time.
- * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE.
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_cluster_save_config_info(char *buffer)
-{
- strcpy(buffer, toi_cluster_iface);
- return strlen(toi_cluster_iface) + 1;
-}
-
-/* toi_cluster_load_config_info
- *
- * Description: Reload information needed for declustering the image at
- * resume time.
- * Arguments: Buffer: Pointer to the start of the data.
- * Size: Number of bytes that were saved.
- */
-static void toi_cluster_load_config_info(char *buffer, int size)
-{
- strncpy(toi_cluster_iface, buffer, size);
-}
-
-static void cluster_startup(void)
-{
- int have_image = do_check_can_resume(), i;
- unsigned long start = jiffies, initial_message;
- struct task_struct *p;
-
- initial_message = MSG_IMAGE;
-
- have_image = 1;
-
- for (i = 0; i < num_local_nodes; i++) {
- PRINTK("Starting ktoiclusterd %d.\n", i);
- p = kthread_create(kTOICluster, (void *) initial_message,
- "ktoiclusterd/%d", i);
- if (IS_ERR(p)) {
- printk(KERN_ERR "Failed to start ktoiclusterd.\n");
- return;
- }
-
- wake_up_process(p);
- }
-
- /* Wait for delay or someone else sending first message */
- wait_event(node_array[0].member_events, time_to_continue(0, start,
- MSG_IMAGE));
-
- others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
-
- printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
- " %d.\n", have_image ? "" : "don't ", others_have_image);
-
- if (have_image) {
- int result;
-
- /* Start to resume */
- printk(KERN_INFO " === Starting to resume === \n");
- node_array[0].current_message = MSG_IO;
- toi_send_if(MSG_IO, 0);
-
- /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
- result = 0;
-
- if (!result) {
- /*
- * Atomic restore - we'll come back in the hibernation
- * path.
- */
-
- /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
- result = 0;
-
- /* do_toi_step(STEP_QUIET_CLEANUP); */
- }
-
- node_array[0].current_message |= MSG_NACK;
-
- /* For debugging - disable for real life? */
- wait_event(node_array[0].member_events,
- time_to_continue(0, start, MSG_IO));
- }
-
- if (others_have_image) {
- /* Wait for them to resume */
- printk(KERN_INFO "Waiting for other nodes to resume.\n");
- start = jiffies;
- wait_event(node_array[0].member_events,
- time_to_continue(0, start, MSG_RUNNING));
- if (peers_not_in_message(0, MSG_RUNNING, 0))
- printk(KERN_INFO "Timed out while waiting for other "
- "nodes to resume.\n");
- }
-
- /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
- * as appropriate.
- *
- * If we don't have an image:
- * - Wait until someone else says they have one, or conditions are met
- * for continuing to boot (n machines or t seconds).
- * - If anyone has an image, wait for them to resume before continuing
- * to boot.
- *
- * If we have an image:
- * - Wait until conditions are met before continuing to resume (n
- * machines or t seconds). Send RESUME_PREP and freeze processes.
- * NACK_PREP if freezing fails (shouldn't) and follow logic for
- * us having no image above. On success, wait for [N]ACK_PREP from
- * other machines. Read image (including atomic restore) until done.
- * Wait for ACK_READ from others (should never fail). Thaw processes
- * and do post-resume. (The section after the atomic restore is done
- * via the code for hibernating).
- */
-
- node_array[0].current_message = MSG_RUNNING;
-}
-
-/* toi_cluster_open_iface
- *
- * Description: Prepare to use an interface.
- */
-
-static int toi_cluster_open_iface(void)
-{
- struct net_device *dev;
-
- rtnl_lock();
-
- for_each_netdev(&init_net, dev) {
- if (/* dev == &init_net.loopback_dev || */
- strcmp(dev->name, toi_cluster_iface))
- continue;
-
- net_dev = dev;
- break;
- }
-
- rtnl_unlock();
-
- if (!net_dev) {
- printk(KERN_ERR MYNAME ": Device %s not found.\n",
- toi_cluster_iface);
- return -ENODEV;
- }
-
- dev_add_pack(&toi_cluster_packet_type);
- added_pack = 1;
-
- loopback_mode = (net_dev == init_net.loopback_dev);
- num_local_nodes = loopback_mode ? 8 : 1;
-
- PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
- loopback_mode ? "on" : "off", num_local_nodes);
-
- cluster_startup();
- return 0;
-}
-
-/* toi_cluster_close_iface
- *
- * Description: Stop using an interface.
- */
-
-static int toi_cluster_close_iface(void)
-{
- kill_clusterd();
- if (added_pack) {
- dev_remove_pack(&toi_cluster_packet_type);
- added_pack = 0;
- }
- return 0;
-}
-
-static void write_side_effect(void)
-{
- if (toi_cluster_ops.enabled) {
- toi_cluster_open_iface();
- set_toi_state(TOI_CLUSTER_MODE);
- } else {
- toi_cluster_close_iface();
- clear_toi_state(TOI_CLUSTER_MODE);
- }
-}
-
-static void node_write_side_effect(void)
-{
-}
-
-/*
- * data for our sysfs entries.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0,
- NULL),
- SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0,
- write_side_effect),
- SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL),
- SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script,
- 256, 0, NULL),
- SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script,
- 256, 0, STRING),
- SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ,
- 0)
-};
-
-/*
- * Ops structure.
- */
-
-static struct toi_module_ops toi_cluster_ops = {
- .type = FILTER_MODULE,
- .name = "Cluster",
- .directory = "cluster",
- .module = THIS_MODULE,
- .memory_needed = toi_cluster_memory_needed,
- .print_debug_info = toi_cluster_print_debug_stats,
- .save_config_info = toi_cluster_save_config_info,
- .load_config_info = toi_cluster_load_config_info,
- .storage_needed = toi_cluster_storage_needed,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-
-#ifdef MODULE
-#define INIT static __init
-#define EXIT static __exit
-#else
-#define INIT
-#define EXIT
-#endif
-
-INIT int toi_cluster_init(void)
-{
- int temp = toi_register_module(&toi_cluster_ops), i;
- struct kobject *kobj = toi_cluster_ops.dir_kobj;
-
- for (i = 0; i < MAX_LOCAL_NODES; i++) {
- node_array[i].current_message = 0;
- INIT_LIST_HEAD(&node_array[i].member_list);
- init_waitqueue_head(&node_array[i].member_events);
- spin_lock_init(&node_array[i].member_list_lock);
- spin_lock_init(&node_array[i].receive_lock);
-
- /* Set up sysfs entry */
- node_array[i].sysfs_data.attr.name = toi_kzalloc(8,
- sizeof(node_array[i].sysfs_data.attr.name),
- GFP_KERNEL);
- sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d",
- i);
- node_array[i].sysfs_data.attr.mode = SYSFS_RW;
- node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
- node_array[i].sysfs_data.flags = 0;
- node_array[i].sysfs_data.data.integer.variable =
- (int *) &node_array[i].current_message;
- node_array[i].sysfs_data.data.integer.minimum = 0;
- node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
- node_array[i].sysfs_data.write_side_effect =
- node_write_side_effect;
- toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
- }
-
- toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
-
- if (toi_cluster_ops.enabled)
- toi_cluster_open_iface();
-
- return temp;
-}
-
-EXIT void toi_cluster_exit(void)
-{
- int i;
- toi_cluster_close_iface();
-
- for (i = 0; i < MAX_LOCAL_NODES; i++)
- toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
- &node_array[i].sysfs_data);
- toi_unregister_module(&toi_cluster_ops);
-}
-
-static int __init toi_cluster_iface_setup(char *iface)
-{
- toi_cluster_ops.enabled = (*iface &&
- strcmp(iface, "off"));
-
- if (toi_cluster_ops.enabled)
- strlcpy(toi_cluster_iface, iface, IFNAMSIZ);
-
- return 1;
-}
-
-__setup("toi_cluster=", toi_cluster_iface_setup);
diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h
deleted file mode 100644
index 84356b304..000000000
--- a/kernel/power/tuxonice_cluster.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * kernel/power/tuxonice_cluster.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifdef CONFIG_TOI_CLUSTER
-extern int toi_cluster_init(void);
-extern void toi_cluster_exit(void);
-extern void toi_initiate_cluster_hibernate(void);
-#else
-static inline int toi_cluster_init(void) { return 0; }
-static inline void toi_cluster_exit(void) { }
-static inline void toi_initiate_cluster_hibernate(void) { }
-#endif
-
diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c
deleted file mode 100644
index 84b85226d..000000000
--- a/kernel/power/tuxonice_compress.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * kernel/power/tuxonice_compress.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains data compression routines for TuxOnIce,
- * using cryptoapi.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-#include <linux/crypto.h>
-
-#include "tuxonice_builtin.h"
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-
-static int toi_expected_compression;
-
-static struct toi_module_ops toi_compression_ops;
-static struct toi_module_ops *next_driver;
-
-static char toi_compressor_name[32] = "lzo";
-
-static DEFINE_MUTEX(stats_lock);
-
-struct cpu_context {
- u8 *page_buffer;
- struct crypto_comp *transform;
- unsigned int len;
- u8 *buffer_start;
- u8 *output_buffer;
-};
-
-#define OUT_BUF_SIZE (2 * PAGE_SIZE)
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-
-/*
- * toi_crypto_prepare
- *
- * Prepare to do some work by allocating buffers and transforms.
- */
-static int toi_compress_crypto_prepare(void)
-{
- int cpu;
-
- if (!*toi_compressor_name) {
- printk(KERN_INFO "TuxOnIce: Compression enabled but no "
- "compressor name set.\n");
- return 1;
- }
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0);
- if (IS_ERR(this->transform)) {
- printk(KERN_INFO "TuxOnIce: Failed to initialise the "
- "%s compression transform.\n",
- toi_compressor_name);
- this->transform = NULL;
- return 1;
- }
-
-		this->page_buffer =
-			(u8 *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP);
-
- if (!this->page_buffer) {
- printk(KERN_ERR
- "Failed to allocate a page buffer for TuxOnIce "
- "compression driver.\n");
- return -ENOMEM;
- }
-
-		this->output_buffer =
-			(u8 *) vmalloc_32(OUT_BUF_SIZE);
-
-		if (!this->output_buffer) {
-			printk(KERN_ERR
-			       "Failed to allocate an output buffer for TuxOnIce "
-			       "compression driver.\n");
- return -ENOMEM;
- }
- }
-
- return 0;
-}
-
-static int toi_compress_rw_cleanup(int writing)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- if (this->transform) {
- crypto_free_comp(this->transform);
- this->transform = NULL;
- }
-
- if (this->page_buffer)
- toi_free_page(16, (unsigned long) this->page_buffer);
-
- this->page_buffer = NULL;
-
- if (this->output_buffer)
- vfree(this->output_buffer);
-
- this->output_buffer = NULL;
- }
-
- return 0;
-}
-
-/*
- * toi_compress_init
- */
-
-static int toi_compress_init(int toi_or_resume)
-{
- if (!toi_or_resume)
- return 0;
-
- toi_compress_bytes_in = 0;
- toi_compress_bytes_out = 0;
-
- next_driver = toi_get_next_filter(&toi_compression_ops);
-
- return next_driver ? 0 : -ECHILD;
-}
-
-/*
- * toi_compress_rw_init()
- */
-
-static int toi_compress_rw_init(int rw, int stream_number)
-{
- if (toi_compress_crypto_prepare()) {
- printk(KERN_ERR "Failed to initialise compression "
- "algorithm.\n");
- if (rw == READ) {
- printk(KERN_INFO "Unable to read the image.\n");
- return -ENODEV;
- } else {
- printk(KERN_INFO "Continuing without "
- "compressing the image.\n");
- toi_compression_ops.enabled = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_compress_write_page()
- *
- * Compress a page of data, buffering output and passing on filled
- * pages to the next module in the pipeline.
- *
- * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing
- * data to be compressed.
- *
- * Returns: 0 on success. Otherwise the error is that returned by later
- *		modules, -ECHILD if we have a broken pipeline or -EIO if
- *		the compression transform errs.
- */
-static int toi_compress_write_page(unsigned long index, int buf_type,
- void *buffer_page, unsigned int buf_size)
-{
- int ret = 0, cpu = smp_processor_id();
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
-	u8 *output_buffer = buffer_page;
- int output_len = buf_size;
- int out_buf_type = buf_type;
-
- if (ctx->transform) {
-
- ctx->buffer_start = TOI_MAP(buf_type, buffer_page);
- ctx->len = OUT_BUF_SIZE;
-
- ret = crypto_comp_compress(ctx->transform,
- ctx->buffer_start, buf_size,
- ctx->output_buffer, &ctx->len);
-
- TOI_UNMAP(buf_type, buffer_page);
-
- toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
- "CPU %d, index %lu: %d bytes",
- cpu, index, ctx->len);
-
- if (!ret && ctx->len < buf_size) { /* some compression */
- output_buffer = ctx->output_buffer;
- output_len = ctx->len;
- out_buf_type = TOI_VIRT;
- }
-
- }
-
- mutex_lock(&stats_lock);
-
- toi_compress_bytes_in += buf_size;
- toi_compress_bytes_out += output_len;
-
- mutex_unlock(&stats_lock);
-
- if (!ret)
- ret = next_driver->write_page(index, out_buf_type,
- output_buffer, output_len);
-
- return ret;
-}
-
-/*
- * toi_compress_read_page()
- * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
- *
- * Retrieve data from later modules and decompress it until the input buffer
- * is filled.
- * Returns: Zero if successful; otherwise an error from this module or from
- * a downstream module.
- */
-static int toi_compress_read_page(unsigned long *index, int buf_type,
- void *buffer_page, unsigned int *buf_size)
-{
- int ret, cpu = smp_processor_id();
- unsigned int len;
- unsigned int outlen = PAGE_SIZE;
- char *buffer_start;
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
- if (!ctx->transform)
- return next_driver->read_page(index, TOI_PAGE, buffer_page,
- buf_size);
-
- /*
- * All our reads must be synchronous - we can't decompress
- * data that hasn't been read yet.
- */
-
- ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len);
-
-	buffer_start = TOI_MAP(buf_type, buffer_page);
-
- /* Error or uncompressed data */
- if (ret || len == PAGE_SIZE) {
- memcpy(buffer_start, ctx->page_buffer, len);
- goto out;
- }
-
- ret = crypto_comp_decompress(
- ctx->transform,
- ctx->page_buffer,
- len, buffer_start, &outlen);
-
- toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
- "CPU %d, index %lu: %d=>%d (%d).",
- cpu, *index, len, outlen, ret);
-
- if (ret)
- abort_hibernate(TOI_FAILED_IO,
- "Compress_read returned %d.\n", ret);
- else if (outlen != PAGE_SIZE) {
- abort_hibernate(TOI_FAILED_IO,
- "Decompression yielded %d bytes instead of %ld.\n",
- outlen, PAGE_SIZE);
- printk(KERN_ERR "Decompression yielded %d bytes instead of "
- "%ld.\n", outlen, PAGE_SIZE);
- ret = -EIO;
- *buf_size = outlen;
- }
-out:
- TOI_UNMAP(buf_type, buffer_page);
- return ret;
-}
-
-/*
- * toi_compress_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Print information to be recorded for debugging purposes into a buffer.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_compress_print_debug_stats(char *buffer, int size)
-{
- unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT,
- pages_out = toi_compress_bytes_out >> PAGE_SHIFT;
- int len;
-
- /* Output the compression ratio achieved. */
- if (*toi_compressor_name)
- len = scnprintf(buffer, size, "- Compressor is '%s'.\n",
- toi_compressor_name);
- else
- len = scnprintf(buffer, size, "- Compressor is not set.\n");
-
- if (pages_in)
- len += scnprintf(buffer+len, size - len, " Compressed "
- "%lu bytes into %lu (%ld percent compression).\n",
- toi_compress_bytes_in,
- toi_compress_bytes_out,
- (pages_in - pages_out) * 100 / pages_in);
- return len;
-}
-
-/*
- * toi_compress_memory_needed
- *
- * Tell the caller how much memory we need to operate during hibernate/resume.
- * Returns: Int. Maximum number of bytes of memory required for
- * operation.
- */
-static int toi_compress_memory_needed(void)
-{
- return 2 * PAGE_SIZE;
-}
-
-static int toi_compress_storage_needed(void)
-{
- return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
- strlen(toi_compressor_name) + 1;
-}
-
-/*
- * toi_compress_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save information needed when reloading the image at resume time.
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_compress_save_config_info(char *buffer)
-{
- int len = strlen(toi_compressor_name) + 1, offset = 0;
-
- *((unsigned long *) buffer) = toi_compress_bytes_in;
- offset += sizeof(unsigned long);
- *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out;
- offset += sizeof(unsigned long);
- *((int *) (buffer + offset)) = toi_expected_compression;
- offset += sizeof(int);
- *((int *) (buffer + offset)) = len;
- offset += sizeof(int);
- strncpy(buffer + offset, toi_compressor_name, len);
- return offset + len;
-}
-
-/* toi_compress_load_config_info
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * Description: Reload information needed for decompressing the image at
- * resume time.
- */
-static void toi_compress_load_config_info(char *buffer, int size)
-{
- int len, offset = 0;
-
- toi_compress_bytes_in = *((unsigned long *) buffer);
- offset += sizeof(unsigned long);
- toi_compress_bytes_out = *((unsigned long *) (buffer + offset));
- offset += sizeof(unsigned long);
- toi_expected_compression = *((int *) (buffer + offset));
- offset += sizeof(int);
- len = *((int *) (buffer + offset));
- offset += sizeof(int);
- strncpy(toi_compressor_name, buffer + offset, len);
-}
-
-static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
- bkd->compress_bytes_in = toi_compress_bytes_in;
- bkd->compress_bytes_out = toi_compress_bytes_out;
-}
-
-static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
- toi_compress_bytes_in = bkd->compress_bytes_in;
- toi_compress_bytes_out = bkd->compress_bytes_out;
-}
-
-/*
- * toi_expected_compression_ratio
- *
- * Description: Returns the expected ratio between data passed into this module
- * and the amount of data output when writing.
- * Returns: 100 if the module is disabled. Otherwise the value set by the
- * user via our sysfs entry.
- */
-
-static int toi_compress_expected_ratio(void)
-{
- if (!toi_compression_ops.enabled)
- return 100;
- else
- return 100 - toi_expected_compression;
-}
-
-/*
- * data for our sysfs entries.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression,
- 0, 99, 0, NULL),
- SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0,
- NULL),
- SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL),
-};
-
-/*
- * Ops structure.
- */
-static struct toi_module_ops toi_compression_ops = {
- .type = FILTER_MODULE,
- .name = "compression",
- .directory = "compression",
- .module = THIS_MODULE,
- .initialise = toi_compress_init,
- .memory_needed = toi_compress_memory_needed,
- .print_debug_info = toi_compress_print_debug_stats,
- .save_config_info = toi_compress_save_config_info,
- .load_config_info = toi_compress_load_config_info,
- .storage_needed = toi_compress_storage_needed,
- .expected_compression = toi_compress_expected_ratio,
-
- .pre_atomic_restore = toi_compress_pre_atomic_restore,
- .post_atomic_restore = toi_compress_post_atomic_restore,
-
- .rw_init = toi_compress_rw_init,
- .rw_cleanup = toi_compress_rw_cleanup,
-
- .write_page = toi_compress_write_page,
- .read_page = toi_compress_read_page,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-
-static __init int toi_compress_load(void)
-{
- return toi_register_module(&toi_compression_ops);
-}
-
-late_initcall(toi_compress_load);
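The filter above drives the kernel's legacy crypto_comp interface. A minimal, self-contained module sketch of the same round trip (illustrative names, error handling trimmed) might look like this:

    #include <linux/crypto.h>
    #include <linux/module.h>

    /* Compress and decompress a small buffer with the same legacy
     * cryptoapi calls the compression filter uses. */
    static int __init comp_demo_init(void)
    {
            struct crypto_comp *tfm = crypto_alloc_comp("lzo", 0, 0);
            u8 src[64] = "hello hello hello hello", comp[128], out[64];
            unsigned int clen = sizeof(comp), dlen = sizeof(out);
            int ret;

            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            ret = crypto_comp_compress(tfm, src, sizeof(src), comp, &clen);
            if (!ret)
                    ret = crypto_comp_decompress(tfm, comp, clen, out, &dlen);

            pr_info("comp_demo: ret=%d, %zu->%u->%u bytes\n",
                    ret, sizeof(src), clen, dlen);
            crypto_free_comp(tfm);
            return ret;
    }

    static void __exit comp_demo_exit(void)
    {
    }

    module_init(comp_demo_init);
    module_exit(comp_demo_exit);
    MODULE_LICENSE("GPL");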
diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c
deleted file mode 100644
index eb627915e..000000000
--- a/kernel/power/tuxonice_copy_before_write.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * kernel/power/tuxonice_copy_before_write.c
- *
- * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines (apart from the fault handling code) to deal with allocating memory
- * for copying pages before they are modified, restoring the contents and getting
- * the contents written to disk.
- */
-
-#include <linux/percpu-defs.h>
-#include <linux/sched.h>
-#include <linux/tuxonice.h>
-#include "tuxonice_alloc.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice.h"
-
-DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states);
-#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw))
-#define toi_cbw_pool_size 100
-
-static void _toi_free_cbw_data(struct toi_cbw_state *state)
-{
- struct toi_cbw *page_ptr, *ptr, *next;
-
- page_ptr = ptr = state->first;
-
-	while (ptr) {
- next = ptr->next;
-
- if (ptr->virt) {
- toi__free_page(40, virt_to_page(ptr->virt));
- }
- if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) {
- /* Must be on a new page - free the previous one. */
- toi__free_page(40, virt_to_page(page_ptr));
- page_ptr = ptr;
- }
- ptr = next;
- }
-
- if (page_ptr) {
- toi__free_page(40, virt_to_page(page_ptr));
- }
-
- state->first = state->next = state->last = NULL;
- state->size = 0;
-}
-
-void toi_free_cbw_data(void)
-{
- int i;
-
- for_each_online_cpu(i) {
- struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
-
- if (!state->first)
- continue;
-
- state->enabled = 0;
-
- while (state->active) {
- schedule();
- }
-
- _toi_free_cbw_data(state);
- }
-}
-
-static int _toi_allocate_cbw_data(struct toi_cbw_state *state)
-{
-	while (state->size < toi_cbw_pool_size) {
- int i;
- struct toi_cbw *ptr;
-
- ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL);
-
- if (!ptr) {
- return -ENOMEM;
- }
-
- if (!state->first) {
- state->first = state->next = state->last = ptr;
- }
-
- for (i = 0; i < CBWS_PER_PAGE; i++) {
- struct toi_cbw *cbw = &ptr[i];
-
- cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL);
- if (!cbw->virt) {
- state->size += i;
-				printk(KERN_ERR "Out of memory allocating CBW pages.\n");
- return -ENOMEM;
- }
-
- if (cbw == state->first)
- continue;
-
- state->last->next = cbw;
- state->last = cbw;
- }
-
- state->size += CBWS_PER_PAGE;
- }
-
- state->enabled = 1;
-
- return 0;
-}
-
-int toi_allocate_cbw_data(void)
-{
- int i, result;
-
- for_each_online_cpu(i) {
- struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
-
- result = _toi_allocate_cbw_data(state);
-
- if (result)
- return result;
- }
-
- return 0;
-}
-
-void toi_cbw_restore(void)
-{
- if (!toi_keeping_image)
- return;
-
-}
-
-void toi_cbw_write(void)
-{
- if (!toi_keeping_image)
- return;
-
-}
-
-/**
- * toi_cbw_test_read - Test copy before write on one page
- *
- * Allocate copy before write buffers, then make one page only copy-before-write
- * and attempt to write to it. We should then be able to retrieve the original
- * version from the cbw buffer and the modified version from the page itself.
- */
-static int toi_cbw_test_read(const char *buffer, int count)
-{
-	unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL);
-	char *original = "Original contents";
-	char *modified = "Modified material";
-	struct page *page;
-	unsigned long pfn;
-	int i, len = 0, found = 0;
-
-	/* Check the allocation before deriving page and pfn from it */
-	if (!virt) {
-		printk(KERN_ERR "toi_cbw_test_read: Unable to allocate a page for testing.\n");
-		return -ENOMEM;
-	}
-
-	page = virt_to_page(virt);
-	pfn = page_to_pfn(page);
-
- memcpy((char *) virt, original, strlen(original));
-
-	if (toi_allocate_cbw_data()) {
-		printk(KERN_ERR "toi_cbw_test_read: Unable to allocate cbw data.\n");
-		toi_free_page(40, virt);
-		return -ENOMEM;
-	}
-
- toi_reset_dirtiness_one(pfn, 0);
-
- SetPageTOI_CBW(page);
-
- memcpy((char *) virt, modified, strlen(modified));
-
- if (strncmp((char *) virt, modified, strlen(modified))) {
- len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n");
- }
-
- for_each_online_cpu(i) {
- struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
- struct toi_cbw *ptr = state->first, *last_ptr = ptr;
-
- if (!found) {
- while (ptr) {
- if (ptr->pfn == pfn) {
- found = 1;
- if (strncmp(ptr->virt, original, strlen(original))) {
- len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n");
- } else {
- len += sprintf((char *) buffer + len, "Test passed. Buffer changed and original contents preserved.\n");
- }
- break;
- }
-
- last_ptr = ptr;
- ptr = ptr->next;
- }
- }
-
- if (!last_ptr)
- len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i);
- }
-
- if (!found)
- len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n");
-
- toi_free_cbw_data();
-
- return len;
-}
-
-/*
- * This array contains entries that are automatically registered at
- * boot. Modules and the console code register their own entries separately.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read,
- NULL, SYSFS_NEEDS_SM_FOR_READ, NULL),
-};
-
-static struct toi_module_ops toi_cbw_ops = {
- .type = MISC_HIDDEN_MODULE,
- .name = "copy_before_write debugging",
- .directory = "cbw",
- .module = THIS_MODULE,
- .early = 1,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-int toi_cbw_init(void)
-{
- int result = toi_register_module(&toi_cbw_ops);
- return result;
-}
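The allocator above keeps all of its bookkeeping in per-CPU state so writers never contend on a shared lock. A stripped-down sketch of that pattern (demo_state is hypothetical, not from the original file):

    #include <linux/percpu.h>
    #include <linux/percpu-defs.h>
    #include <linux/cpumask.h>

    /* One private instance per CPU, initialised by walking the
     * online CPUs - the same shape as toi_cbw_states above. */
    struct demo_state {
            int size;
            int enabled;
    };

    static DEFINE_PER_CPU(struct demo_state, demo_states);

    static void demo_enable_all(void)
    {
            int cpu;

            for_each_online_cpu(cpu) {
                    struct demo_state *state = &per_cpu(demo_states, cpu);

                    state->size = 0;
                    state->enabled = 1;
            }
    }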
diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c
deleted file mode 100644
index 522c836ad..000000000
--- a/kernel/power/tuxonice_extent.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * kernel/power/tuxonice_extent.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * These functions encapsulate the manipulation of storage metadata.
- */
-
-#include <linux/suspend.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-#include "tuxonice.h"
-
-/**
- * toi_get_extent - return a free extent
- *
- * May fail, returning NULL instead.
- **/
-static struct hibernate_extent *toi_get_extent(void)
-{
- return (struct hibernate_extent *) toi_kzalloc(2,
- sizeof(struct hibernate_extent), TOI_ATOMIC_GFP);
-}
-
-/**
- * toi_put_extent_chain_from - free a chain of extents from value 'from' on
- * @chain: Chain to free.
- * @from: First extent value to free.
- *
- * Note that 'from' is an extent value, and may be part way through an extent.
- * In this case, the extent should be truncated (if necessary) and following
- * extents freed.
- **/
-void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from)
-{
- struct hibernate_extent *this;
-
- this = chain->first;
-
- while (this) {
- struct hibernate_extent *next = this->next;
-
-		/* Delete the whole extent? */
-		if (this->start >= from) {
-			chain->size -= (this->end - this->start + 1);
-			if (chain->first == this)
-				chain->first = next;
-			if (chain->last_touched == this)
-				chain->last_touched = NULL;
-			if (chain->current_extent == this)
-				chain->current_extent = NULL;
-			toi_kfree(2, this, sizeof(*this));
-			chain->num_extents--;
-		} else if (this->end >= from) {
-			/* Truncate the extent: keep [start, from - 1] */
-			chain->size -= (this->end - from + 1);
-			this->end = from - 1;
- }
- this = next;
- }
-}
-
-/**
- * toi_put_extent_chain - free a whole chain of extents
- * @chain: Chain to free.
- **/
-void toi_put_extent_chain(struct hibernate_extent_chain *chain)
-{
- toi_put_extent_chain_from(chain, 0);
-}
-
-/**
- * toi_add_to_extent_chain - add an extent to an existing chain
- * @chain: Chain to which the extent should be added
- * @start: Start of the extent (first physical block)
- * @end: End of the extent (last physical block)
- *
- * The chain information is updated if the insertion is successful.
- **/
-int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
- unsigned long start, unsigned long end)
-{
- struct hibernate_extent *new_ext = NULL, *cur_ext = NULL;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0,
- "Adding extent %lu-%lu to chain %p.\n", start, end, chain);
-
- /* Find the right place in the chain */
- if (chain->last_touched && chain->last_touched->start < start)
- cur_ext = chain->last_touched;
- else if (chain->first && chain->first->start < start)
- cur_ext = chain->first;
-
- if (cur_ext) {
- while (cur_ext->next && cur_ext->next->start < start)
- cur_ext = cur_ext->next;
-
- if (cur_ext->end == (start - 1)) {
- struct hibernate_extent *next_ext = cur_ext->next;
- cur_ext->end = end;
-
- /* Merge with the following one? */
- if (next_ext && cur_ext->end + 1 == next_ext->start) {
- cur_ext->end = next_ext->end;
- cur_ext->next = next_ext->next;
- toi_kfree(2, next_ext, sizeof(*next_ext));
- chain->num_extents--;
- }
-
- chain->last_touched = cur_ext;
- chain->size += (end - start + 1);
-
- return 0;
- }
- }
-
- new_ext = toi_get_extent();
- if (!new_ext) {
-		printk(KERN_ERR "Error: unable to append a new extent to the "
-				"chain.\n");
- return -ENOMEM;
- }
-
- chain->num_extents++;
- chain->size += (end - start + 1);
- new_ext->start = start;
- new_ext->end = end;
-
- chain->last_touched = new_ext;
-
- if (cur_ext) {
- new_ext->next = cur_ext->next;
- cur_ext->next = new_ext;
- } else {
- if (chain->first)
- new_ext->next = chain->first;
- chain->first = new_ext;
- }
-
- return 0;
-}
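To see the merge behaviour of toi_add_to_extent_chain in isolation, here is a small userspace model (simplified structures, no error handling, illustrative only) demonstrating that an extent starting immediately after its predecessor is coalesced rather than appended:

    #include <stdio.h>
    #include <stdlib.h>

    struct extent { unsigned long start, end; struct extent *next; };
    struct chain { struct extent *first, *last; int num; };

    /* Append [start, end]; extend the tail in place when contiguous. */
    static void add(struct chain *c, unsigned long start, unsigned long end)
    {
            struct extent *e;

            if (c->last && c->last->end + 1 == start) {
                    c->last->end = end;     /* merge with predecessor */
                    return;
            }
            e = calloc(1, sizeof(*e));
            e->start = start;
            e->end = end;
            if (c->last)
                    c->last->next = e;
            else
                    c->first = e;
            c->last = e;
            c->num++;
    }

    int main(void)
    {
            struct chain c = { 0 };

            add(&c, 0, 4);
            add(&c, 5, 9);    /* contiguous: merged, not appended */
            add(&c, 20, 24);  /* gap: genuinely new extent */
            printf("%d extents, first covers %lu-%lu\n",
                   c.num, c.first->start, c.first->end);
            return 0;         /* prints: 2 extents, first covers 0-9 */
    }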
diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h
deleted file mode 100644
index aeccf1f5e..000000000
--- a/kernel/power/tuxonice_extent.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * kernel/power/tuxonice_extent.h
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations related to extents. Extents are
- * TuxOnIce's method of storing some of the metadata for the image.
- * See tuxonice_extent.c for more info.
- *
- */
-
-#include "tuxonice_modules.h"
-
-#ifndef EXTENT_H
-#define EXTENT_H
-
-struct hibernate_extent {
- unsigned long start, end;
- struct hibernate_extent *next;
-};
-
-struct hibernate_extent_chain {
-	unsigned long size; /* size of the chain, i.e. sum of (end - start + 1) */
- int num_extents;
- struct hibernate_extent *first, *last_touched;
- struct hibernate_extent *current_extent;
- unsigned long current_offset;
-};
-
-/* Simplify iterating through all the values in an extent chain */
-#define toi_extent_for_each(extent_chain, extentpointer, value) \
-if ((extent_chain)->first) \
- for ((extentpointer) = (extent_chain)->first, (value) = \
- (extentpointer)->start; \
- ((extentpointer) && ((extentpointer)->next || (value) <= \
- (extentpointer)->end)); \
- (((value) == (extentpointer)->end) ? \
- ((extentpointer) = (extentpointer)->next, (value) = \
- ((extentpointer) ? (extentpointer)->start : 0)) : \
- (value)++))
-
-extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from);
-#endif
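The toi_extent_for_each macro walks every value of every extent in order; a quick userspace check (structures re-declared locally, illustrative only):

    #include <stdio.h>

    struct hibernate_extent {
            unsigned long start, end;
            struct hibernate_extent *next;
    };
    struct hibernate_extent_chain {
            struct hibernate_extent *first;
    };

    #define toi_extent_for_each(extent_chain, extentpointer, value) \
    if ((extent_chain)->first) \
            for ((extentpointer) = (extent_chain)->first, (value) = \
                            (extentpointer)->start; \
                 ((extentpointer) && ((extentpointer)->next || (value) <= \
                            (extentpointer)->end)); \
                 (((value) == (extentpointer)->end) ? \
                    ((extentpointer) = (extentpointer)->next, (value) = \
                            ((extentpointer) ? (extentpointer)->start : 0)) : \
                    (value)++))

    int main(void)
    {
            struct hibernate_extent e2 = { 7, 8, NULL };
            struct hibernate_extent e1 = { 1, 3, &e2 };
            struct hibernate_extent_chain chain = { &e1 };
            struct hibernate_extent *ep;
            unsigned long value;

            toi_extent_for_each(&chain, ep, value)
                    printf("%lu ", value);  /* prints: 1 2 3 7 8 */
            printf("\n");
            return 0;
    }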
diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c
deleted file mode 100644
index baf191211..000000000
--- a/kernel/power/tuxonice_file.c
+++ /dev/null
@@ -1,484 +0,0 @@
-/*
- * kernel/power/tuxonice_file.c
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file encapsulates functions for usage of a simple file as a
- * backing store. It is based upon the swapallocator, and shares the
- * same basic working. Here, though, we have nothing to do with
- * swapspace, and only one device to worry about.
- *
- * The user can just
- *
- * echo TuxOnIce > /path/to/my_file
- *
- * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file
- *
- * and
- *
- * echo /path/to/my_file > /sys/power/tuxonice/file/target
- *
- * then put what they find in /sys/power/tuxonice/resume
- * as their resume= parameter in lilo.conf (and rerun lilo if using it).
- *
- * Having done this, they're ready to hibernate and resume.
- *
- * TODO:
- * - File resizing.
- */
-
-#include <linux/blkdev.h>
-#include <linux/mount.h>
-#include <linux/fs.h>
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_io.h"
-
-#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
-
-static struct toi_module_ops toi_fileops;
-
-static struct file *target_file;
-static struct block_device *toi_file_target_bdev;
-static unsigned long pages_available, pages_allocated;
-static char toi_file_target[256];
-static struct inode *target_inode;
-static int file_target_priority;
-static int used_devt;
-static int target_claim;
-static dev_t toi_file_dev_t;
-static int sig_page_index;
-
-/* For test_toi_file_target */
-static struct toi_bdev_info *file_chain;
-
-static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num)
-{
- int j;
- sector_t last = 0;
-
- for (j = 0; j < dev_info->blocks_per_page; j++) {
- sector_t this = bmap(target_inode,
- page_num * dev_info->blocks_per_page + j);
-
- if (!this || (last && (last + 1) != this))
- break;
-
- last = this;
- }
-
- return j == dev_info->blocks_per_page;
-}
-
-static unsigned long get_usable_pages(struct toi_bdev_info *dev_info)
-{
- unsigned long result = 0;
- struct block_device *bdev = dev_info->bdev;
- int i;
-
- switch (target_inode->i_mode & S_IFMT) {
- case S_IFSOCK:
- case S_IFCHR:
- case S_IFIFO: /* Socket, Char, Fifo */
- return -1;
- case S_IFREG: /* Regular file: current size - holes + free
- space on part */
- for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) {
- if (has_contiguous_blocks(dev_info, i))
- result++;
- }
- break;
- case S_IFBLK: /* Block device */
- if (!bdev->bd_disk) {
- toi_message(TOI_IO, TOI_VERBOSE, 0,
- "bdev->bd_disk null.");
- return 0;
- }
-
- result = (bdev->bd_part ?
- bdev->bd_part->nr_sects :
-			  get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9);
-		break;
- }
-
- return result;
-}
-
-static int toi_file_register_storage(void)
-{
- struct toi_bdev_info *devinfo;
- int result = 0;
- struct fs_info *fs_info;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage.");
- if (!strlen(toi_file_target)) {
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: "
- "No target filename set.");
- return 0;
- }
-
- target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0);
- toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.",
- toi_file_target, target_file);
-
- if (IS_ERR(target_file) || !target_file) {
- target_file = NULL;
- toi_file_dev_t = name_to_dev_t(toi_file_target);
- if (!toi_file_dev_t) {
- struct kstat stat;
- int error = vfs_stat(toi_file_target, &stat);
- printk(KERN_INFO "Open file %s returned %p and "
-				"name_to_dev_t failed.\n",
- toi_file_target, target_file);
- if (error) {
-				printk(KERN_INFO "stat() on the file also failed."
- " Nothing more we can do.\n");
- return 0;
- } else
- toi_file_dev_t = stat.rdev;
- }
-
- toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t);
- if (IS_ERR(toi_file_target_bdev)) {
- printk(KERN_INFO "Got a dev_num (%lx) but failed to "
- "open it.\n",
- (unsigned long) toi_file_dev_t);
- toi_file_target_bdev = NULL;
- return 0;
- }
- used_devt = 1;
- target_inode = toi_file_target_bdev->bd_inode;
- } else
- target_inode = target_file->f_mapping->host;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target.");
- if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) ||
- S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) {
- printk(KERN_INFO "File support works with regular files,"
- " character files and block devices.\n");
- /* Cleanup routine will undo the above */
- return 0;
- }
-
- if (!used_devt) {
- if (S_ISBLK(target_inode->i_mode)) {
- toi_file_target_bdev = I_BDEV(target_inode);
- if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE |
- FMODE_READ, NULL))
- target_claim = 1;
- } else
- toi_file_target_bdev = target_inode->i_sb->s_bdev;
- if (!toi_file_target_bdev) {
- printk(KERN_INFO "%s is not a valid file allocator "
- "target.\n", toi_file_target);
- return 0;
- }
- toi_file_dev_t = toi_file_target_bdev->bd_dev;
- }
-
- devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC);
- if (!devinfo) {
-		printk(KERN_ERR "Failed to allocate a toi_bdev_info struct for the file allocator.\n");
- return -ENOMEM;
- }
-
- devinfo->bdev = toi_file_target_bdev;
- devinfo->allocator = &toi_fileops;
- devinfo->allocator_index = 0;
-
- fs_info = fs_info_from_block_dev(toi_file_target_bdev);
- if (fs_info && !IS_ERR(fs_info)) {
- memcpy(devinfo->uuid, &fs_info->uuid, 16);
- free_fs_info(fs_info);
- } else
- result = (int) PTR_ERR(fs_info);
-
- /* Unlike swap code, only complain if fs_info_from_block_dev returned
- * -ENOMEM. The 'file' might be a full partition, so might validly not
- * have an identifiable type, UUID etc.
- */
- if (result)
- printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n",
- result);
- devinfo->dev_t = toi_file_dev_t;
- devinfo->prio = file_target_priority;
- devinfo->bmap_shift = target_inode->i_blkbits - 9;
- devinfo->blocks_per_page =
- (1 << (PAGE_SHIFT - target_inode->i_blkbits));
- sprintf(devinfo->name, "file %s", toi_file_target);
- file_chain = devinfo;
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap "
- "shift is %d. Blocks per page %d.",
- devinfo->dev_t, devinfo->prio, devinfo->bmap_shift,
- devinfo->blocks_per_page);
-
- /* Keep one aside for the signature */
- pages_available = get_usable_pages(devinfo) - 1;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu "
- "pages.", pages_available);
-
- toi_bio_ops.register_storage(devinfo);
- return 0;
-}
-
-static unsigned long toi_file_storage_available(void)
-{
- return pages_available;
-}
-
-static int toi_file_allocate_storage(struct toi_bdev_info *chain,
- unsigned long request)
-{
- unsigned long available = pages_available - pages_allocated;
- unsigned long to_add = min(available, request);
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated "
- "is %lu. Allocating %lu pages from file.",
- pages_available, pages_allocated, to_add);
- pages_allocated += to_add;
-
- return to_add;
-}
-
-/**
- * __populate_block_list - add an extent to the chain
- * @min: Start of the extent (first physical block = sector)
- * @max: End of the extent (last physical block = sector)
- *
- * If TOI_TEST_BIO is set, print a debug message, outputting the min and max
- * fs block numbers.
- **/
-static int __populate_block_list(struct toi_bdev_info *chain, int min, int max)
-{
- if (test_action_state(TOI_TEST_BIO))
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.",
- min << chain->bmap_shift,
- ((max + 1) << chain->bmap_shift) - 1);
-
- return toi_add_to_extent_chain(&chain->blocks, min, max);
-}
-
-static int get_main_pool_phys_params(struct toi_bdev_info *chain)
-{
- int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0;
- unsigned long pages_mapped = 0;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks.");
-
- if (chain->blocks.first)
- toi_put_extent_chain(&chain->blocks);
-
- if (!target_is_normal_file()) {
- result = (pages_available > 0) ?
- __populate_block_list(chain, chain->blocks_per_page,
- (pages_allocated + 1) *
- chain->blocks_per_page - 1) : 0;
- return result;
- }
-
- /*
- * FIXME: We are assuming the first page is contiguous. Is that
- * assumption always right?
- */
-
- for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
- sector_t new_sector;
-
- if (!has_contiguous_blocks(chain, i))
- continue;
-
- if (!have_sig_page) {
- have_sig_page = 1;
- sig_page_index = i;
- continue;
- }
-
- pages_mapped++;
-
- /* Ignore first page - it has the header */
- if (pages_mapped == 1)
- continue;
-
- new_sector = bmap(target_inode, (i * chain->blocks_per_page));
-
- /*
- * I'd love to be able to fill in holes and resize
- * files, but not yet...
- */
-
- if (new_sector == extent_max + 1)
- extent_max += chain->blocks_per_page;
- else {
- if (extent_min > -1) {
- result = __populate_block_list(chain,
- extent_min, extent_max);
- if (result)
- return result;
- }
-
- extent_min = new_sector;
- extent_max = extent_min +
- chain->blocks_per_page - 1;
- }
-
- if (pages_mapped == pages_allocated)
- break;
- }
-
- if (extent_min > -1) {
- result = __populate_block_list(chain, extent_min, extent_max);
- if (result)
- return result;
- }
-
- return 0;
-}
-
-static void toi_file_free_storage(struct toi_bdev_info *chain)
-{
- pages_allocated = 0;
- file_chain = NULL;
-}
-
-/**
- * toi_file_print_debug_stats - print debug info
- * @buffer: Buffer to data to populate
- * @size: Size of the buffer
- **/
-static int toi_file_print_debug_stats(char *buffer, int size)
-{
- int len = scnprintf(buffer, size, "- File Allocator active.\n");
-
- len += scnprintf(buffer+len, size-len, " Storage available for "
- "image: %lu pages.\n", pages_available);
-
- return len;
-}
-
-static void toi_file_cleanup(int finishing_cycle)
-{
- if (toi_file_target_bdev) {
- if (target_claim) {
- blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
- target_claim = 0;
- }
-
- if (used_devt) {
- blkdev_put(toi_file_target_bdev,
- FMODE_READ | FMODE_NDELAY);
- used_devt = 0;
- }
- toi_file_target_bdev = NULL;
- target_inode = NULL;
- }
-
- if (target_file) {
- filp_close(target_file, NULL);
- target_file = NULL;
- }
-
- pages_available = 0;
-}
-
-/**
- * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target
- *
- * Test whether the target file is valid for hibernating.
- **/
-static void test_toi_file_target(void)
-{
- int result = toi_file_register_storage();
- sector_t sector;
- char buf[50];
- struct fs_info *fs_info;
-
- if (result || !file_chain)
- return;
-
- /* This doesn't mean we're in business. Is any storage available? */
- if (!pages_available)
- goto out;
-
- toi_file_allocate_storage(file_chain, 1);
- result = get_main_pool_phys_params(file_chain);
- if (result)
- goto out;
-
- sector = bmap(target_inode, sig_page_index *
- file_chain->blocks_per_page) << file_chain->bmap_shift;
-
- /* Use the uuid, or the dev_t if that fails */
- fs_info = fs_info_from_block_dev(toi_file_target_bdev);
- if (!fs_info || IS_ERR(fs_info)) {
- bdevname(toi_file_target_bdev, buf);
- sprintf(resume_file, "/dev/%s:%llu", buf,
- (unsigned long long) sector);
- } else {
- int i;
- hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0);
-
- /* Remove the spaces */
- for (i = 1; i < 16; i++) {
- buf[2 * i] = buf[3 * i];
- buf[2 * i + 1] = buf[3 * i + 1];
- }
- buf[32] = 0;
- sprintf(resume_file, "UUID=%s:0x%llx", buf,
- (unsigned long long) sector);
- free_fs_info(fs_info);
- }
-
- toi_attempt_to_parse_resume_device(0);
-out:
- toi_file_free_storage(file_chain);
- toi_bio_ops.free_storage();
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256,
- SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target),
- SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL),
- SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095,
- 4096, 0, NULL),
-};
-
-static struct toi_bio_allocator_ops toi_bio_fileops = {
- .register_storage = toi_file_register_storage,
- .storage_available = toi_file_storage_available,
- .allocate_storage = toi_file_allocate_storage,
- .bmap = get_main_pool_phys_params,
- .free_storage = toi_file_free_storage,
-};
-
-static struct toi_module_ops toi_fileops = {
- .type = BIO_ALLOCATOR_MODULE,
- .name = "file storage",
- .directory = "file",
- .module = THIS_MODULE,
- .print_debug_info = toi_file_print_debug_stats,
- .cleanup = toi_file_cleanup,
- .bio_allocator_ops = &toi_bio_fileops,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-static __init int toi_file_load(void)
-{
- return toi_register_module(&toi_fileops);
-}
-
-late_initcall(toi_file_load);
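has_contiguous_blocks() above asks the same question a user can ask from userspace with the FIBMAP ioctl (root required). A minimal sketch, error handling trimmed:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/stat.h>
    #include <linux/fs.h>   /* FIBMAP, FIGETBSZ */

    /* Report whether a file's blocks form one contiguous run on disk. */
    int main(int argc, char **argv)
    {
            int fd = open(argv[1], O_RDONLY);
            int bsz, nblocks, i, blk, prev = -1, contiguous = 1;
            struct stat st;

            if (fd < 0 || fstat(fd, &st) < 0 ||
                ioctl(fd, FIGETBSZ, &bsz) < 0)
                    return 1;

            nblocks = (st.st_size + bsz - 1) / bsz;
            for (i = 0; i < nblocks; i++) {
                    blk = i;  /* in: logical block; out: physical block */
                    if (ioctl(fd, FIBMAP, &blk) < 0)
                            return 1;
                    if (!blk || (prev != -1 && blk != prev + 1))
                            contiguous = 0;  /* hole or discontiguity */
                    prev = blk;
            }
            printf("%s: %s\n", argv[1],
                   contiguous ? "contiguous" : "fragmented");
            close(fd);
            return 0;
    }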
diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c
deleted file mode 100644
index 16cf14cbc..000000000
--- a/kernel/power/tuxonice_highlevel.c
+++ /dev/null
@@ -1,1413 +0,0 @@
-/*
- * kernel/power/tuxonice_highlevel.c
- */
-/** \mainpage TuxOnIce.
- *
- * TuxOnIce provides support for saving and restoring an image of
- * system memory to an arbitrary storage device, either on the local computer,
- * or across some network. The support is entirely OS based, so TuxOnIce
- * works without requiring BIOS, APM or ACPI support. The vast majority of the
- * code is also architecture independent, so it should be very easy to port
- * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem
- * and preemption. Initramfses and initrds are also supported.
- *
- * TuxOnIce uses a modular design, in which the method of storing the image is
- * completely abstracted from the core code, as are transformations on the data
- * such as compression and/or encryption (multiple 'modules' can be used to
- * provide arbitrary combinations of functionality). The user interface is also
- * modular, so that arbitrarily simple or complex interfaces can be used to
- * provide anything from debugging information through to eye candy.
- *
- * \section Copyright
- *
- * TuxOnIce is released under the GPLv2.
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR>
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)<BR>
- *
- * \section Credits
- *
- * Nigel would like to thank the following people for their work:
- *
- * Bernard Blackham <bernard@blackham.com.au><BR>
- * Web page & Wiki administration, some coding. A person without whom
- * TuxOnIce would not be where it is.
- *
- * Michael Frank <mhf@linuxmail.org><BR>
- * Extensive testing and help with improving stability. I was constantly
- * amazed by the quality and quantity of Michael's help.
- *
- * Pavel Machek <pavel@ucw.cz><BR>
- * Modifications, defectiveness pointing, being with Gabor at the very
- * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and
- * 2.5.17. Even though Pavel and I disagree on the direction suspend to
- * disk should take, I appreciate the valuable work he did in helping Gabor
- * get the concept working.
- *
- * ..and of course the myriads of TuxOnIce users who have helped diagnose
- * and fix bugs, made suggestions on how to improve the code, proofread
- * documentation, and donated time and money.
- *
- * Thanks also to corporate sponsors:
- *
 - * <B>Redhat.</B> Sometime employer from May 2006 (my fault, not Redhat's!).
- *
- * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who
- * allowed him to work on TuxOnIce and PM related issues on company time.
- *
- * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct
- * 2003 to Jan 2004.
- *
- * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing
- * maintenance of SMP and Highmem support.
- *
- * <B>OSDL.</B> Provided access to various hardware configurations, and made
- * occasional small donations to the project.
- */
-
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include <linux/freezer.h>
-#include <generated/utsrelease.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
-#include <linux/writeback.h>
-#include <linux/uaccess.h> /* for get/set_fs & KERNEL_DS on i386 */
-#include <linux/bio.h>
-#include <linux/kgdb.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_atomic_copy.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_cluster.h"
-
-/*! Pageset metadata. */
-struct pagedir pagedir2 = {2};
-
-static mm_segment_t oldfs;
-static DEFINE_MUTEX(tuxonice_in_use);
-static int block_dump_save;
-
-int toi_trace_index;
-
-/* Binary signature if an image is present */
-char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c";
-
-unsigned long boot_kernel_data_buffer;
-
-static char *result_strings[] = {
- "Hibernation was aborted",
- "The user requested that we cancel the hibernation",
- "No storage was available",
- "Insufficient storage was available",
- "Freezing filesystems and/or tasks failed",
- "A pre-existing image was used",
- "We would free memory, but image size limit doesn't allow this",
- "Unable to free enough memory to hibernate",
- "Unable to obtain the Power Management Semaphore",
- "A device suspend/resume returned an error",
- "A system device suspend/resume returned an error",
- "The extra pages allowance is too small",
- "We were unable to successfully prepare an image",
- "TuxOnIce module initialisation failed",
- "TuxOnIce module cleanup failed",
- "I/O errors were encountered",
- "Ran out of memory",
- "An error was encountered while reading the image",
- "Platform preparation failed",
- "CPU Hotplugging failed",
- "Architecture specific preparation failed",
- "Pages needed resaving, but we were told to abort if this happens",
- "We can't hibernate at the moment (invalid resume= or filewriter "
- "target?)",
- "A hibernation preparation notifier chain member cancelled the "
- "hibernation",
- "Pre-snapshot preparation failed",
- "Pre-restore preparation failed",
- "Failed to disable usermode helpers",
- "Can't resume from alternate image",
- "Header reservation too small",
- "Device Power Management Preparation failed",
-};
-
-/**
- * toi_finish_anything - cleanup after doing anything
- * @hibernate_or_resume: Whether finishing a cycle or attempt at
- * resuming.
- *
- * This is our basic clean-up routine, matching start_anything below. We
- * call cleanup routines, drop module references and restore process fs and
- * cpus allowed masks, together with the global block_dump variable's value.
- **/
-void toi_finish_anything(int hibernate_or_resume)
-{
- toi_running = 0;
- toi_cleanup_modules(hibernate_or_resume);
- toi_put_modules();
- if (hibernate_or_resume) {
- block_dump = block_dump_save;
- set_cpus_allowed_ptr(current, cpu_all_mask);
- toi_alloc_print_debug_stats();
- atomic_inc(&snapshot_device_available);
- unlock_system_sleep();
- }
-
- set_fs(oldfs);
- mutex_unlock(&tuxonice_in_use);
-}
-
-/**
- * toi_start_anything - basic initialisation for TuxOnIce
- * @hibernate_or_resume: Whether starting a cycle or attempt at resuming.
- *
- * Our basic initialisation routine. Take references on modules, use the
- * kernel segment, recheck resume= if no active allocator is set, initialise
- * modules, save and reset block_dump and ensure we're running on CPU0.
- **/
-int toi_start_anything(int hibernate_or_resume)
-{
- mutex_lock(&tuxonice_in_use);
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
-
- toi_trace_index = 0;
-
- if (hibernate_or_resume) {
- lock_system_sleep();
-
- if (!atomic_add_unless(&snapshot_device_available, -1, 0))
- goto snapshotdevice_unavailable;
- }
-
- if (hibernate_or_resume == SYSFS_HIBERNATE)
- toi_print_modules();
-
- if (toi_get_modules()) {
- printk(KERN_INFO "TuxOnIce: Get modules failed!\n");
- goto prehibernate_err;
- }
-
- if (hibernate_or_resume) {
- block_dump_save = block_dump;
- block_dump = 0;
- set_cpus_allowed_ptr(current,
- cpumask_of(cpumask_first(cpu_online_mask)));
- }
-
- if (toi_initialise_modules_early(hibernate_or_resume))
- goto early_init_err;
-
- if (!toiActiveAllocator)
- toi_attempt_to_parse_resume_device(!hibernate_or_resume);
-
- if (!toi_initialise_modules_late(hibernate_or_resume)) {
- toi_running = 1; /* For the swsusp code we use :< */
- return 0;
- }
-
- toi_cleanup_modules(hibernate_or_resume);
-early_init_err:
- if (hibernate_or_resume) {
-		block_dump = block_dump_save;
- set_cpus_allowed_ptr(current, cpu_all_mask);
- }
- toi_put_modules();
-prehibernate_err:
- if (hibernate_or_resume)
- atomic_inc(&snapshot_device_available);
-snapshotdevice_unavailable:
- if (hibernate_or_resume)
-		unlock_system_sleep();
- set_fs(oldfs);
- mutex_unlock(&tuxonice_in_use);
- return -EBUSY;
-}
-
-/*
- * Nosave page tracking.
- *
- * Here rather than in prepare_image because we want to do it once only at the
- * start of a cycle.
- */
-
-/**
- * mark_nosave_pages - set up our Nosave bitmap
- *
- * Build a bitmap of Nosave pages from the list. The bitmap allows faster
- * use when preparing the image.
- **/
-static void mark_nosave_pages(void)
-{
- struct nosave_region *region;
-
- list_for_each_entry(region, &nosave_regions, list) {
- unsigned long pfn;
-
- for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
- if (pfn_valid(pfn)) {
- SetPageNosave(pfn_to_page(pfn));
- }
- }
-}
-
-/**
- * allocate_bitmaps - allocate bitmaps used to record page states
- *
- * Allocate the bitmaps we use to record the various TuxOnIce related
- * page states.
- **/
-static int allocate_bitmaps(void)
-{
- if (toi_alloc_bitmap(&pageset1_map) ||
- toi_alloc_bitmap(&pageset1_copy_map) ||
- toi_alloc_bitmap(&pageset2_map) ||
- toi_alloc_bitmap(&io_map) ||
- toi_alloc_bitmap(&nosave_map) ||
- toi_alloc_bitmap(&free_map) ||
- toi_alloc_bitmap(&compare_map) ||
- toi_alloc_bitmap(&page_resave_map))
- return 1;
-
- return 0;
-}
-
-/**
- * free_bitmaps - free the bitmaps used to record page states
- *
- * Free the bitmaps allocated above. It is not an error to call
- * memory_bm_free on a bitmap that isn't currently allocated.
- **/
-static void free_bitmaps(void)
-{
- toi_free_bitmap(&pageset1_map);
- toi_free_bitmap(&pageset1_copy_map);
- toi_free_bitmap(&pageset2_map);
- toi_free_bitmap(&io_map);
- toi_free_bitmap(&nosave_map);
- toi_free_bitmap(&free_map);
- toi_free_bitmap(&compare_map);
- toi_free_bitmap(&page_resave_map);
-}
-
-/**
- * io_MB_per_second - return the number of MB/s read or written
- * @write: Whether to return the speed at which we wrote.
- *
- * Calculate the number of megabytes per second that were read or written.
- **/
-static int io_MB_per_second(int write)
-{
- return (toi_bkd.toi_io_time[write][1]) ?
- MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ /
- toi_bkd.toi_io_time[write][1] : 0;
-}
-
-#define SNPRINTF(a...) do { len += scnprintf(((char *) buffer) + len, \
- count - len - 1, ## a); } while (0)
-
-/**
- * get_debug_info - fill a buffer with debugging information
- * @buffer: The buffer to be filled.
- * @count: The size of the buffer, in bytes.
- *
- * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
- * either printk or return via sysfs.
- **/
-static int get_toi_debug_info(const char *buffer, int count)
-{
- int len = 0, i, first_result = 1;
-
- SNPRINTF("TuxOnIce debugging info:\n");
- SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n");
- SNPRINTF("- Kernel Version : " UTS_RELEASE "\n");
- SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__);
- SNPRINTF("- Attempt number : %d\n", nr_hibernates);
- SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n",
- toi_result,
- toi_bkd.toi_action,
- toi_bkd.toi_debug_state,
- toi_bkd.toi_default_console_level,
- image_size_limit,
- toi_poweroff_method);
- SNPRINTF("- Overall expected compression percentage: %d.\n",
- 100 - toi_expected_compression_ratio());
- len += toi_print_module_debug_info(((char *) buffer) + len,
- count - len - 1);
- if (toi_bkd.toi_io_time[0][1]) {
- if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) {
- SNPRINTF("- I/O speed: Write %ld KB/s",
- (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
- toi_bkd.toi_io_time[0][1]));
- if (toi_bkd.toi_io_time[1][1])
- SNPRINTF(", Read %ld KB/s",
- (KB((unsigned long)
- toi_bkd.toi_io_time[1][0]) * HZ /
- toi_bkd.toi_io_time[1][1]));
- } else {
- SNPRINTF("- I/O speed: Write %ld MB/s",
- (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
- toi_bkd.toi_io_time[0][1]));
- if (toi_bkd.toi_io_time[1][1])
- SNPRINTF(", Read %ld MB/s",
- (MB((unsigned long)
- toi_bkd.toi_io_time[1][0]) * HZ /
- toi_bkd.toi_io_time[1][1]));
- }
- SNPRINTF(".\n");
- } else
- SNPRINTF("- No I/O speed stats available.\n");
- SNPRINTF("- Extra pages : %lu used/%lu.\n",
- extra_pd1_pages_used, extra_pd1_pages_allowance);
-
- for (i = 0; i < TOI_NUM_RESULT_STATES; i++)
- if (test_result_state(i)) {
- SNPRINTF("%s: %s.\n", first_result ?
- "- Result " :
- " ",
- result_strings[i]);
- first_result = 0;
- }
- if (first_result)
- SNPRINTF("- Result : %s.\n", nr_hibernates ?
- "Succeeded" :
- "No hibernation attempts so far");
- return len;
-}
-
-#ifdef CONFIG_TOI_INCREMENTAL
-/**
- * get_toi_page_state - fill a buffer with page state information
- * @buffer: The buffer to be filled.
- * @count: The size of the buffer, in bytes.
- *
- * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
- * either printk or return via sysfs.
- **/
-static int get_toi_page_state(const char *buffer, int count)
-{
- int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0;
- int len = 0;
- struct zone *zone;
- int allocated_bitmaps = 0;
-
- set_cpus_allowed_ptr(current,
- cpumask_of(cpumask_first(cpu_online_mask)));
-
- if (!free_map) {
- BUG_ON(toi_alloc_bitmap(&free_map));
- allocated_bitmaps = 1;
- }
-
- toi_generate_free_page_map();
-
- for_each_populated_zone(zone) {
- unsigned long loop;
-
- total += zone->spanned_pages;
-
- for (loop = 0; loop < zone->spanned_pages; loop++) {
- unsigned long pfn = zone->zone_start_pfn + loop;
- struct page *page;
- int chunk_size;
-
- if (!pfn_valid(pfn)) {
- continue;
- }
-
- chunk_size = toi_size_of_free_region(zone, pfn);
- if (chunk_size) {
- /*
-				 * If the page gets allocated, it will need
- * saving in an image.
- * Don't bother with explicitly removing any
- * RO protection applied below.
- * We'll SetPageTOI_Dirty(page) if/when it
- * gets allocated.
- */
- free += chunk_size;
- loop += chunk_size - 1;
- continue;
- }
-
- page = pfn_to_page(pfn);
-
- if (PageTOI_Untracked(page)) {
- untracked++;
- } else if (PageTOI_RO(page)) {
- ro++;
- } else if (PageTOI_Dirty(page)) {
- dirty++;
- } else {
-				printk(KERN_DEBUG "Page %lu state 'other'.\n", pfn);
- other++;
- }
- }
- }
-
- if (allocated_bitmaps) {
- toi_free_bitmap(&free_map);
- }
-
- set_cpus_allowed_ptr(current, cpu_all_mask);
-
- SNPRINTF("TuxOnIce page breakdown:\n");
- SNPRINTF("- Free : %d\n", free);
- SNPRINTF("- Untracked : %d\n", untracked);
- SNPRINTF("- Read only : %d\n", ro);
- SNPRINTF("- Dirty : %d\n", dirty);
- SNPRINTF("- Other : %d\n", other);
- SNPRINTF("- Invalid : %d\n", invalid);
- SNPRINTF("- Total : %d\n", total);
- return len;
-}
-#endif
-
-/**
- * do_cleanup - cleanup after attempting to hibernate or resume
- * @get_debug_info: Whether to allocate and return debugging info.
- *
- * Cleanup after attempting to hibernate or resume, possibly getting
- * debugging info as we do so.
- **/
-static void do_cleanup(int get_debug_info, int restarting)
-{
- int i = 0;
- char *buffer = NULL;
-
- trap_non_toi_io = 0;
-
- if (get_debug_info)
- toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up...");
-
- free_checksum_pages();
-
- toi_cbw_restore();
- toi_free_cbw_data();
-
- if (get_debug_info)
- buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP);
-
- if (buffer)
- i = get_toi_debug_info(buffer, PAGE_SIZE);
-
- toi_free_extra_pagedir_memory();
-
- pagedir1.size = 0;
- pagedir2.size = 0;
- set_highmem_size(pagedir1, 0);
- set_highmem_size(pagedir2, 0);
-
- if (boot_kernel_data_buffer) {
- if (!test_toi_state(TOI_BOOT_KERNEL))
- toi_free_page(37, boot_kernel_data_buffer);
- boot_kernel_data_buffer = 0;
- }
-
- if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) {
- unlock_device_hotplug();
- clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
- }
-
- clear_toi_state(TOI_BOOT_KERNEL);
- if (current->flags & PF_SUSPEND_TASK)
- thaw_processes();
-
- if (!restarting)
- toi_stop_other_threads();
-
- if (toi_keeping_image &&
- !test_result_state(TOI_ABORTED)) {
- toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
- "TuxOnIce: Not invalidating the image due "
- "to Keep Image or Incremental Image being enabled.");
- set_result_state(TOI_KEPT_IMAGE);
-
- /*
- * For an incremental image, free unused storage so
- * swap (if any) can be used for normal system operation,
- * if so desired.
- */
-
- toiActiveAllocator->free_unused_storage();
- } else
- if (toiActiveAllocator)
- toiActiveAllocator->remove_image();
-
- free_bitmaps();
- usermodehelper_enable();
-
- if (test_toi_state(TOI_NOTIFIERS_PREPARE)) {
- pm_notifier_call_chain(PM_POST_HIBERNATION);
- clear_toi_state(TOI_NOTIFIERS_PREPARE);
- }
-
- if (buffer && i) {
- /* Printk can only handle 1023 bytes, including
- * its level mangling. */
- for (i = 0; i < 3; i++)
- printk(KERN_ERR "%s", buffer + (1023 * i));
- toi_free_page(20, (unsigned long) buffer);
- }
-
- if (!restarting)
- toi_cleanup_console();
-
- free_attention_list();
-
- if (!restarting)
- toi_deactivate_storage(0);
-
- clear_toi_state(TOI_IGNORE_LOGLEVEL);
- clear_toi_state(TOI_TRYING_TO_RESUME);
- clear_toi_state(TOI_NOW_RESUMING);
-}
-
-/**
- * check_still_keeping_image - we kept an image; check whether to reuse it.
- *
- * We enter this routine when we have kept an image. If the user has said they
- * want to still keep it, all we need to do is powerdown. If powering down
- * means hibernating to ram and the power doesn't run out, we'll return 1.
- * If we do power off properly or the battery runs out, we'll resume via the
- * normal paths.
- *
- * If the user has said they want to remove the previously kept image, we
- * remove it, and return 0. We'll then store a new image.
- **/
-static int check_still_keeping_image(void)
-{
- if (toi_keeping_image) {
- if (!test_action_state(TOI_INCREMENTAL_IMAGE)) {
-			printk(KERN_INFO "Image already stored: powering down "
-					"immediately.\n");
- do_toi_step(STEP_HIBERNATE_POWERDOWN);
- return 1;
- }
- /**
- * Incremental image - need to write new part.
- * We detect that we're writing an incremental image by looking
- * at test_result_state(TOI_KEPT_IMAGE)
- **/
- return 0;
- }
-
- printk(KERN_INFO "Invalidating previous image.\n");
- toiActiveAllocator->remove_image();
-
- return 0;
-}
-
-/**
- * toi_init - prepare to hibernate to disk
- *
- * Initialise variables & data structures, in preparation for
- * hibernating to disk.
- **/
-static int toi_init(int restarting)
-{
- int result, i, j;
-
- toi_result = 0;
-
- printk(KERN_INFO "Initiating a hibernation cycle.\n");
-
- nr_hibernates++;
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- toi_bkd.toi_io_time[i][j] = 0;
-
- if (!test_toi_state(TOI_CAN_HIBERNATE) ||
- allocate_bitmaps())
- return 1;
-
- mark_nosave_pages();
-
- if (!restarting)
- toi_prepare_console();
-
- result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
- if (result) {
- set_result_state(TOI_NOTIFIERS_PREPARE_FAILED);
- return 1;
- }
- set_toi_state(TOI_NOTIFIERS_PREPARE);
-
- if (!restarting) {
-		printk(KERN_INFO "Starting other threads.\n");
- toi_start_other_threads();
- }
-
- result = usermodehelper_disable();
- if (result) {
- printk(KERN_ERR "TuxOnIce: Failed to disable usermode "
- "helpers\n");
- set_result_state(TOI_USERMODE_HELPERS_ERR);
- return 1;
- }
-
- boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP);
- if (!boot_kernel_data_buffer) {
- printk(KERN_ERR "TuxOnIce: Failed to allocate "
- "boot_kernel_data_buffer.\n");
- set_result_state(TOI_OUT_OF_MEMORY);
- return 1;
- }
-
- toi_allocate_cbw_data();
-
- return 0;
-}
-
-/**
- * can_hibernate - perform basic 'Can we hibernate?' tests
- *
- * Perform basic tests that must pass if we're going to be able to hibernate:
- * Can we get the pm_mutex? Is resume= valid (we need to know where to write
- * the image header).
- **/
-static int can_hibernate(void)
-{
- if (!test_toi_state(TOI_CAN_HIBERNATE))
- toi_attempt_to_parse_resume_device(0);
-
- if (!test_toi_state(TOI_CAN_HIBERNATE)) {
- printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n"
- "This may be because you haven't put something along "
- "the lines of\n\nresume=swap:/dev/hda1\n\n"
- "in lilo.conf or equivalent. (Where /dev/hda1 is your "
- "swap partition).\n");
- set_abort_result(TOI_CANT_SUSPEND);
- return 0;
- }
-
- if (strlen(alt_resume_param)) {
- attempt_to_parse_alt_resume_param();
-
- if (!strlen(alt_resume_param)) {
- printk(KERN_INFO "Alternate resume parameter now "
- "invalid. Aborting.\n");
- set_abort_result(TOI_CANT_USE_ALT_RESUME);
- return 0;
- }
- }
-
- return 1;
-}
-
-/**
- * do_post_image_write - having written an image, figure out what to do next
- *
- * After writing an image, we might load an alternate image or power down.
- * Powering down might involve hibernating to ram, in which case we also
- * need to handle reloading pageset2.
- **/
-static int do_post_image_write(void)
-{
- /* If switching images fails, do normal powerdown */
- if (alt_resume_param[0])
- do_toi_step(STEP_RESUME_ALT_IMAGE);
-
- toi_power_down();
-
- barrier();
- mb();
- return 0;
-}
-
-/**
- * __save_image - do the hard work of saving the image
- *
- * High level routine for getting the image saved. The key assumptions made
- * are that processes have been frozen and sufficient memory is available.
- *
- * We also exit through here at resume time, coming back from toi_hibernate
- * after the atomic restore. This is the reason for the toi_in_hibernate
- * test.
- **/
-static int __save_image(void)
-{
- int temp_result, did_copy = 0;
-
-	toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image...");
-
- toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
- " - Final values: %d and %d.",
- pagedir1.size, pagedir2.size);
-
- toi_cond_pause(1, "About to write pagedir2.");
-
- temp_result = write_pageset(&pagedir2);
-
- if (temp_result == -1 || test_result_state(TOI_ABORTED))
- return 1;
-
- toi_cond_pause(1, "About to copy pageset 1.");
-
- if (test_result_state(TOI_ABORTED))
- return 1;
-
- toi_deactivate_storage(1);
-
- toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
-
- toi_in_hibernate = 1;
-
- if (toi_go_atomic(PMSG_FREEZE, 1))
- goto Failed;
-
- temp_result = toi_hibernate();
-
-#ifdef CONFIG_KGDB
- if (test_action_state(TOI_POST_RESUME_BREAKPOINT))
- kgdb_breakpoint();
-#endif
-
- if (!temp_result)
- did_copy = 1;
-
- /* We return here at resume time too! */
- toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result);
-
-Failed:
- if (toi_activate_storage(1))
- panic("Failed to reactivate our storage.");
-
- /* Resume time? */
- if (!toi_in_hibernate) {
- copyback_post();
- return 0;
- }
-
- /* Nope. Hibernating. So, see if we can save the image... */
-
- if (temp_result || test_result_state(TOI_ABORTED)) {
- if (did_copy)
- goto abort_reloading_pagedir_two;
- else
- return 1;
- }
-
- toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size,
- NULL);
-
- if (test_result_state(TOI_ABORTED))
- goto abort_reloading_pagedir_two;
-
- toi_cond_pause(1, "About to write pageset1.");
-
- toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1");
-
- temp_result = write_pageset(&pagedir1);
-
- /* We didn't overwrite any memory, so no reread needs to be done. */
- if (test_action_state(TOI_TEST_FILTER_SPEED) ||
- test_action_state(TOI_TEST_BIO))
- return 1;
-
- if (temp_result == 1 || test_result_state(TOI_ABORTED))
- goto abort_reloading_pagedir_two;
-
- toi_cond_pause(1, "About to write header.");
-
- if (test_result_state(TOI_ABORTED))
- goto abort_reloading_pagedir_two;
-
- temp_result = write_image_header();
-
- if (!temp_result && !test_result_state(TOI_ABORTED))
- return 0;
-
-abort_reloading_pagedir_two:
- temp_result = read_pageset2(1);
-
- /* If that failed, we're sunk. Panic! */
- if (temp_result)
- panic("Attempt to reload pagedir 2 while aborting "
- "a hibernate failed.");
-
- return 1;
-}
-
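-/**
- * map_ps2_pages - map or unmap every page in pageset2
- * @enable: Whether to map (1) or unmap (0) the pages.
- *
- * Walk pageset2_map, calling kernel_map_pages() on each page. With
- * CONFIG_DEBUG_PAGEALLOC, unmapping pageset2 while the image is saved
- * means stray accesses to those pages fault; the I/O path remaps
- * individual pages as needed.
- **/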
-static void map_ps2_pages(int enable)
-{
- unsigned long pfn = 0;
-
- memory_bm_position_reset(pageset2_map);
- pfn = memory_bm_next_pfn(pageset2_map, 0);
-
- while (pfn != BM_END_OF_MAP) {
- struct page *page = pfn_to_page(pfn);
- kernel_map_pages(page, 1, enable);
- pfn = memory_bm_next_pfn(pageset2_map, 0);
- }
-}
-
-/**
- * do_save_image - save the image and handle the result
- *
- * Save the prepared image. If we fail or we're in the path returning
- * from the atomic restore, cleanup.
- **/
-static int do_save_image(void)
-{
- int result;
- map_ps2_pages(0);
- result = __save_image();
- map_ps2_pages(1);
- return result;
-}
-
-/**
- * do_prepare_image - try to prepare an image
- *
- * Seek to initialise and prepare an image to be saved. On failure,
- * cleanup.
- **/
-static int do_prepare_image(void)
-{
- int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
-
- if (!restarting && toi_activate_storage(0))
- return 1;
-
-	/*
-	 * If we have kept the image, are still keeping it and are hibernating
-	 * to RAM (the non-incremental image case), we will return 1 after
-	 * hibernating and resuming (provided the power doesn't run out). In
-	 * that case, we skip directly to cleaning up and exiting.
-	 */
-
- if (!can_hibernate() ||
- (test_result_state(TOI_KEPT_IMAGE) &&
- check_still_keeping_image()))
- return 1;
-
- if (toi_init(restarting) || toi_prepare_image() ||
- test_result_state(TOI_ABORTED))
- return 1;
-
- trap_non_toi_io = 1;
-
- return 0;
-}
-
-/**
- * do_check_can_resume - find out whether an image has been stored
- *
- * Read whether an image exists. We use the same routine as the
- * image_exists sysfs entry, and just look to see whether the
- * first character in the resulting buffer is a '1'.
- **/
-int do_check_can_resume(void)
-{
- int result = -1;
-
- if (toi_activate_storage(0))
- return -1;
-
- if (!test_toi_state(TOI_RESUME_DEVICE_OK))
- toi_attempt_to_parse_resume_device(1);
-
- if (toiActiveAllocator)
- result = toiActiveAllocator->image_exists(1);
-
- toi_deactivate_storage(0);
- return result;
-}
-
-/**
- * do_load_atomic_copy - load the first part of an image, if it exists
- *
- * Check whether we have an image. If one exists, do sanity checking
- * (possibly invalidating the image or even rebooting if the user
- * requests that) before loading it into memory in preparation for the
- * atomic restore.
- *
- * If and only if we have an image loaded and ready to restore, we return 1.
- **/
-static int do_load_atomic_copy(void)
-{
- int read_image_result = 0;
-
- if (sizeof(swp_entry_t) != sizeof(long)) {
- printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size"
- " of long. Please report this!\n");
- return 1;
- }
-
- if (!resume_file[0])
- printk(KERN_WARNING "TuxOnIce: "
- "You need to use a resume= command line parameter to "
- "tell TuxOnIce where to look for an image.\n");
-
- toi_activate_storage(0);
-
- if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) &&
- !toi_attempt_to_parse_resume_device(0)) {
- /*
- * Without a usable storage device we can do nothing -
- * even if noresume is given
- */
-
- if (!toiNumAllocators)
- printk(KERN_ALERT "TuxOnIce: "
- "No storage allocators have been registered.\n");
- else
- printk(KERN_ALERT "TuxOnIce: "
- "Missing or invalid storage location "
- "(resume= parameter). Please correct and "
- "rerun lilo (or equivalent) before "
- "hibernating.\n");
- toi_deactivate_storage(0);
- return 1;
- }
-
- if (allocate_bitmaps())
- return 1;
-
- read_image_result = read_pageset1(); /* non fatal error ignored */
-
- if (test_toi_state(TOI_NORESUME_SPECIFIED))
- clear_toi_state(TOI_NORESUME_SPECIFIED);
-
- toi_deactivate_storage(0);
-
- if (read_image_result)
- return 1;
-
- return 0;
-}
-
-/**
- * prepare_restore_load_alt_image - save & restore alt image variables
- *
- * Save and restore the pageset1 maps, when loading an alternate image.
- **/
-static void prepare_restore_load_alt_image(int prepare)
-{
- static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save;
-
- if (prepare) {
- pageset1_map_save = pageset1_map;
- pageset1_map = NULL;
- pageset1_copy_map_save = pageset1_copy_map;
- pageset1_copy_map = NULL;
- set_toi_state(TOI_LOADING_ALT_IMAGE);
- toi_reset_alt_image_pageset2_pfn();
- } else {
- toi_free_bitmap(&pageset1_map);
- pageset1_map = pageset1_map_save;
- toi_free_bitmap(&pageset1_copy_map);
- pageset1_copy_map = pageset1_copy_map_save;
- clear_toi_state(TOI_NOW_RESUMING);
- clear_toi_state(TOI_LOADING_ALT_IMAGE);
- }
-}
-
-/**
- * do_toi_step - perform a step in hibernating or resuming
- *
- * Perform a step in hibernating or resuming an image. This abstraction
- * is in preparation for implementing cluster support, and perhaps replacing
- * uswsusp too (we haven't yet checked whether that's possible).
- **/
-int do_toi_step(int step)
-{
- switch (step) {
- case STEP_HIBERNATE_PREPARE_IMAGE:
- return do_prepare_image();
- case STEP_HIBERNATE_SAVE_IMAGE:
- return do_save_image();
- case STEP_HIBERNATE_POWERDOWN:
- return do_post_image_write();
- case STEP_RESUME_CAN_RESUME:
- return do_check_can_resume();
- case STEP_RESUME_LOAD_PS1:
- return do_load_atomic_copy();
- case STEP_RESUME_DO_RESTORE:
- /*
- * If we succeed, this doesn't return.
- * Instead, we return from do_save_image() in the
- * hibernated kernel.
- */
- return toi_atomic_restore();
- case STEP_RESUME_ALT_IMAGE:
- printk(KERN_INFO "Trying to resume alternate image.\n");
- toi_in_hibernate = 0;
- save_restore_alt_param(SAVE, NOQUIET);
- prepare_restore_load_alt_image(1);
- if (!do_check_can_resume()) {
- printk(KERN_INFO "Nothing to resume from.\n");
- goto out;
- }
- if (!do_load_atomic_copy())
- toi_atomic_restore();
-
- printk(KERN_INFO "Failed to load image.\n");
-out:
- prepare_restore_load_alt_image(0);
- save_restore_alt_param(RESTORE, NOQUIET);
- break;
- case STEP_CLEANUP:
- do_cleanup(1, 0);
- break;
- case STEP_QUIET_CLEANUP:
- do_cleanup(0, 0);
- break;
- }
-
- return 0;
-}
-
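-/*
- * Typical step sequences (see toi_try_hibernate() and toi_try_resume()
- * below): a hibernation cycle runs STEP_HIBERNATE_PREPARE_IMAGE, then
- * STEP_HIBERNATE_SAVE_IMAGE, then STEP_HIBERNATE_POWERDOWN; resume runs
- * STEP_RESUME_CAN_RESUME, STEP_RESUME_LOAD_PS1 and, if both succeed,
- * STEP_RESUME_DO_RESTORE.
- */
-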
-/* -- Functions for kickstarting a hibernate or resume --- */
-
-/**
- * toi_try_resume - try to do the steps in resuming
- *
- * Check if we have an image and if so try to resume. Clear the status
- * flags too.
- **/
-void toi_try_resume(void)
-{
- set_toi_state(TOI_TRYING_TO_RESUME);
- resume_attempted = 1;
-
- current->flags |= PF_MEMALLOC;
- toi_start_other_threads();
-
- if (do_toi_step(STEP_RESUME_CAN_RESUME) &&
- !do_toi_step(STEP_RESUME_LOAD_PS1))
- do_toi_step(STEP_RESUME_DO_RESTORE);
-
- toi_stop_other_threads();
- do_cleanup(0, 0);
-
- current->flags &= ~PF_MEMALLOC;
-
- clear_toi_state(TOI_IGNORE_LOGLEVEL);
- clear_toi_state(TOI_TRYING_TO_RESUME);
- clear_toi_state(TOI_NOW_RESUMING);
-}
-
-/**
- * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume
- *
- * Wrapper for when toi_try_resume is called from the swsusp resume path,
- * rather than from echo > /sys/power/tuxonice/do_resume.
- **/
-static void toi_sys_power_disk_try_resume(void)
-{
- resume_attempted = 1;
-
- /*
- * There's a comment in kernel/power/disk.c that indicates
- * we should be able to use mutex_lock_nested below. That
- * doesn't seem to cut it, though, so let's just turn lockdep
- * off for now.
- */
- lockdep_off();
-
- if (toi_start_anything(SYSFS_RESUMING))
- goto out;
-
- toi_try_resume();
-
- /*
- * For initramfs, we have to clear the boot time
- * flag after trying to resume
- */
- clear_toi_state(TOI_BOOT_TIME);
-
- toi_finish_anything(SYSFS_RESUMING);
-out:
- lockdep_on();
-}
-
-/**
- * toi_try_hibernate - try to start a hibernation cycle
- *
- * Start a hibernation cycle, coming in from either
- * echo > /sys/power/tuxonice/do_suspend
- *
- * or
- *
- * echo disk > /sys/power/state
- *
- * In the latter case, we come in without pm_sem taken; in the
- * former, it has been taken.
- **/
-int toi_try_hibernate(void)
-{
- int result = 0, sys_power_disk = 0, retries = 0;
-
- if (!mutex_is_locked(&tuxonice_in_use)) {
- /* Came in via /sys/power/disk */
- if (toi_start_anything(SYSFS_HIBERNATING))
- return -EBUSY;
- sys_power_disk = 1;
- }
-
- current->flags |= PF_MEMALLOC;
-
- if (test_toi_state(TOI_CLUSTER_MODE)) {
- toi_initiate_cluster_hibernate();
- goto out;
- }
-
-prepare:
- result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
-
- if (result)
- goto out;
-
- if (test_action_state(TOI_FREEZER_TEST))
- goto out_restore_gfp_mask;
-
- result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
-
- if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) {
- if (retries < 2) {
- do_cleanup(0, 1);
- retries++;
- clear_result_state(TOI_ABORTED);
- extra_pd1_pages_allowance = extra_pd1_pages_used + 500;
- printk(KERN_INFO "Automatically adjusting the extra"
- " pages allowance to %ld and restarting.\n",
- extra_pd1_pages_allowance);
- pm_restore_gfp_mask();
- goto prepare;
- }
-
- printk(KERN_INFO "Adjusted extra pages allowance twice and "
-			"still couldn't hibernate successfully. Giving up.\n");
- }
-
- /* This code runs at resume time too! */
- if (!result && toi_in_hibernate)
- result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
-
-out_restore_gfp_mask:
- pm_restore_gfp_mask();
-out:
- do_cleanup(1, 0);
- current->flags &= ~PF_MEMALLOC;
-
- if (sys_power_disk)
- toi_finish_anything(SYSFS_HIBERNATING);
-
- return result;
-}
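-
-/*
- * Note on the retry path above: when pageset1 outgrows the space set
- * aside for it (TOI_EXTRA_PAGES_ALLOW_TOO_SMALL), the allowance is bumped
- * to extra_pd1_pages_used + 500 and the prepare/save sequence is retried,
- * at most twice, before giving up.
- */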
-
-/*
- * channel_no: If !0, -c <channel_no> is added to args (userui).
- */
-int toi_launch_userspace_program(char *command, int channel_no,
- int wait, int debug)
-{
- int retval;
- static char *envp[] = {
- "HOME=/",
- "TERM=linux",
- "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
- NULL };
- static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
- };
- char *channel = NULL;
- int arg = 0, size;
- char test_read[255];
- char *orig_posn = command;
-
- if (!strlen(orig_posn))
- return 1;
-
- if (channel_no) {
- channel = toi_kzalloc(4, 6, GFP_KERNEL);
- if (!channel) {
- printk(KERN_INFO "Failed to allocate memory in "
- "preparing to launch userspace program.\n");
- return 1;
- }
- }
-
- /* Up to 6 args supported */
- while (arg < 6) {
- sscanf(orig_posn, "%s", test_read);
- size = strlen(test_read);
- if (!(size))
- break;
-		argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP);
-		if (!argv[arg])
-			break;
-		strcpy(argv[arg], test_read);
- orig_posn += size + 1;
- *test_read = 0;
- arg++;
- }
-
- if (channel_no) {
- sprintf(channel, "-c%d", channel_no);
- argv[arg] = channel;
- } else
- arg--;
-
- if (debug) {
- argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP);
- strcpy(argv[arg], "--debug");
- }
-
- retval = call_usermodehelper(argv[0], argv, envp, wait);
-
- /*
- * If the program reports an error, retval = 256. Don't complain
- * about that here.
- */
- if (retval && retval != 256)
- printk(KERN_ERR "Failed to launch userspace program '%s': "
- "Error %d\n", command, retval);
-
- {
- int i;
- for (i = 0; i < arg; i++)
- if (argv[i] && argv[i] != channel)
- toi_kfree(5, argv[i], sizeof(*argv[i]));
- }
-
- toi_kfree(4, channel, sizeof(*channel));
-
- return retval;
-}
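-
-/*
- * Example (illustrative only; the path and channel below are
- * hypothetical):
- *
- *	toi_launch_userspace_program("/sbin/tuxoniceui_text", 1,
- *			UMH_WAIT_PROC, 0);
- *
- * The command string is split on whitespace into at most six argv
- * entries before "-c1" is appended.
- */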
-
-/*
- * This array contains entries that are automatically registered at
- * boot. Modules and the console code register their own entries separately.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_LONG("extra_pages_allowance", SYSFS_RW,
- &extra_pd1_pages_allowance, 0, LONG_MAX, 0),
- SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read,
- image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL),
- SYSFS_STRING("resume", SYSFS_RW, resume_file, 255,
- SYSFS_NEEDS_SM_FOR_WRITE,
- attempt_to_parse_resume_device2),
- SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255,
- SYSFS_NEEDS_SM_FOR_WRITE,
- attempt_to_parse_alt_resume_param),
- SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0,
- NULL),
- SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action,
- TOI_IGNORE_ROOTFS, 0),
- SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2,
- INT_MAX, 0),
- SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0),
- SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_MULTITHREADED_IO, 0),
- SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_FLUSHER_THREAD, 0),
- SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action,
- TOI_PAGESET2_FULL, 0),
- SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0),
- SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action,
- TOI_REPLACE_SWSUSP, 0),
- SYSFS_STRING("resume_commandline", SYSFS_RW,
- toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0,
- NULL),
- SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL),
- SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action,
- TOI_FREEZER_TEST, 0),
- SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0),
- SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action,
- TOI_TEST_FILTER_SPEED, 0),
- SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_PAGESET2, 0),
- SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_PS2_IF_UNNEEDED, 0),
- SYSFS_STRING("binary_signature", SYSFS_READONLY,
- tuxonice_signature, 9, 0, NULL),
- SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0,
- NULL),
-#ifdef CONFIG_KGDB
- SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action,
- TOI_POST_RESUME_BREAKPOINT, 0),
-#endif
- SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_READAHEAD, 0),
- SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action,
- TOI_TRACE_DEBUG_ON, 0),
-#ifdef CONFIG_TOI_KEEP_IMAGE
- SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE,
- 0),
-#endif
-#ifdef CONFIG_TOI_INCREMENTAL
- SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0,
- NULL),
- SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action,
- TOI_INCREMENTAL_IMAGE, 1),
-#endif
-};
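-
-/*
- * These entries appear under /sys/power/tuxonice/ once core_load() has
- * registered them; for example (illustrative):
- *
- *	echo swap:/dev/sda1 > /sys/power/tuxonice/resume
- *	cat /sys/power/tuxonice/image_exists
- */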
-
-static struct toi_core_fns my_fns = {
- .get_nonconflicting_page = __toi_get_nonconflicting_page,
- .post_context_save = __toi_post_context_save,
- .try_hibernate = toi_try_hibernate,
- .try_resume = toi_sys_power_disk_try_resume,
-};
-
-/**
- * core_load - initialisation of TuxOnIce core
- *
- * Initialise the core, beginning with sysfs. Checksum and so on are part of
- * the core, but have their own initialisation routines because they either
- * aren't compiled in all the time or have their own subdirectories.
- **/
-static __init int core_load(void)
-{
-	int i, numfiles = ARRAY_SIZE(sysfs_params);
-
- printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION
- " (http://tuxonice.net)\n");
-
- if (!hibernation_available()) {
-		printk(KERN_INFO "TuxOnIce disabled: hibernation has been "
-			"disabled in this kernel.\n");
- return 1;
- }
-
- if (toi_sysfs_init())
- return 1;
-
- for (i = 0; i < numfiles; i++)
- toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-
- toi_core_fns = &my_fns;
-
- if (toi_alloc_init())
- return 1;
- if (toi_checksum_init())
- return 1;
- if (toi_usm_init())
- return 1;
- if (toi_ui_init())
- return 1;
- if (toi_poweroff_init())
- return 1;
- if (toi_cluster_init())
- return 1;
- if (toi_cbw_init())
- return 1;
-
- return 0;
-}
-
-late_initcall(core_load);
diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c
deleted file mode 100644
index a8c5f3660..000000000
--- a/kernel/power/tuxonice_incremental.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * kernel/power/tuxonice_incremental.c
- *
- * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains routines related to storing incremental images - that
- * is, retaining an image after an initial cycle and then storing incremental
- * changes on subsequent hibernations.
- *
- * Based in part on...
- *
- * Debug helper to dump the current kernel pagetables of the system
- * so that we can see what the various memory ranges are set to.
- *
- * (C) Copyright 2008 Intel Corporation
- *
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/mm.h>
-#include <linux/tuxonice.h>
-#include <linux/sched.h>
-#include <asm/pgtable.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/page.h>
-#include "tuxonice_pageflags.h"
-#include "tuxonice_builtin.h"
-#include "power.h"
-
-int toi_do_incremental_initcall;
-
-extern void kdb_init(int level);
-extern noinline void kgdb_breakpoint(void);
-
-#undef pr_debug
-#if 0
-#define pr_debug(a, b...) do { printk(a, ##b); } while(0)
-#else
-#define pr_debug(a, b...) do { } while(0)
-#endif
-
-/* Multipliers for offsets within the PTEs */
-#define PTE_LEVEL_MULT (PAGE_SIZE)
-#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
-#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
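-
-/*
- * Address range covered by one entry at each level; e.g. on x86-64 with
- * 4KiB pages and 512 entries per table, one PTE covers 4KiB, one PMD
- * entry 2MiB, one PUD entry 1GiB and one PGD entry 512GiB.
- */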
-
-/*
- * In the ptdump code this was based on, this function was called on a
- * break in a continuous series of PTE entries, to print what had been
- * collected so far. Here it is called for every entry and instead marks
- * the page-table page containing that entry as untracked.
- */
-static void note_page(void *addr)
-{
- static struct page *lastpage;
- struct page *page;
-
- page = virt_to_page(addr);
-
- if (page != lastpage) {
- unsigned int level;
- pte_t *pte = lookup_address((unsigned long) addr, &level);
- struct page *pt_page2 = pte_page(*pte);
- //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2));
- SetPageTOI_Untracked(pt_page2);
- lastpage = page;
- }
-}
-
-static void walk_pte_level(pmd_t addr)
-{
- int i;
- pte_t *start;
-
- start = (pte_t *) pmd_page_vaddr(addr);
- for (i = 0; i < PTRS_PER_PTE; i++) {
- note_page(start);
- start++;
- }
-}
-
-#if PTRS_PER_PMD > 1
-
-static void walk_pmd_level(pud_t addr)
-{
- int i;
- pmd_t *start;
-
- start = (pmd_t *) pud_page_vaddr(addr);
- for (i = 0; i < PTRS_PER_PMD; i++) {
- if (!pmd_none(*start)) {
- if (pmd_large(*start) || !pmd_present(*start))
- note_page(start);
- else
- walk_pte_level(*start);
- } else
- note_page(start);
- start++;
- }
-}
-
-#else
-#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a)))
-#define pud_large(a) pmd_large(__pmd(pud_val(a)))
-#define pud_none(a) pmd_none(__pmd(pud_val(a)))
-#endif
-
-#if PTRS_PER_PUD > 1
-
-static void walk_pud_level(pgd_t addr)
-{
- int i;
- pud_t *start;
-
- start = (pud_t *) pgd_page_vaddr(addr);
-
- for (i = 0; i < PTRS_PER_PUD; i++) {
- if (!pud_none(*start)) {
- if (pud_large(*start) || !pud_present(*start))
- note_page(start);
- else
- walk_pmd_level(*start);
- } else
- note_page(start);
-
- start++;
- }
-}
-
-#else
-#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a)))
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a) pud_none(__pud(pgd_val(a)))
-#endif
-
-/*
- * Not static in the original at the time of writing, so needs renaming here.
- */
-static void toi_ptdump_walk_pgd_level(pgd_t *pgd)
-{
-#ifdef CONFIG_X86_64
- pgd_t *start = (pgd_t *) &init_level4_pgt;
-#else
- pgd_t *start = swapper_pg_dir;
-#endif
- int i;
- if (pgd) {
- start = pgd;
- }
-
- for (i = 0; i < PTRS_PER_PGD; i++) {
- if (!pgd_none(*start)) {
- if (pgd_large(*start) || !pgd_present(*start))
- note_page(start);
- else
- walk_pud_level(*start);
- } else
- note_page(start);
-
- start++;
- }
-
- /* Flush out the last page */
- note_page(start);
-}
-
-#ifdef CONFIG_PARAVIRT
-extern struct pv_info pv_info;
-
-static void toi_set_paravirt_ops_untracked(void)
-{
- int i;
-
- unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)),
- pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end));
- //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end);
- for (i = pvpfn; i <= pvpfn_end; i++) {
- SetPageTOI_Untracked(pfn_to_page(i));
- }
-}
-#else
-#define toi_set_paravirt_ops_untracked() do { } while(0)
-#endif
-
-extern void toi_mark_per_cpus_pages_untracked(void);
-
-void toi_untrack_stack(unsigned long *stack)
-{
- int i;
- struct page *stack_page = virt_to_page(stack);
-
- for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) {
- pr_debug("Untrack stack page %p.\n", page_address(stack_page + i));
- SetPageTOI_Untracked(stack_page + i);
- }
-}
-void toi_untrack_process(struct task_struct *p)
-{
- SetPageTOI_Untracked(virt_to_page(p));
- pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p)));
-
- toi_untrack_stack(p->stack);
-}
-
-void toi_generate_untracked_map(void)
-{
- struct task_struct *p, *t;
- struct page *page;
- pte_t *pte;
- int i;
- unsigned int level;
- static int been_here = 0;
-
- if (been_here)
- return;
-
- been_here = 1;
-
- /* Pagetable pages */
- toi_ptdump_walk_pgd_level(NULL);
-
- /* Printk buffer - not normally needed but can be helpful for debugging. */
- //toi_set_logbuf_untracked();
-
- /* Paravirt ops */
- toi_set_paravirt_ops_untracked();
-
- /* Task structs and stacks */
- for_each_process_thread(p, t) {
- toi_untrack_process(p);
- //toi_untrack_stack((unsigned long *) t->thread.sp);
- }
-
- for (i = 0; i < NR_CPUS; i++) {
- struct task_struct *idle = idle_task(i);
-
- if (idle) {
- pr_debug("Untrack idle process for CPU %d.\n", i);
- toi_untrack_process(idle);
- }
-
- /* IRQ stack */
- pr_debug("Untrack IRQ stack for CPU %d.\n", i);
- toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i));
- }
-
- /* Per CPU data */
- //pr_debug("Untracking per CPU variable pages.\n");
- toi_mark_per_cpus_pages_untracked();
-
- /* Init stack - for bringing up secondary CPUs */
- page = virt_to_page(init_stack);
- for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) {
- SetPageTOI_Untracked(page + i);
- }
-
- pte = lookup_address((unsigned long) &mmu_cr4_features, &level);
- SetPageTOI_Untracked(pte_page(*pte));
- SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features));
-}
-
-/**
- * toi_reset_dirtiness_one - re-apply write protection to a single page
- * @pfn: PFN of the page to make read-only again.
- * @verbose: Whether to log the operation.
- **/
-
-void toi_reset_dirtiness_one(unsigned long pfn, int verbose)
-{
- struct page *page = pfn_to_page(pfn);
-
- /**
- * Don't worry about whether the Dirty flag is
- * already set. If this is our first call, it
- * won't be.
- */
-
- preempt_disable();
-
- ClearPageTOI_Dirty(page);
- SetPageTOI_RO(page);
- if (verbose)
- printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page));
-
- set_memory_ro((unsigned long) page_address(page), 1);
-
- preempt_enable();
-}
-
-/**
- * TuxOnIce's incremental image support works by marking all memory apart from
- * the page tables read-only, then in the page-faults that result enabling
- * writing if appropriate and flagging the page as dirty. Free pages are also
- * marked as dirty and not protected so that if allocated, they will be included
- * in the image without further processing.
- *
- * toi_reset_dirtiness is called when an image exists and incremental images
- * are enabled, and each time we resume thereafter. It is not invoked on a
- * fresh boot.
- *
- * This routine should be called from a single-cpu-running context to avoid races in setting
- * page dirty/read only flags.
- *
- * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it!
- *
- * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c.
- **/
-
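-/*
- * Page lifecycle under incremental images, summarising the above: a page
- * starts out PageTOI_RO; the first write to it faults, the fault handler
- * lifts the protection and sets PageTOI_Dirty; the next cycle's call here
- * clears the dirty flag and re-applies set_memory_ro(), so only pages
- * dirtied since the last image need to be rewritten.
- */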
-int toi_reset_dirtiness(int verbose)
-{
- struct zone *zone;
- unsigned long loop;
- int allocated_map = 0;
-
- toi_generate_untracked_map();
-
- if (!free_map) {
- if (!toi_alloc_bitmap(&free_map))
- return -ENOMEM;
- allocated_map = 1;
- }
-
- toi_generate_free_page_map();
-
- pr_debug(KERN_EMERG "Reset dirtiness.\n");
- for_each_populated_zone(zone) {
- // 64 bit only. No need to worry about highmem.
- for (loop = 0; loop < zone->spanned_pages; loop++) {
- unsigned long pfn = zone->zone_start_pfn + loop;
- struct page *page;
- int chunk_size;
-
- if (!pfn_valid(pfn)) {
- continue;
- }
-
- chunk_size = toi_size_of_free_region(zone, pfn);
- if (chunk_size) {
- loop += chunk_size - 1;
- continue;
- }
-
- page = pfn_to_page(pfn);
-
- if (PageNosave(page) || !saveable_page(zone, pfn)) {
- continue;
- }
-
- if (PageTOI_Untracked(page)) {
- continue;
- }
-
- /**
- * Do we need to (re)protect the page?
- * If it is already protected (PageTOI_RO), there is
- * nothing to do - skip the following.
- * If it is marked as dirty (PageTOI_Dirty), it was
- * either free and has been allocated or has been
- * written to and marked dirty. Reset the dirty flag
- * and (re)apply the protection.
- */
- if (!PageTOI_RO(page)) {
- toi_reset_dirtiness_one(pfn, verbose);
- }
- }
- }
-
- pr_debug(KERN_EMERG "Done resetting dirtiness.\n");
-
- if (allocated_map) {
- toi_free_bitmap(&free_map);
- }
- return 0;
-}
-
-static int toi_reset_dirtiness_initcall(void)
-{
- if (toi_do_incremental_initcall) {
- pr_info("TuxOnIce: Enabling dirty page tracking.\n");
- toi_reset_dirtiness(0);
- }
- return 1;
-}
-extern void toi_generate_untracked_map(void);
-
-// Leave early_initcall for pages to register untracked sections.
-early_initcall(toi_reset_dirtiness_initcall);
-
-static int __init toi_incremental_initcall_setup(char *str)
-{
- int value;
-
- if (sscanf(str, "=%d", &value) && value)
- toi_do_incremental_initcall = value;
-
- return 1;
-}
-__setup("toi_incremental_initcall", toi_incremental_initcall_setup);
diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c
deleted file mode 100644
index 3c62c2682..000000000
--- a/kernel/power/tuxonice_io.c
+++ /dev/null
@@ -1,1932 +0,0 @@
-/*
- * kernel/power/tuxonice_io.c
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains high level IO routines for hibernating.
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/version.h>
-#include <linux/utsname.h>
-#include <linux/mount.h>
-#include <linux/highmem.h>
-#include <linux/kthread.h>
-#include <linux/cpu.h>
-#include <linux/fs_struct.h>
-#include <linux/bio.h>
-#include <linux/fs_uuid.h>
-#include <linux/kmod.h>
-#include <asm/tlbflush.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_pageflags.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_alloc.h"
-char alt_resume_param[256];
-
-/* Version read from image header at resume */
-static int toi_image_header_version;
-
-#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \
-	if (likely(toi_image_header_version >= VERS)) \
-		if (toiActiveAllocator->rw_header_chunk(READ, NULL, \
-			(char *) &VAR, sizeof(VAR))) { \
-			abort_hibernate(TOI_FAILED_IO, \
-				"Failed to read " DESC "."); \
-			ERR_ACT; \
-		} \
-} while(0)
-
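-/*
- * Illustrative use (hypothetical caller; DESC must be a string literal):
- *
- *	read_if_version(3, toi_max_workers, "TuxOnIce max workers",
- *			goto out_remove_image);
- *
- * Fields that an older image header predates are simply skipped, and
- * ERR_ACT runs only if the read itself fails.
- */
-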
-/* Variables shared between threads and updated under the mutex */
-static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result;
-static int io_index, io_nextupdate, io_pc, io_pc_step;
-static DEFINE_MUTEX(io_mutex);
-static DEFINE_PER_CPU(struct page *, last_sought);
-static DEFINE_PER_CPU(struct page *, last_high_page);
-static DEFINE_PER_CPU(char *, checksum_locn);
-static DEFINE_PER_CPU(struct pbe *, last_low_page);
-static atomic_t io_count;
-atomic_t toi_io_workers;
-
-static int using_flusher;
-
-DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher);
-
-int toi_bio_queue_flusher_should_finish;
-
-int toi_max_workers;
-
-static char *image_version_error = "The image header version is newer than " \
- "this kernel supports.";
-
-struct toi_module_ops *first_filter;
-
-static atomic_t toi_num_other_threads;
-static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue);
-enum toi_worker_commands {
- TOI_IO_WORKER_STOP,
- TOI_IO_WORKER_RUN,
- TOI_IO_WORKER_EXIT
-};
-static enum toi_worker_commands toi_worker_command;
-
-/**
- * toi_attempt_to_parse_resume_device - determine if we can hibernate
- *
- * Can we hibernate, using the current resume= parameter?
- **/
-int toi_attempt_to_parse_resume_device(int quiet)
-{
- struct list_head *Allocator;
- struct toi_module_ops *thisAllocator;
- int result, returning = 0;
-
- if (toi_activate_storage(0))
- return 0;
-
- toiActiveAllocator = NULL;
- clear_toi_state(TOI_RESUME_DEVICE_OK);
- clear_toi_state(TOI_CAN_RESUME);
- clear_result_state(TOI_ABORTED);
-
- if (!toiNumAllocators) {
- if (!quiet)
- printk(KERN_INFO "TuxOnIce: No storage allocators have "
- "been registered. Hibernating will be "
- "disabled.\n");
- goto cleanup;
- }
-
- list_for_each(Allocator, &toiAllocators) {
- thisAllocator = list_entry(Allocator, struct toi_module_ops,
- type_list);
-
- /*
- * Not sure why you'd want to disable an allocator, but
- * we should honour the flag if we're providing it
- */
- if (!thisAllocator->enabled)
- continue;
-
- result = thisAllocator->parse_sig_location(
- resume_file, (toiNumAllocators == 1),
- quiet);
-
- switch (result) {
- case -EINVAL:
- /* For this allocator, but not a valid
- * configuration. Error already printed. */
- goto cleanup;
-
- case 0:
- /* For this allocator and valid. */
- toiActiveAllocator = thisAllocator;
-
- set_toi_state(TOI_RESUME_DEVICE_OK);
- set_toi_state(TOI_CAN_RESUME);
- returning = 1;
- goto cleanup;
- }
- }
- if (!quiet)
- printk(KERN_INFO "TuxOnIce: No matching enabled allocator "
- "found. Resuming disabled.\n");
-cleanup:
- toi_deactivate_storage(0);
- return returning;
-}
-
-void attempt_to_parse_resume_device2(void)
-{
- toi_prepare_usm();
- toi_attempt_to_parse_resume_device(0);
- toi_cleanup_usm();
-}
-
-void save_restore_alt_param(int replace, int quiet)
-{
- static char resume_param_save[255];
- static unsigned long toi_state_save;
-
- if (replace) {
- toi_state_save = toi_state;
- strcpy(resume_param_save, resume_file);
- strcpy(resume_file, alt_resume_param);
- } else {
- strcpy(resume_file, resume_param_save);
- toi_state = toi_state_save;
- }
- toi_attempt_to_parse_resume_device(quiet);
-}
-
-void attempt_to_parse_alt_resume_param(void)
-{
- int ok = 0;
-
- /* Temporarily set resume_param to the poweroff value */
- if (!strlen(alt_resume_param))
- return;
-
- printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n");
- save_restore_alt_param(SAVE, NOQUIET);
- if (test_toi_state(TOI_CAN_RESUME))
- ok = 1;
-
- printk(KERN_INFO "=== Done ===\n");
- save_restore_alt_param(RESTORE, QUIET);
-
- /* If not ok, clear the string */
- if (ok)
- return;
-
- printk(KERN_INFO "Can't resume from that location; clearing "
- "alt_resume_param.\n");
- alt_resume_param[0] = '\0';
-}
-
-/**
- * noresume_reset_modules - reset data structures when not resuming
- *
- * When we read the start of an image, modules (and especially the
- * active allocator) might need to reset data structures if we
- * decide to remove the image rather than resuming from it.
- **/
-static void noresume_reset_modules(void)
-{
- struct toi_module_ops *this_filter;
-
- list_for_each_entry(this_filter, &toi_filters, type_list)
- if (this_filter->noresume_reset)
- this_filter->noresume_reset();
-
- if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
- toiActiveAllocator->noresume_reset();
-}
-
-/**
- * fill_toi_header - fill the hibernate header structure
- * @sh: Header data structure to be filled.
- **/
-static int fill_toi_header(struct toi_header *sh)
-{
- int i, error;
-
- error = init_header((struct swsusp_info *) sh);
- if (error)
- return error;
-
- sh->pagedir = pagedir1;
- sh->pageset_2_size = pagedir2.size;
- sh->param0 = toi_result;
- sh->param1 = toi_bkd.toi_action;
- sh->param2 = toi_bkd.toi_debug_state;
- sh->param3 = toi_bkd.toi_default_console_level;
- sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
- for (i = 0; i < 4; i++)
- sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
- sh->bkd = boot_kernel_data_buffer;
- return 0;
-}
-
-/**
- * rw_init_modules - initialize modules
- * @rw: Whether we are reading or writing an image.
- * @which: Section of the image being processed.
- *
- * Iterate over modules, preparing the ones that will be used to read or write
- * data.
- **/
-static int rw_init_modules(int rw, int which)
-{
- struct toi_module_ops *this_module;
- /* Initialise page transformers */
- list_for_each_entry(this_module, &toi_filters, type_list) {
- if (!this_module->enabled)
- continue;
- if (this_module->rw_init && this_module->rw_init(rw, which)) {
- abort_hibernate(TOI_FAILED_MODULE_INIT,
- "Failed to initialize the %s filter.",
- this_module->name);
- return 1;
- }
- }
-
- /* Initialise allocator */
- if (toiActiveAllocator->rw_init(rw, which)) {
- abort_hibernate(TOI_FAILED_MODULE_INIT,
- "Failed to initialise the allocator.");
- return 1;
- }
-
- /* Initialise other modules */
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- this_module->type == FILTER_MODULE ||
- this_module->type == WRITER_MODULE)
- continue;
- if (this_module->rw_init && this_module->rw_init(rw, which)) {
- set_abort_result(TOI_FAILED_MODULE_INIT);
- printk(KERN_INFO "Setting aborted flag due to module "
- "init failure.\n");
- return 1;
- }
- }
-
- return 0;
-}
-
-/**
- * rw_cleanup_modules - cleanup modules
- * @rw: Whether we are reading or writing an image.
- *
- * Cleanup components after reading or writing a set of pages.
- * Only the allocator may fail.
- **/
-static int rw_cleanup_modules(int rw)
-{
- struct toi_module_ops *this_module;
- int result = 0;
-
- /* Cleanup other modules */
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- this_module->type == FILTER_MODULE ||
- this_module->type == WRITER_MODULE)
- continue;
- if (this_module->rw_cleanup)
- result |= this_module->rw_cleanup(rw);
- }
-
- /* Flush data and cleanup */
- list_for_each_entry(this_module, &toi_filters, type_list) {
- if (!this_module->enabled)
- continue;
- if (this_module->rw_cleanup)
- result |= this_module->rw_cleanup(rw);
- }
-
- result |= toiActiveAllocator->rw_cleanup(rw);
-
- return result;
-}
-
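-/*
- * copy_page_from_orig_page - find the page into which a read-in page
- * should be restored. The pbe lists are page-sized arrays of
- * PBES_PER_PAGE entries sorted by orig_address, with the last entry of
- * each page chaining to the next, so we walk the chain first and then
- * binary search within the final page. The per-cpu last_sought /
- * last_low_page / last_high_page caches let successive ascending
- * lookups resume from where the previous one finished.
- */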
-static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high)
-{
- int index, min, max;
- struct page *high_page = NULL,
- **my_last_high_page = raw_cpu_ptr(&last_high_page),
- **my_last_sought = raw_cpu_ptr(&last_sought);
- struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page);
- void *compare;
-
- if (is_high) {
- if (*my_last_sought && *my_last_high_page &&
- *my_last_sought < orig_page)
- high_page = *my_last_high_page;
- else
- high_page = (struct page *) restore_highmem_pblist;
- this = (struct pbe *) kmap(high_page);
- compare = orig_page;
- } else {
- if (*my_last_sought && *my_last_low_page &&
- *my_last_sought < orig_page)
- this = *my_last_low_page;
- else
- this = restore_pblist;
- compare = page_address(orig_page);
- }
-
- *my_last_sought = orig_page;
-
- /* Locate page containing pbe */
- while (this[PBES_PER_PAGE - 1].next &&
- this[PBES_PER_PAGE - 1].orig_address < compare) {
- if (is_high) {
- struct page *next_high_page = (struct page *)
- this[PBES_PER_PAGE - 1].next;
- kunmap(high_page);
- this = kmap(next_high_page);
- high_page = next_high_page;
- } else
- this = this[PBES_PER_PAGE - 1].next;
- }
-
- /* Do a binary search within the page */
- min = 0;
- max = PBES_PER_PAGE;
- index = PBES_PER_PAGE / 2;
- while (max - min) {
- if (!this[index].orig_address ||
- this[index].orig_address > compare)
- max = index;
- else if (this[index].orig_address == compare) {
- if (is_high) {
- struct page *page = this[index].address;
- *my_last_high_page = high_page;
- kunmap(high_page);
- return page;
- }
- *my_last_low_page = this;
- return virt_to_page(this[index].address);
- } else
- min = index;
- index = ((max + min) / 2);
-	}
-
- if (is_high)
- kunmap(high_page);
-
- abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for"
- " orig page %p. This[min].orig_address=%p.\n", orig_page,
- this[index].orig_address);
- return NULL;
-}
-
-/**
- * write_next_page - write the next page in a pageset
- * @data_pfn: The pfn where the next data to write is located.
- * @my_io_index: The index of the page in the pageset.
- * @write_pfn: The pfn number to write in the image (where the data belongs).
- *
- * Get the pfn of the next page to write, map the page if necessary and do the
- * write.
- **/
-static int write_next_page(unsigned long *data_pfn, int *my_io_index,
- unsigned long *write_pfn)
-{
- struct page *page;
- char **my_checksum_locn = raw_cpu_ptr(&checksum_locn);
- int result = 0, was_present;
-
- *data_pfn = memory_bm_next_pfn(io_map, 0);
-
- /* Another thread could have beaten us to it. */
- if (*data_pfn == BM_END_OF_MAP) {
- if (atomic_read(&io_count)) {
- printk(KERN_INFO "Ran out of pfns but io_count is "
- "still %d.\n", atomic_read(&io_count));
- BUG();
- }
- mutex_unlock(&io_mutex);
- return -ENODATA;
- }
-
- *my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
-
- memory_bm_clear_bit(io_map, 0, *data_pfn);
- page = pfn_to_page(*data_pfn);
-
- was_present = kernel_page_present(page);
- if (!was_present)
- kernel_map_pages(page, 1, 1);
-
- if (io_pageset == 1)
- *write_pfn = memory_bm_next_pfn(pageset1_map, 0);
- else {
- *write_pfn = *data_pfn;
- *my_checksum_locn = tuxonice_get_next_checksum();
- }
-
- TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index);
-
- mutex_unlock(&io_mutex);
-
- if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn))
- return 1;
-
- result = first_filter->write_page(*write_pfn, TOI_PAGE, page,
- PAGE_SIZE);
-
- if (!was_present)
- kernel_map_pages(page, 1, 0);
-
- return result;
-}
-
-/**
- * read_next_page - read the next page in a pageset
- * @my_io_index: The index of the page in the pageset.
- * @write_pfn: The pfn in which the data belongs.
- *
- * Read a page of the image into our buffer. It can happen (here and in the
- * write routine) that threads don't get run until after other CPUs have done
- * all the work. This was the cause of the long-standing issue with
- * occasionally getting -ENODATA errors at the end of reading the image. We
- * therefore need to check there's actually a page to read before trying to
- * retrieve one.
- **/
-
-static int read_next_page(int *my_io_index, unsigned long *write_pfn,
- struct page *buffer)
-{
- unsigned int buf_size = PAGE_SIZE;
- unsigned long left = atomic_read(&io_count);
-
- if (!left)
- return -ENODATA;
-
- /* Start off assuming the page we read isn't resaved */
- *my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
-
- mutex_unlock(&io_mutex);
-
- /*
- * Are we aborting? If so, don't submit any more I/O as
- * resetting the resume_attempted flag (from ui.c) will
- * clear the bdev flags, making this thread oops.
- */
- if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
- atomic_dec(&toi_io_workers);
- if (!atomic_read(&toi_io_workers)) {
- /*
- * So we can be sure we'll have memory for
- * marking that we haven't resumed.
- */
- rw_cleanup_modules(READ);
- set_toi_state(TOI_IO_STOPPED);
- }
- while (1)
- schedule();
- }
-
- /*
- * See toi_bio_read_page in tuxonice_bio.c:
- * read the next page in the image.
- */
- return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size);
-}
-
-static void use_read_page(unsigned long write_pfn, struct page *buffer)
-{
- struct page *final_page = pfn_to_page(write_pfn),
- *copy_page = final_page;
- char *virt, *buffer_virt;
- int was_present, cpu = smp_processor_id();
-
- if (io_pageset == 1 && (!pageset1_copy_map ||
- !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) {
- int is_high = PageHighMem(final_page);
- copy_page = copy_page_from_orig_page(is_high ? (void *) write_pfn : final_page, is_high);
- }
-
- if (!memory_bm_test_bit(io_map, cpu, write_pfn)) {
- int test = !memory_bm_test_bit(io_map, cpu, write_pfn);
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test);
- mutex_lock(&io_mutex);
-		atomic_inc(&io_count);
- mutex_unlock(&io_mutex);
- return;
- }
-
- virt = kmap(copy_page);
- buffer_virt = kmap(buffer);
- was_present = kernel_page_present(copy_page);
- if (!was_present)
- kernel_map_pages(copy_page, 1, 1);
- memcpy(virt, buffer_virt, PAGE_SIZE);
- if (!was_present)
- kernel_map_pages(copy_page, 1, 0);
- kunmap(copy_page);
- kunmap(buffer);
- memory_bm_clear_bit(io_map, cpu, write_pfn);
- TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset);
-}
-
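-/*
- * status_update - refresh the progress display, adding a time estimate
- * once throughput is known: pgs_per_s = 1000 * done / msec, so e.g.
- * 10000 pages in 20000ms gives 500 pages/s, and 30000 remaining pages
- * yield DIV_ROUND_UP(30000, 500) = 60 seconds left.
- */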
-static unsigned long status_update(int writing, unsigned long done,
- unsigned long ticks)
-{
- int cs_index = writing ? 0 : 1;
- unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks;
- unsigned long msec = jiffies_to_msecs(abs(ticks_so_far));
- unsigned long pgs_per_s, estimate = 0, pages_left;
-
- if (msec) {
- pages_left = io_barmax - done;
- pgs_per_s = 1000 * done / msec;
- if (pgs_per_s)
- estimate = DIV_ROUND_UP(pages_left, pgs_per_s);
- }
-
- if (estimate && ticks > HZ / 2)
- return toi_update_status(done, io_barmax,
- " %d/%d MB (%lu sec left)",
- MB(done+1), MB(io_barmax), estimate);
-
- return toi_update_status(done, io_barmax, " %d/%d MB",
- MB(done+1), MB(io_barmax));
-}
-
-/**
- * worker_rw_loop - main loop to read/write pages
- *
- * The main I/O loop for reading or writing pages. The io_map bitmap is used to
- * track the pages to read/write.
- * If we are reading, the pages are loaded to their final (mapped) pfn.
- * @data is non-zero iff this is a thread started via toi_start_other_threads().
- * In that case, we stay in here until told to quit.
- **/
-static int worker_rw_loop(void *data)
-{
- unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4,
- jif_index = 1, start_time = jiffies, thread_num;
- int result = 0, my_io_index = 0, last_worker;
- struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP);
- cpumask_var_t orig_mask;
-
- if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) {
- printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data);
- result = -ENOMEM;
- goto out;
- }
-
- cpumask_copy(orig_mask, tsk_cpus_allowed(current));
-
- current->flags |= PF_NOFREEZE;
-
-top:
- mutex_lock(&io_mutex);
- thread_num = atomic_read(&toi_io_workers);
-
- cpumask_copy(tsk_cpus_allowed(current), orig_mask);
- schedule();
-
- atomic_inc(&toi_io_workers);
-
- while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) &&
- !(io_write && test_result_state(TOI_ABORTED)) &&
- toi_worker_command == TOI_IO_WORKER_RUN) {
- if (!thread_num && jiffies > next_jiffies) {
- next_jiffies += HZ / 4;
- if (toiActiveAllocator->update_throughput_throttle)
- toiActiveAllocator->update_throughput_throttle(
- jif_index);
- jif_index++;
- }
-
- /*
- * What page to use? If reading, don't know yet which page's
- * data will be read, so always use the buffer. If writing,
- * use the copy (Pageset1) or original page (Pageset2), but
- * always write the pfn of the original page.
- */
- if (io_write)
- result = write_next_page(&data_pfn, &my_io_index,
- &write_pfn);
- else /* Reading */
- result = read_next_page(&my_io_index, &write_pfn,
- buffer);
-
- if (result) {
- mutex_lock(&io_mutex);
- /* Nothing to do? */
- if (result == -ENODATA) {
- toi_message(TOI_IO, TOI_VERBOSE, 0,
- "Thread %d has no more work.",
- smp_processor_id());
- break;
- }
-
- io_result = result;
-
- if (io_write) {
- printk(KERN_INFO "Write chunk returned %d.\n",
- result);
- abort_hibernate(TOI_FAILED_IO,
- "Failed to write a chunk of the "
- "image.");
- break;
- }
-
- if (io_pageset == 1) {
- printk(KERN_ERR "\nBreaking out of I/O loop "
- "because of result code %d.\n", result);
- break;
- }
- panic("Read chunk returned (%d)", result);
- }
-
- /*
- * Discard reads of resaved pages while reading ps2
- * and unwanted pages while rereading ps2 when aborting.
- */
- if (!io_write) {
- if (!PageResave(pfn_to_page(write_pfn)))
- use_read_page(write_pfn, buffer);
- else {
- mutex_lock(&io_mutex);
- toi_message(TOI_IO, TOI_VERBOSE, 0,
- "Resaved %ld.", write_pfn);
- atomic_inc(&io_count);
- mutex_unlock(&io_mutex);
- }
- }
-
- if (!thread_num) {
-			if (my_io_index + io_base > io_nextupdate)
- io_nextupdate = status_update(io_write,
- my_io_index + io_base,
- jiffies - start_time);
-
- if (my_io_index > io_pc) {
- printk(KERN_CONT "...%d%%", 20 * io_pc_step);
- io_pc_step++;
- io_pc = io_finish_at * io_pc_step / 5;
- }
- }
-
- toi_cond_pause(0, NULL);
-
- /*
- * Subtle: If there's less I/O still to be done than threads
- * running, quit. This stops us doing I/O beyond the end of
- * the image when reading.
- *
- * Possible race condition. Two threads could do the test at
- * the same time; one should exit and one should continue.
- * Therefore we take the mutex before comparing and exiting.
- */
-
- mutex_lock(&io_mutex);
- }
-
- last_worker = atomic_dec_and_test(&toi_io_workers);
- toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers));
- mutex_unlock(&io_mutex);
-
- if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) {
- /* Were we the last thread and we're using a flusher thread? */
- if (last_worker && using_flusher) {
- toiActiveAllocator->finish_all_io();
- }
- /* First, if we're doing I/O, wait for it to finish */
- wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN);
- /* Then wait to be told what to do next */
- wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP);
- if (toi_worker_command == TOI_IO_WORKER_RUN)
- goto top;
- }
-
- if (thread_num)
- atomic_dec(&toi_num_other_threads);
-
-out:
- toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num);
- toi__free_page(28, buffer);
- free_cpumask_var(orig_mask);
-
- return result;
-}
-
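-/*
- * toi_start_other_threads - spawn additional bound I/O worker threads.
- *
- * Up to (toi_max_workers ? toi_max_workers : num_online_cpus()) - 1
- * kthreads running worker_rw_loop() are created, one per online CPU
- * other than the current one. The command starts as TOI_IO_WORKER_STOP,
- * so each worker parks on toi_worker_wait_queue until do_rw_loop()
- * switches it to TOI_IO_WORKER_RUN.
- */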
-int toi_start_other_threads(void)
-{
- int cpu;
- struct task_struct *p;
- int to_start = (toi_max_workers ? toi_max_workers : num_online_cpus()) - 1;
- unsigned long num_started = 0;
-
- if (test_action_state(TOI_NO_MULTITHREADED_IO))
- return 0;
-
- toi_worker_command = TOI_IO_WORKER_STOP;
-
- for_each_online_cpu(cpu) {
- if (num_started == to_start)
- break;
-
- if (cpu == smp_processor_id())
- continue;
-
-		p = kthread_create_on_node(worker_rw_loop,
-				(void *) (num_started + 1),
-				cpu_to_node(cpu), "ktoi_io/%d", cpu);
- if (IS_ERR(p)) {
- printk(KERN_ERR "ktoi_io for %i failed\n", cpu);
- continue;
- }
- kthread_bind(p, cpu);
- p->flags |= PF_MEMALLOC;
- wake_up_process(p);
- num_started++;
- atomic_inc(&toi_num_other_threads);
- }
-
- toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started);
- return num_started;
-}
-
-void toi_stop_other_threads(void)
-{
- toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads.");
- toi_worker_command = TOI_IO_WORKER_EXIT;
- wake_up(&toi_worker_wait_queue);
-}
-
-/**
- * do_rw_loop - main highlevel function for reading or writing pages
- *
- * Create the io_map bitmap and call worker_rw_loop to perform I/O operations.
- **/
-static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags,
- int base, int barmax, int pageset)
-{
- int index = 0, cpu, result = 0, workers_started;
- unsigned long pfn, next;
-
- first_filter = toi_get_next_filter(NULL);
-
- if (!finish_at)
- return 0;
-
- io_write = write;
- io_finish_at = finish_at;
- io_base = base;
- io_barmax = barmax;
- io_pageset = pageset;
- io_index = 0;
- io_pc = io_finish_at / 5;
- io_pc_step = 1;
- io_result = 0;
- io_nextupdate = base + 1;
- toi_bio_queue_flusher_should_finish = 0;
-
- for_each_online_cpu(cpu) {
- per_cpu(last_sought, cpu) = NULL;
- per_cpu(last_low_page, cpu) = NULL;
- per_cpu(last_high_page, cpu) = NULL;
- }
-
- /* Ensure all bits clear */
- memory_bm_clear(io_map);
-
- memory_bm_position_reset(io_map);
- next = memory_bm_next_pfn(io_map, 0);
-
- BUG_ON(next != BM_END_OF_MAP);
-
- /* Set the bits for the pages to write */
- memory_bm_position_reset(pageflags);
-
- pfn = memory_bm_next_pfn(pageflags, 0);
- toi_trace_index++;
-
- while (pfn != BM_END_OF_MAP && index < finish_at) {
- TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at);
- memory_bm_set_bit(io_map, 0, pfn);
- pfn = memory_bm_next_pfn(pageflags, 0);
- index++;
- }
-
- BUG_ON(next != BM_END_OF_MAP || index < finish_at);
-
- memory_bm_position_reset(io_map);
- toi_trace_index++;
-
- atomic_set(&io_count, finish_at);
-
- memory_bm_position_reset(pageset1_map);
-
- mutex_lock(&io_mutex);
-
- clear_toi_state(TOI_IO_STOPPED);
-
- using_flusher = (atomic_read(&toi_num_other_threads) &&
- toiActiveAllocator->io_flusher &&
- !test_action_state(TOI_NO_FLUSHER_THREAD));
-
- workers_started = atomic_read(&toi_num_other_threads);
-
- memory_bm_position_reset(io_map);
- memory_bm_position_reset(pageset1_copy_map);
-
- toi_worker_command = TOI_IO_WORKER_RUN;
- wake_up(&toi_worker_wait_queue);
-
- mutex_unlock(&io_mutex);
-
- if (using_flusher)
- result = toiActiveAllocator->io_flusher(write);
- else
- worker_rw_loop(NULL);
-
- while (atomic_read(&toi_io_workers))
- schedule();
-
- printk(KERN_CONT "\n");
-
- toi_worker_command = TOI_IO_WORKER_STOP;
- wake_up(&toi_worker_wait_queue);
-
- if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
- if (!atomic_read(&toi_io_workers)) {
- rw_cleanup_modules(READ);
- set_toi_state(TOI_IO_STOPPED);
- }
- while (1)
- schedule();
- }
- set_toi_state(TOI_IO_STOPPED);
-
- if (!io_result && !result && !test_result_state(TOI_ABORTED)) {
- unsigned long next;
-
- toi_update_status(io_base + io_finish_at, io_barmax,
- " %d/%d MB ",
- MB(io_base + io_finish_at), MB(io_barmax));
-
- memory_bm_position_reset(io_map);
- next = memory_bm_next_pfn(io_map, 0);
- if (next != BM_END_OF_MAP) {
- printk(KERN_INFO "Finished I/O loop but still work to "
- "do?\nFinish at = %d. io_count = %d.\n",
- finish_at, atomic_read(&io_count));
- printk(KERN_INFO "I/O bitmap still records work to do."
- "%ld.\n", next);
- BUG();
- do {
- cpu_relax();
- } while (0);
- }
- }
-
- return io_result ? io_result : result;
-}
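-
-/*
- * Worker handshake, in brief: do_rw_loop() sets the io_map bits for the
- * pages to transfer, switches toi_worker_command to TOI_IO_WORKER_RUN and
- * wakes toi_worker_wait_queue; parked worker_rw_loop() threads then drain
- * io_count together. Once fewer pages remain than there are workers,
- * threads drop out of the loop and do_rw_loop() parks them again by
- * setting TOI_IO_WORKER_STOP.
- */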
-
-/**
- * write_pageset - write a pageset to disk.
- * @pagedir: Which pagedir to write.
- *
- * Returns:
- * Zero on success or non-zero on failure.
- **/
-int write_pageset(struct pagedir *pagedir)
-{
- int finish_at, base = 0;
- int barmax = pagedir1.size + pagedir2.size;
- long error = 0;
- struct memory_bitmap *pageflags;
- unsigned long start_time, end_time;
-
- /*
- * Even if there is nothing to read or write, the allocator
-	 * may need the init/cleanup for its housekeeping (e.g.
-	 * pageset1 may start where pageset2 ends when writing).
- */
- finish_at = pagedir->size;
-
- if (pagedir->id == 1) {
- toi_prepare_status(DONT_CLEAR_BAR,
- "Writing kernel & process data...");
- base = pagedir2.size;
- if (test_action_state(TOI_TEST_FILTER_SPEED) ||
- test_action_state(TOI_TEST_BIO))
- pageflags = pageset1_map;
- else
- pageflags = pageset1_copy_map;
- } else {
- toi_prepare_status(DONT_CLEAR_BAR, "Writing caches...");
- pageflags = pageset2_map;
- }
-
- start_time = jiffies;
-
- if (rw_init_modules(WRITE, pagedir->id)) {
- abort_hibernate(TOI_FAILED_MODULE_INIT,
- "Failed to initialise modules for writing.");
- error = 1;
- }
-
- if (!error)
- error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax,
- pagedir->id);
-
- if (rw_cleanup_modules(WRITE) && !error) {
- abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
- "Failed to cleanup after writing.");
- error = 1;
- }
-
- end_time = jiffies;
-
- if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
-		toi_bkd.toi_io_time[0][0] += finish_at;
-		toi_bkd.toi_io_time[0][1] += (end_time - start_time);
- }
-
- return error;
-}
-
-/**
- * read_pageset - highlevel function to read a pageset from disk
- * @pagedir: pageset to read
- * @overwrittenpagesonly: Whether to read the whole pageset or
- * only part of it.
- *
- * Returns:
- * Zero on success or non-zero on failure.
- **/
-static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly)
-{
- int result = 0, base = 0;
- int finish_at = pagedir->size;
- int barmax = pagedir1.size + pagedir2.size;
- struct memory_bitmap *pageflags;
- unsigned long start_time, end_time;
-
- if (pagedir->id == 1) {
- toi_prepare_status(DONT_CLEAR_BAR,
- "Reading kernel & process data...");
- pageflags = pageset1_map;
- } else {
- toi_prepare_status(DONT_CLEAR_BAR, "Reading caches...");
- if (overwrittenpagesonly) {
- barmax = min(pagedir1.size, pagedir2.size);
- finish_at = min(pagedir1.size, pagedir2.size);
- } else
- base = pagedir1.size;
- pageflags = pageset2_map;
- }
-
- start_time = jiffies;
-
- if (rw_init_modules(READ, pagedir->id)) {
- toiActiveAllocator->remove_image();
- result = 1;
- } else
- result = do_rw_loop(READ, finish_at, pageflags, base, barmax,
- pagedir->id);
-
- if (rw_cleanup_modules(READ) && !result) {
- abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
- "Failed to cleanup after reading.");
- result = 1;
- }
-
- /* Statistics */
- end_time = jiffies;
-
- if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
-		toi_bkd.toi_io_time[1][0] += finish_at;
-		toi_bkd.toi_io_time[1][1] += (end_time - start_time);
- }
-
- return result;
-}
-
-/**
- * write_module_configs - store each module's configuration
- *
- * The configuration for each module is stored in the image header.
- * Returns: Int
- * Zero on success, Error value otherwise.
- **/
-static int write_module_configs(void)
-{
- struct toi_module_ops *this_module;
- char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP);
- int len, index = 1;
- struct toi_module_header toi_module_header;
-
- if (!buffer) {
- printk(KERN_INFO "Failed to allocate a buffer for saving "
- "module configuration info.\n");
- return -ENOMEM;
- }
-
- /*
- * We have to know which data goes with which module, so we at
- * least write a length of zero for a module. Note that we are
- * also assuming every module's config data takes <= PAGE_SIZE.
- */
-
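-	/*
-	 * Resulting header stream (sketch):
-	 *
-	 *   [toi_module_header][int len][len bytes of config]  per module,
-	 *
-	 * terminated by a toi_module_header whose name[0] == '\0'.
-	 * read_module_configs() below consumes the same layout.
-	 */
-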
- /* For each module (in registration order) */
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled || !this_module->storage_needed ||
- (this_module->type == WRITER_MODULE &&
- toiActiveAllocator != this_module))
- continue;
-
- /* Get the data from the module */
- len = 0;
- if (this_module->save_config_info)
- len = this_module->save_config_info(buffer);
-
- /* Save the details of the module */
- toi_module_header.enabled = this_module->enabled;
- toi_module_header.type = this_module->type;
- toi_module_header.index = index++;
- strncpy(toi_module_header.name, this_module->name,
- sizeof(toi_module_header.name));
- toiActiveAllocator->rw_header_chunk(WRITE,
- this_module,
- (char *) &toi_module_header,
- sizeof(toi_module_header));
-
- /* Save the size of the data and any data returned */
- toiActiveAllocator->rw_header_chunk(WRITE,
- this_module,
- (char *) &len, sizeof(int));
- if (len)
- toiActiveAllocator->rw_header_chunk(
- WRITE, this_module, buffer, len);
- }
-
- /* Write a blank header to terminate the list */
- toi_module_header.name[0] = '\0';
- toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- (char *) &toi_module_header, sizeof(toi_module_header));
-
- toi_free_page(22, (unsigned long) buffer);
- return 0;
-}
-
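Together with read_one_module_config() and read_module_configs() below, the code above defines a simple self-describing stream: a struct toi_module_header, an int length, then length bytes of module data, repeated until a header with an empty name terminates the list. A minimal reader for that framing might look as follows — read_chunk() is a hypothetical stand-in for the allocator's rw_header_chunk(READ, ...), and buffer is a page-sized scratch area as in the writer:

        struct toi_module_header hdr;
        int len;

        for (;;) {
                if (read_chunk(&hdr, sizeof(hdr)))        /* hypothetical */
                        return -EIO;
                if (!hdr.name[0])          /* blank header: end of list */
                        break;
                if (read_chunk(&len, sizeof(int)))
                        return -EIO;
                if (len && read_chunk(buffer, len))       /* <= PAGE_SIZE */
                        return -EIO;
                /* hand (hdr, buffer, len) to the matching module */
        }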
-/**
- * read_one_module_config - read and configure one module
- *
- * Read the configuration for one module, and configure the module
- * to match if it is loaded.
- *
- * Returns: Int
- * Zero on success, Error value otherwise.
- **/
-static int read_one_module_config(struct toi_module_header *header)
-{
- struct toi_module_ops *this_module;
- int result, len;
- char *buffer;
-
- /* Find the module */
- this_module = toi_find_module_given_name(header->name);
-
- if (!this_module) {
- if (header->enabled) {
- toi_early_boot_message(1, TOI_CONTINUE_REQ,
- "It looks like we need module %s for reading "
- "the image but it hasn't been registered.\n",
- header->name);
- if (!(test_toi_state(TOI_CONTINUE_REQ)))
- return -EINVAL;
- } else
- printk(KERN_INFO "Module %s configuration data found, "
- "but the module hasn't registered. Looks like "
- "it was disabled, so we're ignoring its data.",
- header->name);
- }
-
- /* Get the length of the data (if any) */
- result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len,
- sizeof(int));
- if (result) {
- printk(KERN_ERR "Failed to read the length of the module %s's"
- " configuration data.\n",
- header->name);
- return -EINVAL;
- }
-
- /* Read any data and pass to the module (if we found one) */
- if (!len)
- return 0;
-
- buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP);
-
- if (!buffer) {
- printk(KERN_ERR "Failed to allocate a buffer for reloading "
- "module configuration info.\n");
- return -ENOMEM;
- }
-
- toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len);
-
- if (!this_module)
- goto out;
-
- if (!this_module->save_config_info)
- printk(KERN_ERR "Huh? Module %s appears to have a "
- "save_config_info, but not a load_config_info "
- "function!\n", this_module->name);
- else
- this_module->load_config_info(buffer, len);
-
- /*
- * Now move this module to the tail of its lists. This will put it in
- * order. Any new modules will end up at the top of the lists. They
- * should have been set to disabled when loaded (people will
- * normally not edit an initrd to load a new module and then hibernate
- * without using it!).
- */
-
- toi_move_module_tail(this_module);
-
- this_module->enabled = header->enabled;
-
-out:
- toi_free_page(23, (unsigned long) buffer);
- return 0;
-}
-
-/**
- * read_module_configs - reload module configurations from the image header.
- *
- * Returns: Int
- * Zero on success or an error code.
- **/
-static int read_module_configs(void)
-{
- int result = 0;
- struct toi_module_header toi_module_header;
- struct toi_module_ops *this_module;
-
- /* All modules are initially disabled. That way, if we have a module
- * loaded now that wasn't loaded when we hibernated, it won't be used
- * in trying to read the data.
- */
- list_for_each_entry(this_module, &toi_modules, module_list)
- this_module->enabled = 0;
-
- /* Get the first module header */
- result = toiActiveAllocator->rw_header_chunk(READ, NULL,
- (char *) &toi_module_header,
- sizeof(toi_module_header));
- if (result) {
- printk(KERN_ERR "Failed to read the next module header.\n");
- return -EINVAL;
- }
-
- /* For each module (in registration order) */
- while (toi_module_header.name[0]) {
- result = read_one_module_config(&toi_module_header);
-
- if (result)
- return -EINVAL;
-
- /* Get the next module header */
- result = toiActiveAllocator->rw_header_chunk(READ, NULL,
- (char *) &toi_module_header,
- sizeof(toi_module_header));
-
- if (result) {
- printk(KERN_ERR "Failed to read the next module "
- "header.\n");
- return -EINVAL;
- }
- }
-
- return 0;
-}
-
-static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev)
-{
- return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 0 : 1;
-}
-
-int fs_info_space_needed(void)
-{
- const struct super_block *sb;
- int result = sizeof(int);
-
- list_for_each_entry(sb, &super_blocks, s_list) {
- struct fs_info *fs;
-
- if (!sb->s_bdev)
- continue;
-
- fs = fs_info_from_block_dev(sb->s_bdev);
- if (save_fs_info(fs, sb->s_bdev))
- result += 16 + sizeof(dev_t) + sizeof(int) +
- fs->last_mount_size;
- free_fs_info(fs);
- }
- return result;
-}
-
-static int fs_info_num_to_save(void)
-{
- const struct super_block *sb;
- int to_save = 0;
-
- list_for_each_entry(sb, &super_blocks, s_list) {
- struct fs_info *fs;
-
- if (!sb->s_bdev)
- continue;
-
- fs = fs_info_from_block_dev(sb->s_bdev);
- if (save_fs_info(fs, sb->s_bdev))
- to_save++;
- free_fs_info(fs);
- }
-
- return to_save;
-}
-
-static int fs_info_save(void)
-{
- const struct super_block *sb;
- int to_save = fs_info_num_to_save();
-
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save,
- sizeof(int))) {
- abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info"
- " to save.");
- return -EIO;
- }
-
- list_for_each_entry(sb, &super_blocks, s_list) {
- struct fs_info *fs;
-
- if (!sb->s_bdev)
- continue;
-
- fs = fs_info_from_block_dev(sb->s_bdev);
- if (save_fs_info(fs, sb->s_bdev)) {
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- &fs->uuid[0], 16)) {
- abort_hibernate(TOI_FAILED_IO, "Failed to "
- "write uuid.");
- return -EIO;
- }
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- (char *) &fs->dev_t, sizeof(dev_t))) {
- abort_hibernate(TOI_FAILED_IO, "Failed to "
- "write dev_t.");
- return -EIO;
- }
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- (char *) &fs->last_mount_size, sizeof(int))) {
- abort_hibernate(TOI_FAILED_IO, "Failed to "
- "write last mount length.");
- return -EIO;
- }
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- fs->last_mount, fs->last_mount_size)) {
- abort_hibernate(TOI_FAILED_IO, "Failed to "
- "write uuid.");
- return -EIO;
- }
- }
- free_fs_info(fs);
- }
- return 0;
-}
-
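Each filesystem record emitted by fs_info_save() is a fixed prefix followed by variable-length data, which is exactly what fs_info_space_needed() above budgets per filesystem (16 + sizeof(dev_t) + sizeof(int) + last_mount_size bytes). Sketched as a layout — the source writes the fields back-to-back rather than through a struct:

        struct fs_info_record {        /* illustrative, not in the source */
                char  uuid[16];        /* filesystem uuid */
                dev_t dev;             /* device number */
                int   last_mount_size; /* length of the data below */
                /* followed by last_mount_size bytes of last-mount data */
        };

fs_info_load_and_check_one() below consumes the same fields in the same order.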
-static int fs_info_load_and_check_one(void)
-{
- char uuid[16], *last_mount;
- int result = 0, ln;
- dev_t dev_t;
- struct block_device *dev;
- struct fs_info *fs_info, seek;
-
- if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) {
- abort_hibernate(TOI_FAILED_IO, "Failed to read uuid.");
- return -EIO;
- }
-
- read_if_version(3, dev_t, "uuid dev_t field", return -EIO);
-
- if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln,
- sizeof(int))) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to read last mount size.");
- return -EIO;
- }
-
- last_mount = kzalloc(ln, GFP_KERNEL);
-
- if (!last_mount)
- return -ENOMEM;
-
- if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to read last mount timestamp.");
- result = -EIO;
- goto out_lmt;
- }
-
- strncpy((char *) &seek.uuid, uuid, 16);
- seek.dev_t = dev_t;
- seek.last_mount_size = ln;
- seek.last_mount = last_mount;
- dev_t = blk_lookup_fs_info(&seek);
- if (!dev_t)
- goto out_lmt;
-
- dev = toi_open_by_devnum(dev_t);
-
- fs_info = fs_info_from_block_dev(dev);
- if (fs_info && !IS_ERR(fs_info)) {
- if (ln != fs_info->last_mount_size) {
- printk(KERN_EMERG "Found matching uuid but last mount "
- "time lengths differ?! "
- "(%d vs %d).\n", ln,
- fs_info->last_mount_size);
- result = -EINVAL;
- } else {
- char buf[BDEVNAME_SIZE];
- result = !!memcmp(fs_info->last_mount, last_mount, ln);
- if (result)
- printk(KERN_EMERG "Last mount time for %s has "
- "changed!\n", bdevname(dev, buf));
- }
- }
- toi_close_bdev(dev);
- free_fs_info(fs_info);
-out_lmt:
- kfree(last_mount);
- return result;
-}
-
-static int fs_info_load_and_check(void)
-{
- int to_do, result = 0;
-
- if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do,
- sizeof(int))) {
- abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info "
- "to load.");
- return -EIO;
- }
-
-	while (to_do--)
- result |= fs_info_load_and_check_one();
-
- return result;
-}
-
-/**
- * write_image_header - write the image header after writing the image proper
- *
- * Returns: Int
- * Zero on success, error value otherwise.
- **/
-int write_image_header(void)
-{
- int ret;
-	int total = pagedir1.size + pagedir2.size + 2;
- char *header_buffer = NULL;
-
- /* Now prepare to write the header */
- ret = toiActiveAllocator->write_header_init();
- if (ret) {
- abort_hibernate(TOI_FAILED_MODULE_INIT,
- "Active allocator's write_header_init"
- " function failed.");
- goto write_image_header_abort;
- }
-
- /* Get a buffer */
- header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP);
- if (!header_buffer) {
- abort_hibernate(TOI_OUT_OF_MEMORY,
- "Out of memory when trying to get page for header!");
- goto write_image_header_abort;
- }
-
- /* Write hibernate header */
- if (fill_toi_header((struct toi_header *) header_buffer)) {
- abort_hibernate(TOI_OUT_OF_MEMORY,
- "Failure to fill header information!");
- goto write_image_header_abort;
- }
-
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- header_buffer, sizeof(struct toi_header))) {
- abort_hibernate(TOI_OUT_OF_MEMORY,
- "Failure to write header info.");
- goto write_image_header_abort;
- }
-
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- (char *) &toi_max_workers, sizeof(toi_max_workers))) {
- abort_hibernate(TOI_OUT_OF_MEMORY,
- "Failure to number of workers to use.");
- goto write_image_header_abort;
- }
-
- /* Write filesystem info */
- if (fs_info_save())
- goto write_image_header_abort;
-
- /* Write module configurations */
- ret = write_module_configs();
- if (ret) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to write module configs.");
- goto write_image_header_abort;
- }
-
- if (memory_bm_write(pageset1_map,
- toiActiveAllocator->rw_header_chunk)) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to write bitmaps.");
- goto write_image_header_abort;
- }
-
- /* Flush data and let allocator cleanup */
- if (toiActiveAllocator->write_header_cleanup()) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to cleanup writing header.");
- goto write_image_header_abort_no_cleanup;
- }
-
- if (test_result_state(TOI_ABORTED))
- goto write_image_header_abort_no_cleanup;
-
- toi_update_status(total, total, NULL);
-
-out:
- if (header_buffer)
- toi_free_page(24, (unsigned long) header_buffer);
- return ret;
-
-write_image_header_abort:
- toiActiveAllocator->write_header_cleanup();
-write_image_header_abort_no_cleanup:
- ret = -1;
- goto out;
-}
-
-/**
- * sanity_check - check the header
- * @sh: the header which was saved at hibernate time.
- *
- * Perform a few checks, seeking to ensure that the kernel being
- * booted matches the one hibernated. They need to match so we can
- * be _sure_ things will work. It is not absolutely impossible for
- * resuming from a different kernel to work, just not assured.
- **/
-static char *sanity_check(struct toi_header *sh)
-{
- char *reason = check_image_kernel((struct swsusp_info *) sh);
-
- if (reason)
- return reason;
-
- if (!test_action_state(TOI_IGNORE_ROOTFS)) {
- const struct super_block *sb;
- list_for_each_entry(sb, &super_blocks, s_list) {
- if ((!(sb->s_flags & MS_RDONLY)) &&
- (sb->s_type->fs_flags & FS_REQUIRES_DEV))
- return "Device backed fs has been mounted "
- "rw prior to resume or initrd/ramfs "
- "is mounted rw.";
- }
- }
-
- return NULL;
-}
-
-static DECLARE_WAIT_QUEUE_HEAD(freeze_wait);
-
-#define FREEZE_IN_PROGRESS (~0)
-
-static int freeze_result;
-
-static void do_freeze(struct work_struct *dummy)
-{
- freeze_result = freeze_processes();
- wake_up(&freeze_wait);
- trap_non_toi_io = 1;
-}
-
-static DECLARE_WORK(freeze_work, do_freeze);
-
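do_freeze() and the wait queue above form a small asynchronous handshake that lets the process freeze overlap the image reads in __read_pageset1() below: the caller seeds freeze_result with the FREEZE_IN_PROGRESS sentinel, schedules the work, and only blocks once it genuinely needs the freeze to have completed. The pattern, reduced to its essentials (a sketch, not the exact call sites):

        /* requester */
        freeze_result = FREEZE_IN_PROGRESS;
        schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work);
        /* ...read pageset1 while the freeze proceeds... */
        wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);

        /* worker (do_freeze) */
        freeze_result = freeze_processes();
        wake_up(&freeze_wait);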
-/**
- * __read_pageset1 - test for the existence of an image and attempt to load it
- *
- * Returns: Int
- * Zero if an image was found and pageset1 was successfully loaded.
- * An error value if no image was found or it could not be loaded.
- **/
-static int __read_pageset1(void)
-{
- int i, result = 0;
- char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP),
- *sanity_error = NULL;
- struct toi_header *toi_header;
-
- if (!header_buffer) {
- printk(KERN_INFO "Unable to allocate a page for reading the "
- "signature.\n");
- return -ENOMEM;
- }
-
- /* Check for an image */
- result = toiActiveAllocator->image_exists(1);
- if (result == 3) {
- result = -ENODATA;
- toi_early_boot_message(1, 0, "The signature from an older "
- "version of TuxOnIce has been detected.");
- goto out_remove_image;
- }
-
- if (result != 1) {
- result = -ENODATA;
- noresume_reset_modules();
- printk(KERN_INFO "TuxOnIce: No image found.\n");
- goto out;
- }
-
- /*
- * Prepare the active allocator for reading the image header. The
-	 * active allocator might read its own configuration.
-	 *
-	 * NB: This call may never return: if there is a signature for a
-	 * different image, we warn the user and they may choose to reboot.
-	 * (The device ids may look erroneous (2.4 vs 2.6), or the image
-	 * may be unavailable, e.g. if it was stored on a network
-	 * connection.)
- */
-
- result = toiActiveAllocator->read_header_init();
- if (result) {
- printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the "
- "image header.\n");
- goto out_remove_image;
- }
-
- /* Check for noresume command line option */
- if (test_toi_state(TOI_NORESUME_SPECIFIED)) {
- printk(KERN_INFO "TuxOnIce: Noresume on command line. Removed "
- "image.\n");
- goto out_remove_image;
- }
-
- /* Check whether we've resumed before */
- if (test_toi_state(TOI_RESUMED_BEFORE)) {
- toi_early_boot_message(1, 0, NULL);
- if (!(test_toi_state(TOI_CONTINUE_REQ))) {
- printk(KERN_INFO "TuxOnIce: Tried to resume before: "
- "Invalidated image.\n");
- goto out_remove_image;
- }
- }
-
- clear_toi_state(TOI_CONTINUE_REQ);
-
- toi_image_header_version = toiActiveAllocator->get_header_version();
-
- if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) {
- toi_early_boot_message(1, 0, image_version_error);
- if (!(test_toi_state(TOI_CONTINUE_REQ))) {
- printk(KERN_INFO "TuxOnIce: Header version too new: "
- "Invalidated image.\n");
- goto out_remove_image;
- }
- }
-
- /* Read hibernate header */
- result = toiActiveAllocator->rw_header_chunk(READ, NULL,
- header_buffer, sizeof(struct toi_header));
- if (result < 0) {
- printk(KERN_ERR "TuxOnIce: Failed to read the image "
- "signature.\n");
- goto out_remove_image;
- }
-
- toi_header = (struct toi_header *) header_buffer;
-
- /*
- * NB: This call may also result in a reboot rather than returning.
- */
-
- sanity_error = sanity_check(toi_header);
- if (sanity_error) {
- toi_early_boot_message(1, TOI_CONTINUE_REQ,
- sanity_error);
- printk(KERN_INFO "TuxOnIce: Sanity check failed.\n");
- goto out_remove_image;
- }
-
- /*
- * We have an image and it looks like it will load okay.
- *
- * Get metadata from header. Don't override commandline parameters.
- *
- * We don't need to save the image size limit because it's not used
- * during resume and will be restored with the image anyway.
- */
-
- memcpy((char *) &pagedir1,
- (char *) &toi_header->pagedir, sizeof(pagedir1));
- toi_result = toi_header->param0;
- if (!toi_bkd.toi_debug_state) {
- toi_bkd.toi_action =
- (toi_header->param1 & ~toi_bootflags_mask) |
- (toi_bkd.toi_action & toi_bootflags_mask);
- toi_bkd.toi_debug_state = toi_header->param2;
- toi_bkd.toi_default_console_level = toi_header->param3;
- }
- clear_toi_state(TOI_IGNORE_LOGLEVEL);
- pagedir2.size = toi_header->pageset_2_size;
- for (i = 0; i < 4; i++)
- toi_bkd.toi_io_time[i/2][i%2] =
- toi_header->io_time[i/2][i%2];
-
- set_toi_state(TOI_BOOT_KERNEL);
- boot_kernel_data_buffer = toi_header->bkd;
-
- read_if_version(1, toi_max_workers, "TuxOnIce max workers",
- goto out_remove_image);
-
- /* Read filesystem info */
- if (fs_info_load_and_check()) {
- printk(KERN_EMERG "TuxOnIce: File system mount time checks "
- "failed. Refusing to corrupt your filesystems!\n");
- goto out_remove_image;
- }
-
- /* Read module configurations */
- result = read_module_configs();
- if (result) {
- pagedir1.size = 0;
- pagedir2.size = 0;
- printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module "
- "configurations.\n");
- clear_action_state(TOI_KEEP_IMAGE);
- goto out_remove_image;
- }
-
- toi_prepare_console();
-
- set_toi_state(TOI_NOW_RESUMING);
-
- result = pm_notifier_call_chain(PM_RESTORE_PREPARE);
- if (result)
-		goto out_notifier_call_chain;
-
- if (usermodehelper_disable())
- goto out_enable_usermodehelper;
-
- current->flags |= PF_NOFREEZE;
- freeze_result = FREEZE_IN_PROGRESS;
-
- schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work);
-
- toi_cond_pause(1, "About to read original pageset1 locations.");
-
- /*
- * See _toi_rw_header_chunk in tuxonice_bio.c:
- * Initialize pageset1_map by reading the map from the image.
- */
- if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk))
- goto out_thaw;
-
- /*
- * See toi_rw_cleanup in tuxonice_bio.c:
- * Clean up after reading the header.
- */
- result = toiActiveAllocator->read_header_cleanup();
- if (result) {
- printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the "
- "image header.\n");
- goto out_thaw;
- }
-
- toi_cond_pause(1, "About to read pagedir.");
-
- /*
- * Get the addresses of pages into which we will load the kernel to
- * be copied back and check if they conflict with the ones we are using.
- */
- if (toi_get_pageset1_load_addresses()) {
- printk(KERN_INFO "TuxOnIce: Failed to get load addresses for "
- "pageset1.\n");
- goto out_thaw;
- }
-
- /* Read the original kernel back */
- toi_cond_pause(1, "About to read pageset 1.");
-
- /* Given the pagemap, read back the data from disk */
- if (read_pageset(&pagedir1, 0)) {
- toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1.");
- result = -EIO;
- goto out_thaw;
- }
-
- toi_cond_pause(1, "About to restore original kernel.");
- result = 0;
-
- if (!toi_keeping_image &&
- toiActiveAllocator->mark_resume_attempted)
- toiActiveAllocator->mark_resume_attempted(1);
-
- wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
-out:
- current->flags &= ~PF_NOFREEZE;
- toi_free_page(25, (unsigned long) header_buffer);
- return result;
-
-out_thaw:
- wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
- trap_non_toi_io = 0;
- thaw_processes();
-out_enable_usermodehelper:
- usermodehelper_enable();
-out_notifier_call_chain:
- pm_notifier_call_chain(PM_POST_RESTORE);
- toi_cleanup_console();
-out_remove_image:
- result = -EINVAL;
- if (!toi_keeping_image)
- toiActiveAllocator->remove_image();
- toiActiveAllocator->read_header_cleanup();
- noresume_reset_modules();
- goto out;
-}
-
-/**
- * read_pageset1 - high-level function to read the saved pages
- *
- * Attempt to read the header and pageset1 of a hibernate image.
- * Handle the outcome, complaining where appropriate.
- **/
-int read_pageset1(void)
-{
- int error;
-
- error = __read_pageset1();
-
- if (error && error != -ENODATA && error != -EINVAL &&
- !test_result_state(TOI_ABORTED))
- abort_hibernate(TOI_IMAGE_ERROR,
- "TuxOnIce: Error %d resuming\n", error);
-
- return error;
-}
-
-/**
- * get_have_image_data - check the image header
- **/
-static char *get_have_image_data(void)
-{
- char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP);
- struct toi_header *toi_header;
-
- if (!output_buffer) {
- printk(KERN_INFO "Output buffer null.\n");
- return NULL;
- }
-
- /* Check for an image */
- if (!toiActiveAllocator->image_exists(1) ||
- toiActiveAllocator->read_header_init() ||
- toiActiveAllocator->rw_header_chunk(READ, NULL,
- output_buffer, sizeof(struct toi_header))) {
- sprintf(output_buffer, "0\n");
- /*
- * From an initrd/ramfs, catting have_image and
- * getting a result of 0 is sufficient.
- */
- clear_toi_state(TOI_BOOT_TIME);
- goto out;
- }
-
- toi_header = (struct toi_header *) output_buffer;
-
- sprintf(output_buffer, "1\n%s\n%s\n",
- toi_header->uts.machine,
- toi_header->uts.version);
-
- /* Check whether we've resumed before */
- if (test_toi_state(TOI_RESUMED_BEFORE))
- strcat(output_buffer, "Resumed before.\n");
-
-out:
- noresume_reset_modules();
- return output_buffer;
-}
-
-/**
- * read_pageset2 - read second part of the image
- * @overwrittenpagesonly: Read only pages which would have been
- * overwritten by pageset1?
- *
- * Read in part or all of pageset2 of an image, depending upon
- * whether we are hibernating and have only overwritten a portion
- * with pageset1 pages, or are resuming and need to read them
- * all.
- *
- * Returns: Int
- * Zero if no error, otherwise the error value.
- **/
-int read_pageset2(int overwrittenpagesonly)
-{
- int result = 0;
-
- if (!pagedir2.size)
- return 0;
-
- result = read_pageset(&pagedir2, overwrittenpagesonly);
-
- toi_cond_pause(1, "Pagedir 2 read.");
-
- return result;
-}
-
-/**
- * image_exists_read - has an image been found?
- * @page: Output buffer
- *
- * Store 0 or 1 in page, depending on whether an image is found.
- * Incoming buffer is PAGE_SIZE and result is guaranteed
- * to be far less than that, so we don't worry about
- * overflow.
- **/
-int image_exists_read(const char *page, int count)
-{
- int len = 0;
- char *result;
-
- if (toi_activate_storage(0))
- return count;
-
- if (!test_toi_state(TOI_RESUME_DEVICE_OK))
- toi_attempt_to_parse_resume_device(0);
-
- if (!toiActiveAllocator) {
- len = sprintf((char *) page, "-1\n");
- } else {
- result = get_have_image_data();
- if (result) {
- len = sprintf((char *) page, "%s", result);
- toi_free_page(26, (unsigned long) result);
- }
- }
-
- toi_deactivate_storage(0);
-
- return len;
-}
-
-/**
- * image_exists_write - invalidate an image if one exists
- **/
-int image_exists_write(const char *buffer, int count)
-{
- if (toi_activate_storage(0))
- return count;
-
- if (toiActiveAllocator && toiActiveAllocator->image_exists(1))
- toiActiveAllocator->remove_image();
-
- toi_deactivate_storage(0);
-
- clear_result_state(TOI_KEPT_IMAGE);
-
- return count;
-}
diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h
deleted file mode 100644
index 683eab7a0..000000000
--- a/kernel/power/tuxonice_io.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * kernel/power/tuxonice_io.h
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains high level IO routines for hibernating.
- *
- */
-
-#include <linux/utsname.h>
-#include "tuxonice_pagedir.h"
-
-/* Non-module data saved in our image header */
-struct toi_header {
- /*
- * Mirror struct swsusp_info, but without
- * the page aligned attribute
- */
- struct new_utsname uts;
- u32 version_code;
- unsigned long num_physpages;
- int cpus;
- unsigned long image_pages;
- unsigned long pages;
- unsigned long size;
-
- /* Our own data */
- unsigned long orig_mem_free;
- int page_size;
- int pageset_2_size;
- int param0;
- int param1;
- int param2;
- int param3;
- int progress0;
- int progress1;
- int progress2;
- int progress3;
- int io_time[2][2];
- struct pagedir pagedir;
- dev_t root_fs;
- unsigned long bkd; /* Boot kernel data locn */
-};
-
-extern int write_pageset(struct pagedir *pagedir);
-extern int write_image_header(void);
-extern int read_pageset1(void);
-extern int read_pageset2(int overwrittenpagesonly);
-
-extern int toi_attempt_to_parse_resume_device(int quiet);
-extern void attempt_to_parse_resume_device2(void);
-extern void attempt_to_parse_alt_resume_param(void);
-int image_exists_read(const char *page, int count);
-int image_exists_write(const char *buffer, int count);
-extern void save_restore_alt_param(int replace, int quiet);
-extern atomic_t toi_io_workers;
-
-/* Args to save_restore_alt_param */
-#define RESTORE 0
-#define SAVE 1
-
-#define NOQUIET 0
-#define QUIET 1
-
-extern wait_queue_head_t toi_io_queue_flusher;
-extern int toi_bio_queue_flusher_should_finish;
-
-int fs_info_space_needed(void);
-
-extern int toi_max_workers;
diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c
deleted file mode 100644
index a203c8fb9..000000000
--- a/kernel/power/tuxonice_modules.c
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- * kernel/power/tuxonice_modules.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_ui.h"
-
-LIST_HEAD(toi_filters);
-LIST_HEAD(toiAllocators);
-
-LIST_HEAD(toi_modules);
-
-struct toi_module_ops *toiActiveAllocator;
-
-static int toi_num_filters;
-int toiNumAllocators, toi_num_modules;
-
-/*
- * toi_header_storage_for_modules
- *
- * Returns the amount of space needed to store the configuration
- * data required by the modules prior to copying back the original
- * kernel. We can exclude data for pageset2 because it will be
- * available anyway once the kernel is copied back.
- */
-long toi_header_storage_for_modules(void)
-{
- struct toi_module_ops *this_module;
- int bytes = 0;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- (this_module->type == WRITER_MODULE &&
- toiActiveAllocator != this_module))
- continue;
- if (this_module->storage_needed) {
- int this = this_module->storage_needed() +
- sizeof(struct toi_module_header) +
- sizeof(int);
- this_module->header_requested = this;
- bytes += this;
- }
- }
-
- /* One more for the empty terminator */
- return bytes + sizeof(struct toi_module_header);
-}
-
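As a worked example of the accounting above (all sizes hypothetical): with two qualifying modules whose storage_needed() return 100 and 4096 bytes, and assuming sizeof(struct toi_module_header) == 56 and sizeof(int) == 4, the total is (100 + 56 + 4) + (4096 + 56 + 4) + 56 = 4372 bytes, the final 56 covering the empty terminating header.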
-void print_toi_header_storage_for_modules(void)
-{
- struct toi_module_ops *this_module;
- int bytes = 0;
-
- printk(KERN_DEBUG "Header storage:\n");
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- (this_module->type == WRITER_MODULE &&
- toiActiveAllocator != this_module))
- continue;
- if (this_module->storage_needed) {
- int this = this_module->storage_needed() +
- sizeof(struct toi_module_header) +
- sizeof(int);
- this_module->header_requested = this;
- bytes += this;
- printk(KERN_DEBUG "+ %16s : %-4d/%d.\n",
- this_module->name,
- this_module->header_used, this);
- }
- }
-
- printk(KERN_DEBUG "+ empty terminator : %zu.\n",
- sizeof(struct toi_module_header));
- printk(KERN_DEBUG " ====\n");
- printk(KERN_DEBUG " %zu\n",
- bytes + sizeof(struct toi_module_header));
-}
-
-/*
- * toi_memory_for_modules
- *
- * Returns the number of pages of memory requested by modules
- * for doing their work during the cycle.
- */
-
-long toi_memory_for_modules(int print_parts)
-{
- long bytes = 0, result;
- struct toi_module_ops *this_module;
-
- if (print_parts)
- printk(KERN_INFO "Memory for modules:\n===================\n");
- list_for_each_entry(this_module, &toi_modules, module_list) {
- int this;
- if (!this_module->enabled)
- continue;
- if (this_module->memory_needed) {
- this = this_module->memory_needed();
- if (print_parts)
- printk(KERN_INFO "%10d bytes (%5ld pages) for "
- "module '%s'.\n", this,
- DIV_ROUND_UP(this, PAGE_SIZE),
- this_module->name);
- bytes += this;
- }
- }
-
- result = DIV_ROUND_UP(bytes, PAGE_SIZE);
- if (print_parts)
- printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result);
-
- return result;
-}
-
-/*
- * toi_expected_compression_ratio
- *
- * Returns the compression ratio expected when saving the image.
- */
-
-int toi_expected_compression_ratio(void)
-{
- int ratio = 100;
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled)
- continue;
- if (this_module->expected_compression)
- ratio = ratio * this_module->expected_compression()
- / 100;
- }
-
- return ratio;
-}
-
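Because each enabled module scales the running percentage in turn, expectations compose multiplicatively, with integer truncation at every step. A worked example with hypothetical numbers:

        ratio = 100;
        ratio = ratio * 62 / 100;   /* compressor expects 62%    -> 62 */
        ratio = ratio * 45 / 100;   /* second filter expects 45% -> 27 */

so the image is expected to occupy 27% of its raw size (27.9 truncated to 27).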
-/* toi_find_module_given_dir
- * Functionality : Return a module (if found), given a pointer
- * to its directory name
- */
-
-static struct toi_module_ops *toi_find_module_given_dir(char *name)
-{
- struct toi_module_ops *this_module, *found_module = NULL;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!strcmp(name, this_module->directory)) {
- found_module = this_module;
- break;
- }
- }
-
- return found_module;
-}
-
-/* toi_find_module_given_name
- * Functionality : Return a module (if found), given a pointer
- * to its name
- */
-
-struct toi_module_ops *toi_find_module_given_name(char *name)
-{
- struct toi_module_ops *this_module, *found_module = NULL;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!strcmp(name, this_module->name)) {
- found_module = this_module;
- break;
- }
- }
-
- return found_module;
-}
-
-/*
- * toi_print_module_debug_info
- * Functionality : Get debugging info from modules into a buffer.
- */
-int toi_print_module_debug_info(char *buffer, int buffer_size)
-{
- struct toi_module_ops *this_module;
- int len = 0;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled)
- continue;
- if (this_module->print_debug_info) {
- int result;
- result = this_module->print_debug_info(buffer + len,
- buffer_size - len);
- len += result;
- }
- }
-
-	/* Ensure the buffer is null terminated */
-	buffer[buffer_size - 1] = '\0';
-
- return len;
-}
-
-/*
- * toi_register_module
- *
- * Register a module.
- */
-int toi_register_module(struct toi_module_ops *module)
-{
- int i;
- struct kobject *kobj;
-
- if (!hibernation_available())
- return -ENODEV;
-
- module->enabled = 1;
-
- if (toi_find_module_given_name(module->name)) {
- printk(KERN_INFO "TuxOnIce: Trying to load module %s,"
- " which is already registered.\n",
- module->name);
- return -EBUSY;
- }
-
- switch (module->type) {
- case FILTER_MODULE:
- list_add_tail(&module->type_list, &toi_filters);
- toi_num_filters++;
- break;
- case WRITER_MODULE:
- list_add_tail(&module->type_list, &toiAllocators);
- toiNumAllocators++;
- break;
- case MISC_MODULE:
- case MISC_HIDDEN_MODULE:
- case BIO_ALLOCATOR_MODULE:
- break;
- default:
- printk(KERN_ERR "Hmmm. Module '%s' has an invalid type."
- " It has been ignored.\n", module->name);
- return -EINVAL;
- }
- list_add_tail(&module->module_list, &toi_modules);
- toi_num_modules++;
-
- if ((!module->directory && !module->shared_directory) ||
- !module->sysfs_data || !module->num_sysfs_entries)
- return 0;
-
- /*
-	 * Modules may share a directory, but those with shared_directory
- * set must be loaded (via symbol dependencies) after parents
- * and unloaded beforehand.
- */
- if (module->shared_directory) {
- struct toi_module_ops *shared =
- toi_find_module_given_dir(module->shared_directory);
- if (!shared) {
- printk(KERN_ERR "TuxOnIce: Module %s wants to share "
- "%s's directory but %s isn't loaded.\n",
- module->name, module->shared_directory,
- module->shared_directory);
- toi_unregister_module(module);
- return -ENODEV;
- }
- kobj = shared->dir_kobj;
- } else {
- if (!strncmp(module->directory, "[ROOT]", 6))
- kobj = tuxonice_kobj;
- else
- kobj = make_toi_sysdir(module->directory);
- }
- module->dir_kobj = kobj;
- for (i = 0; i < module->num_sysfs_entries; i++) {
- int result = toi_register_sysfs_file(kobj,
- &module->sysfs_data[i]);
- if (result)
- return result;
- }
- return 0;
-}
-
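For orientation, the smallest toi_module_ops a filter could hand to toi_register_module() looks roughly like this (all names and values illustrative). With sysfs_data left NULL, the function returns right after the list bookkeeping, so no sysfs directory is created:

        static struct toi_module_ops demo_filter_ops = {
                .type      = FILTER_MODULE,
                .name      = "demo filter",     /* illustrative */
                .directory = "demo_filter",
                .module    = THIS_MODULE,
        };

        static int __init demo_filter_init(void)
        {
                return toi_register_module(&demo_filter_ops);
        }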
-/*
- * toi_unregister_module
- *
- * Remove a module.
- */
-void toi_unregister_module(struct toi_module_ops *module)
-{
- int i;
-
- if (module->dir_kobj)
- for (i = 0; i < module->num_sysfs_entries; i++)
- toi_unregister_sysfs_file(module->dir_kobj,
- &module->sysfs_data[i]);
-
- if (!module->shared_directory && module->directory &&
- strncmp(module->directory, "[ROOT]", 6))
- remove_toi_sysdir(module->dir_kobj);
-
- switch (module->type) {
- case FILTER_MODULE:
- list_del(&module->type_list);
- toi_num_filters--;
- break;
- case WRITER_MODULE:
- list_del(&module->type_list);
- toiNumAllocators--;
- if (toiActiveAllocator == module) {
- toiActiveAllocator = NULL;
- clear_toi_state(TOI_CAN_RESUME);
- clear_toi_state(TOI_CAN_HIBERNATE);
- }
- break;
- case MISC_MODULE:
- case MISC_HIDDEN_MODULE:
- case BIO_ALLOCATOR_MODULE:
- break;
- default:
- printk(KERN_ERR "Module '%s' has an invalid type."
- " It has been ignored.\n", module->name);
- return;
- }
- list_del(&module->module_list);
- toi_num_modules--;
-}
-
-/*
- * toi_move_module_tail
- *
- * Rearrange modules when reloading the config.
- */
-void toi_move_module_tail(struct toi_module_ops *module)
-{
- switch (module->type) {
- case FILTER_MODULE:
- if (toi_num_filters > 1)
- list_move_tail(&module->type_list, &toi_filters);
- break;
- case WRITER_MODULE:
- if (toiNumAllocators > 1)
- list_move_tail(&module->type_list, &toiAllocators);
- break;
- case MISC_MODULE:
- case MISC_HIDDEN_MODULE:
- case BIO_ALLOCATOR_MODULE:
- break;
- default:
- printk(KERN_ERR "Module '%s' has an invalid type."
- " It has been ignored.\n", module->name);
- return;
- }
- if ((toi_num_filters + toiNumAllocators) > 1)
- list_move_tail(&module->module_list, &toi_modules);
-}
-
-/*
- * toi_initialise_modules
- *
- * Get ready to do some work!
- */
-int toi_initialise_modules(int starting_cycle, int early)
-{
- struct toi_module_ops *this_module;
- int result;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- this_module->header_requested = 0;
- this_module->header_used = 0;
- if (!this_module->enabled)
- continue;
- if (this_module->early != early)
- continue;
- if (this_module->initialise) {
- result = this_module->initialise(starting_cycle);
- if (result) {
- toi_cleanup_modules(starting_cycle);
- return result;
- }
- this_module->initialised = 1;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_cleanup_modules
- *
- * Tell modules the work is done.
- */
-void toi_cleanup_modules(int finishing_cycle)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled || !this_module->initialised)
- continue;
- if (this_module->cleanup)
- this_module->cleanup(finishing_cycle);
- this_module->initialised = 0;
- }
-}
-
-/*
- * toi_pre_atomic_restore_modules
- *
- * Let modules prepare for the atomic restore.
- */
-void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (this_module->enabled && this_module->pre_atomic_restore)
- this_module->pre_atomic_restore(bkd);
- }
-}
-
-/*
- * toi_post_atomic_restore_modules
- *
- * Let modules tidy up after the atomic restore.
- */
-void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (this_module->enabled && this_module->post_atomic_restore)
- this_module->post_atomic_restore(bkd);
- }
-}
-
-/*
- * toi_get_next_filter
- *
- * Get the next filter in the pipeline.
- */
-struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought)
-{
- struct toi_module_ops *last_filter = NULL, *this_filter = NULL;
-
- list_for_each_entry(this_filter, &toi_filters, type_list) {
- if (!this_filter->enabled)
- continue;
- if ((last_filter == filter_sought) || (!filter_sought))
- return this_filter;
- last_filter = this_filter;
- }
-
- return toiActiveAllocator;
-}
-
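Callers walk the filter pipeline by feeding each result back in: passing NULL yields the first enabled filter, passing a filter yields the one after it, and the active allocator terminates the chain. A hedged usage sketch:

        struct toi_module_ops *stage = toi_get_next_filter(NULL);

        while (stage != toiActiveAllocator) {
                /* hand the page to 'stage' here */
                stage = toi_get_next_filter(stage);
        }
        /* 'stage' is now the allocator, the final sink for the data */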
-/**
- * toi_print_modules: Printk what support is loaded.
- */
-void toi_print_modules(void)
-{
- struct toi_module_ops *this_module;
- int prev = 0;
-
- printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for");
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (this_module->type == MISC_HIDDEN_MODULE)
- continue;
- printk("%s %s%s%s", prev ? "," : "",
- this_module->enabled ? "" : "[",
- this_module->name,
- this_module->enabled ? "" : "]");
- prev = 1;
- }
-
- printk(".\n");
-}
-
-/* toi_get_modules
- *
- * Take a reference to modules so they can't go away under us.
- */
-
-int toi_get_modules(void)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- struct toi_module_ops *this_module2;
-
- if (try_module_get(this_module->module))
- continue;
-
- /* Failed! Reverse gets and return error */
- list_for_each_entry(this_module2, &toi_modules,
- module_list) {
- if (this_module == this_module2)
- return -EINVAL;
- module_put(this_module2->module);
- }
- }
- return 0;
-}
-
-/* toi_put_modules
- *
- * Release our references to modules we used.
- */
-
-void toi_put_modules(void)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list)
- module_put(this_module->module);
-}
diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h
deleted file mode 100644
index 44f10abb9..000000000
--- a/kernel/power/tuxonice_modules.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * kernel/power/tuxonice_modules.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations for modules. Modules are additions to
- * TuxOnIce that provide facilities such as image compression or
- * encryption, backends for storage of the image and user interfaces.
- *
- */
-
-#ifndef TOI_MODULES_H
-#define TOI_MODULES_H
-
-/* This is the maximum size we store in the image header for a module name */
-#define TOI_MAX_MODULE_NAME_LENGTH 30
-
-struct toi_boot_kernel_data;
-
-/* Per-module metadata */
-struct toi_module_header {
- char name[TOI_MAX_MODULE_NAME_LENGTH];
- int enabled;
- int type;
- int index;
- int data_length;
- unsigned long signature;
-};
-
-enum {
- FILTER_MODULE,
- WRITER_MODULE,
- BIO_ALLOCATOR_MODULE,
- MISC_MODULE,
- MISC_HIDDEN_MODULE,
-};
-
-enum {
- TOI_ASYNC,
- TOI_SYNC
-};
-
-enum {
- TOI_VIRT,
- TOI_PAGE,
-};
-
-#define TOI_MAP(type, addr) \
- (type == TOI_PAGE ? kmap(addr) : addr)
-
-#define TOI_UNMAP(type, addr) \
- do { \
- if (type == TOI_PAGE) \
- kunmap(addr); \
-	} while (0)
-
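TOI_MAP()/TOI_UNMAP() let the I/O callbacks below treat both buffer flavours uniformly: TOI_VIRT buffers are already-usable virtual addresses, while TOI_PAGE buffers are struct page pointers that need a kmap()/kunmap() bracket. Typical usage, sketched (dest and buf_size are placeholders):

        void *virt = TOI_MAP(buf_type, buf);   /* kmap() only for TOI_PAGE */
        memcpy(dest, virt, buf_size);
        TOI_UNMAP(buf_type, buf);              /* kunmap() only for TOI_PAGE */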
-struct toi_module_ops {
- /* Functions common to all modules */
- int type;
- char *name;
- char *directory;
- char *shared_directory;
- struct kobject *dir_kobj;
- struct module *module;
- int enabled, early, initialised;
- struct list_head module_list;
-
- /* List of filters or allocators */
- struct list_head list, type_list;
-
- /*
- * Requirements for memory and storage in
- * the image header..
- */
- int (*memory_needed) (void);
- int (*storage_needed) (void);
-
- int header_requested, header_used;
-
- int (*expected_compression) (void);
-
- /*
- * Debug info
- */
- int (*print_debug_info) (char *buffer, int size);
- int (*save_config_info) (char *buffer);
- void (*load_config_info) (char *buffer, int len);
-
- /*
- * Initialise & cleanup - general routines called
- * at the start and end of a cycle.
- */
- int (*initialise) (int starting_cycle);
- void (*cleanup) (int finishing_cycle);
-
- void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd);
- void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd);
-
- /*
- * Calls for allocating storage (allocators only).
- *
- * Header space is requested separately and cannot fail, but the
- * reservation is only applied when main storage is allocated.
- * The header space reservation is thus always set prior to
- * requesting the allocation of storage - and prior to querying
- * how much storage is available.
- */
-
- unsigned long (*storage_available) (void);
- void (*reserve_header_space) (unsigned long space_requested);
- int (*register_storage) (void);
- int (*allocate_storage) (unsigned long space_requested);
- unsigned long (*storage_allocated) (void);
- void (*free_unused_storage) (void);
-
- /*
- * Routines used in image I/O.
- */
- int (*rw_init) (int rw, int stream_number);
- int (*rw_cleanup) (int rw);
- int (*write_page) (unsigned long index, int buf_type, void *buf,
- unsigned int buf_size);
- int (*read_page) (unsigned long *index, int buf_type, void *buf,
- unsigned int *buf_size);
- int (*io_flusher) (int rw);
-
- /* Reset module if image exists but reading aborted */
- void (*noresume_reset) (void);
-
- /* Read and write the metadata */
- int (*write_header_init) (void);
- int (*write_header_cleanup) (void);
-
- int (*read_header_init) (void);
- int (*read_header_cleanup) (void);
-
- /* To be called after read_header_init */
- int (*get_header_version) (void);
-
- int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
- char *buffer_start, int buffer_size);
-
- int (*rw_header_chunk_noreadahead) (int rw,
- struct toi_module_ops *owner, char *buffer_start,
- int buffer_size);
-
- /* Attempt to parse an image location */
- int (*parse_sig_location) (char *buffer, int only_writer, int quiet);
-
- /* Throttle I/O according to throughput */
- void (*update_throughput_throttle) (int jif_index);
-
- /* Flush outstanding I/O */
- int (*finish_all_io) (void);
-
- /* Determine whether image exists that we can restore */
- int (*image_exists) (int quiet);
-
- /* Mark the image as having tried to resume */
- int (*mark_resume_attempted) (int);
-
- /* Destroy image if one exists */
- int (*remove_image) (void);
-
- /* Sysfs Data */
- struct toi_sysfs_data *sysfs_data;
- int num_sysfs_entries;
-
- /* Block I/O allocator */
- struct toi_bio_allocator_ops *bio_allocator_ops;
-};
-
-extern int toi_num_modules, toiNumAllocators;
-
-extern struct toi_module_ops *toiActiveAllocator;
-extern struct list_head toi_filters, toiAllocators, toi_modules;
-
-extern void toi_prepare_console_modules(void);
-extern void toi_cleanup_console_modules(void);
-
-extern struct toi_module_ops *toi_find_module_given_name(char *name);
-extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *);
-
-extern int toi_register_module(struct toi_module_ops *module);
-extern void toi_move_module_tail(struct toi_module_ops *module);
-
-extern long toi_header_storage_for_modules(void);
-extern long toi_memory_for_modules(int print_parts);
-extern void print_toi_header_storage_for_modules(void);
-extern int toi_expected_compression_ratio(void);
-
-extern int toi_print_module_debug_info(char *buffer, int buffer_size);
-extern int toi_register_module(struct toi_module_ops *module);
-extern void toi_unregister_module(struct toi_module_ops *module);
-
-extern int toi_initialise_modules(int starting_cycle, int early);
-#define toi_initialise_modules_early(starting) \
- toi_initialise_modules(starting, 1)
-#define toi_initialise_modules_late(starting) \
- toi_initialise_modules(starting, 0)
-extern void toi_cleanup_modules(int finishing_cycle);
-
-extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
-extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
-
-extern void toi_print_modules(void);
-
-int toi_get_modules(void);
-void toi_put_modules(void);
-#endif
diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c
deleted file mode 100644
index 78bd31b05..000000000
--- a/kernel/power/tuxonice_netlink.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
- * kernel/power/tuxonice_netlink.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Functions for communicating with a userspace helper via netlink.
- */
-
-#include <linux/suspend.h>
-#include <linux/sched.h>
-#include <linux/kmod.h>
-#include "tuxonice_netlink.h"
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_builtin.h"
-
-static struct user_helper_data *uhd_list;
-
-/*
- * Refill our pool of SKBs for use in emergencies (e.g., when memory is
- * exhausted and none can be allocated).
- */
-static void toi_fill_skb_pool(struct user_helper_data *uhd)
-{
- while (uhd->pool_level < uhd->pool_limit) {
- struct sk_buff *new_skb =
- alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
-
- if (!new_skb)
- break;
-
- new_skb->next = uhd->emerg_skbs;
- uhd->emerg_skbs = new_skb;
- uhd->pool_level++;
- }
-}
-
-/*
- * Try to allocate a single skb. If we can't get one, try to use one from
- * our pool.
- */
-static struct sk_buff *toi_get_skb(struct user_helper_data *uhd)
-{
- struct sk_buff *skb =
- alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
-
- if (skb)
- return skb;
-
- skb = uhd->emerg_skbs;
- if (skb) {
- uhd->pool_level--;
- uhd->emerg_skbs = skb->next;
- skb->next = NULL;
- }
-
- return skb;
-}
-
-void toi_send_netlink_message(struct user_helper_data *uhd,
- int type, void *params, size_t len)
-{
- struct sk_buff *skb;
- struct nlmsghdr *nlh;
- void *dest;
- struct task_struct *t;
-
- if (uhd->pid == -1)
- return;
-
- if (uhd->debug)
- printk(KERN_ERR "toi_send_netlink_message: Send "
- "message type %d.\n", type);
-
- skb = toi_get_skb(uhd);
- if (!skb) {
- printk(KERN_INFO "toi_netlink: Can't allocate skb!\n");
- return;
- }
-
- nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0);
- uhd->sock_seq++;
-
- dest = NLMSG_DATA(nlh);
- if (params && len > 0)
- memcpy(dest, params, len);
-
- netlink_unicast(uhd->nl, skb, uhd->pid, 0);
-
- toi_read_lock_tasklist();
- t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
- if (!t) {
- toi_read_unlock_tasklist();
- if (uhd->pid > -1)
- printk(KERN_INFO "Hmm. Can't find the userspace task"
- " %d.\n", uhd->pid);
- return;
- }
- wake_up_process(t);
- toi_read_unlock_tasklist();
-
- yield();
-}
-
-static void send_whether_debugging(struct user_helper_data *uhd)
-{
- static u8 is_debugging = 1;
-
- toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING,
- &is_debugging, sizeof(u8));
-}
-
-/*
- * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we
- * are hibernating.
- */
-static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid)
-{
- struct task_struct *t;
-
- if (uhd->debug)
- printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid);
-
- toi_read_lock_tasklist();
- t = find_task_by_pid_ns(pid, &init_pid_ns);
- if (!t) {
- toi_read_unlock_tasklist();
- printk(KERN_INFO "Strange. Can't find the userspace task %d.\n",
- pid);
- return -EINVAL;
- }
-
- t->flags |= PF_NOFREEZE;
-
- toi_read_unlock_tasklist();
- uhd->pid = pid;
-
- toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0);
-
- return 0;
-}
-
-/*
- * Called when the userspace process has informed us that it's ready to roll.
- */
-static int nl_ready(struct user_helper_data *uhd, u32 version)
-{
- if (version != uhd->interface_version) {
- printk(KERN_INFO "%s userspace process using invalid interface"
- " version (%d - kernel wants %d). Trying to "
- "continue without it.\n",
- uhd->name, version, uhd->interface_version);
- if (uhd->not_ready)
- uhd->not_ready();
- return -EINVAL;
- }
-
- complete(&uhd->wait_for_process);
-
- return 0;
-}
-
-void toi_netlink_close_complete(struct user_helper_data *uhd)
-{
- if (uhd->nl) {
- netlink_kernel_release(uhd->nl);
- uhd->nl = NULL;
- }
-
- while (uhd->emerg_skbs) {
- struct sk_buff *next = uhd->emerg_skbs->next;
- kfree_skb(uhd->emerg_skbs);
- uhd->emerg_skbs = next;
- }
-
- uhd->pid = -1;
-}
-
-static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd,
- struct sk_buff *skb, struct nlmsghdr *nlh)
-{
- int type = nlh->nlmsg_type;
- int *data;
- int err;
-
- if (uhd->debug)
- printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n",
- type);
-
- /* Let the more specific handler go first. It returns
- * 1 for valid messages that it doesn't know. */
- err = uhd->rcv_msg(skb, nlh);
- if (err != 1)
- return err;
-
- /* Only allow one task to receive NOFREEZE privileges */
- if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) {
- printk(KERN_INFO "Received extra nofreeze me requests.\n");
- return -EBUSY;
- }
-
- data = NLMSG_DATA(nlh);
-
- switch (type) {
- case NETLINK_MSG_NOFREEZE_ME:
- return nl_set_nofreeze(uhd, nlh->nlmsg_pid);
- case NETLINK_MSG_GET_DEBUGGING:
- send_whether_debugging(uhd);
- return 0;
- case NETLINK_MSG_READY:
- if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) {
- printk(KERN_INFO "Invalid ready mesage.\n");
- if (uhd->not_ready)
- uhd->not_ready();
- return -EINVAL;
- }
- return nl_ready(uhd, (u32) *data);
- case NETLINK_MSG_CLEANUP:
- toi_netlink_close_complete(uhd);
- return 0;
- }
-
- return -EINVAL;
-}
-
-static void toi_user_rcv_skb(struct sk_buff *skb)
-{
- int err;
- struct nlmsghdr *nlh;
- struct user_helper_data *uhd = uhd_list;
-
- while (uhd && uhd->netlink_id != skb->sk->sk_protocol)
- uhd = uhd->next;
-
- if (!uhd)
- return;
-
- while (skb->len >= NLMSG_SPACE(0)) {
- u32 rlen;
-
- nlh = (struct nlmsghdr *) skb->data;
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return;
-
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
-
- err = toi_nl_gen_rcv_msg(uhd, skb, nlh);
- if (err)
- netlink_ack(skb, nlh, err);
- else if (nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- skb_pull(skb, rlen);
- }
-}
-
-static int netlink_prepare(struct user_helper_data *uhd)
-{
- struct netlink_kernel_cfg cfg = {
- .groups = 0,
- .input = toi_user_rcv_skb,
- };
-
- uhd->next = uhd_list;
- uhd_list = uhd;
-
- uhd->sock_seq = 0x42c0ffee;
- uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg);
- if (!uhd->nl) {
- printk(KERN_INFO "Failed to allocate netlink socket for %s.\n",
- uhd->name);
- return -ENOMEM;
- }
-
- toi_fill_skb_pool(uhd);
-
- return 0;
-}
-
-void toi_netlink_close(struct user_helper_data *uhd)
-{
- struct task_struct *t;
-
- toi_read_lock_tasklist();
- t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
- if (t)
- t->flags &= ~PF_NOFREEZE;
- toi_read_unlock_tasklist();
-
- toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0);
-}
-int toi_netlink_setup(struct user_helper_data *uhd)
-{
-	/* In case userui didn't clean up properly on us */
- toi_netlink_close_complete(uhd);
-
- if (netlink_prepare(uhd) < 0) {
- printk(KERN_INFO "Netlink prepare failed.\n");
- return 1;
- }
-
- if (toi_launch_userspace_program(uhd->program, uhd->netlink_id,
- UMH_WAIT_EXEC, uhd->debug) < 0) {
- printk(KERN_INFO "Launch userspace program failed.\n");
- toi_netlink_close_complete(uhd);
- return 1;
- }
-
- /* Wait 2 seconds for the userspace process to make contact */
- wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ);
-
- if (uhd->pid == -1) {
- printk(KERN_INFO "%s: Failed to contact userspace process.\n",
- uhd->name);
- toi_netlink_close_complete(uhd);
- return 1;
- }
-
- return 0;
-}
diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h
deleted file mode 100644
index 6613c8eaa..000000000
--- a/kernel/power/tuxonice_netlink.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * kernel/power/tuxonice_netlink.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Declarations for functions for communicating with a userspace helper
- * via netlink.
- */
-
-#include <linux/netlink.h>
-#include <net/sock.h>
-
-#define NETLINK_MSG_BASE 0x10
-
-#define NETLINK_MSG_READY 0x10
-#define NETLINK_MSG_NOFREEZE_ME 0x16
-#define NETLINK_MSG_GET_DEBUGGING 0x19
-#define NETLINK_MSG_CLEANUP 0x24
-#define NETLINK_MSG_NOFREEZE_ACK 0x27
-#define NETLINK_MSG_IS_DEBUGGING 0x28
-
-struct user_helper_data {
- int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh);
- void (*not_ready) (void);
- struct sock *nl;
- u32 sock_seq;
- pid_t pid;
- char *comm;
- char program[256];
- int pool_level;
- int pool_limit;
- struct sk_buff *emerg_skbs;
- int skb_size;
- int netlink_id;
- char *name;
- struct user_helper_data *next;
- struct completion wait_for_process;
- u32 interface_version;
- int must_init;
- int debug;
-};
-
-#ifdef CONFIG_NET
-int toi_netlink_setup(struct user_helper_data *uhd);
-void toi_netlink_close(struct user_helper_data *uhd);
-void toi_send_netlink_message(struct user_helper_data *uhd,
- int type, void *params, size_t len);
-void toi_netlink_close_complete(struct user_helper_data *uhd);
-#else
-static inline int toi_netlink_setup(struct user_helper_data *uhd)
-{
- return 0;
-}
-
-static inline void toi_netlink_close(struct user_helper_data *uhd) { }
-static inline void toi_send_netlink_message(struct user_helper_data *uhd,
-		int type, void *params, size_t len) { }
-static inline void toi_netlink_close_complete(struct user_helper_data *uhd)
-		{ }
-#endif
diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c
deleted file mode 100644
index d469f3d2d..000000000
--- a/kernel/power/tuxonice_pagedir.c
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * kernel/power/tuxonice_pagedir.c
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for handling pagesets.
- * Note that pbes aren't actually stored as such. They're stored as
- * bitmaps and extents.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/bootmem.h>
-#include <linux/hardirq.h>
-#include <linux/sched.h>
-#include <linux/cpu.h>
-#include <asm/tlbflush.h>
-
-#include "tuxonice_pageflags.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_pagedir.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_alloc.h"
-
-static int ptoi_pfn;
-static struct pbe *this_low_pbe;
-static struct pbe **last_low_pbe_ptr;
-
-void toi_reset_alt_image_pageset2_pfn(void)
-{
- memory_bm_position_reset(pageset2_map);
-}
-
-static struct page *first_conflicting_page;
-
-/*
- * free_conflicting_pages
- */
-
-static void free_conflicting_pages(void)
-{
- while (first_conflicting_page) {
- struct page *next =
- *((struct page **) kmap(first_conflicting_page));
- kunmap(first_conflicting_page);
- toi__free_page(29, first_conflicting_page);
- first_conflicting_page = next;
- }
-}
-
-/* ___toi_get_nonconflicting_page
- *
- * Description: Gets order zero pages that won't be overwritten
- * while copying the original pages.
- */
-
-struct page *___toi_get_nonconflicting_page(int can_be_highmem)
-{
- struct page *page;
- gfp_t flags = TOI_ATOMIC_GFP;
- if (can_be_highmem)
- flags |= __GFP_HIGHMEM;
-
-
- if (test_toi_state(TOI_LOADING_ALT_IMAGE) &&
- pageset2_map && ptoi_pfn) {
- do {
- ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0);
- if (ptoi_pfn != BM_END_OF_MAP) {
- page = pfn_to_page(ptoi_pfn);
- if (!PagePageset1(page) &&
- (can_be_highmem || !PageHighMem(page)))
- return page;
- }
- } while (ptoi_pfn);
- }
-
- do {
- page = toi_alloc_page(29, flags | __GFP_ZERO);
- if (!page) {
- printk(KERN_INFO "Failed to get nonconflicting "
- "page.\n");
- return NULL;
- }
- if (PagePageset1(page)) {
- struct page **next = (struct page **) kmap(page);
- *next = first_conflicting_page;
- first_conflicting_page = page;
- kunmap(page);
- }
- } while (PagePageset1(page));
-
- return page;
-}
-
-unsigned long __toi_get_nonconflicting_page(void)
-{
- struct page *page = ___toi_get_nonconflicting_page(0);
- return page ? (unsigned long) page_address(page) : 0;
-}
-
-static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe,
- int highmem)
-{
- if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1))
- + 2 * sizeof(struct pbe)) > PAGE_SIZE) {
- struct page *new_page =
- ___toi_get_nonconflicting_page(highmem);
- if (!new_page)
- return ERR_PTR(-ENOMEM);
- this_pbe = (struct pbe *) kmap(new_page);
- memset(this_pbe, 0, PAGE_SIZE);
- *page_ptr = new_page;
- } else
- this_pbe++;
-
- return this_pbe;
-}
-
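The boundary test in get_next_pbe() asks whether both the advanced pbe and one more after it would still fit within the current page; only then is this_pbe bumped in place, otherwise a fresh non-conflicting page is started. With hypothetical values sizeof(struct pbe) == 24 and PAGE_SIZE == 4096, a new page is taken once the current offset exceeds 4096 - 48 = 4048, i.e. after roughly 4096 / 24 ≈ 170 entries per page.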
-/**
- * toi_get_pageset1_load_addresses - generate pbes for conflicting pages
- *
- * We check here that the pagedir and the pages it points to won't
- * collide with the pages into which we're going to restore the
- * loaded data later.
- *
- * Returns:
- *	Zero on success, or an error code if we couldn't find enough
- *	pages (which shouldn't happen).
- **/
-int toi_get_pageset1_load_addresses(void)
-{
- int pfn, highallocd = 0, lowallocd = 0;
- int low_needed = pagedir1.size - get_highmem_size(pagedir1);
- int high_needed = get_highmem_size(pagedir1);
- int low_pages_for_highmem = 0;
- gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM;
- struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL,
- *low_pbe_page, *last_low_pbe_page = NULL;
- struct pbe **last_high_pbe_ptr = &restore_highmem_pblist,
- *this_high_pbe = NULL;
- unsigned long orig_low_pfn, orig_high_pfn;
- int high_pbes_done = 0, low_pbes_done = 0;
- int low_direct = 0, high_direct = 0, result = 0, i;
- int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0;
-
- toi_trace_index++;
-
- memory_bm_position_reset(pageset1_map);
- memory_bm_position_reset(pageset1_copy_map);
-
- last_low_pbe_ptr = &restore_pblist;
-
- /* First, allocate pages for the start of our pbe lists. */
- if (high_needed) {
- high_pbe_page = ___toi_get_nonconflicting_page(1);
- if (!high_pbe_page) {
- result = -ENOMEM;
- goto out;
- }
- this_high_pbe = (struct pbe *) kmap(high_pbe_page);
- memset(this_high_pbe, 0, PAGE_SIZE);
- }
-
- low_pbe_page = ___toi_get_nonconflicting_page(0);
- if (!low_pbe_page) {
- result = -ENOMEM;
- goto out;
- }
- this_low_pbe = (struct pbe *) page_address(low_pbe_page);
-
- /*
- * Next, allocate the number of pages we need.
- */
-
- i = low_needed + high_needed;
-
- do {
- int is_high;
-
- if (i == low_needed)
- flags &= ~__GFP_HIGHMEM;
-
- page = toi_alloc_page(30, flags);
- BUG_ON(!page);
-
- SetPagePageset1Copy(page);
- is_high = PageHighMem(page);
-
- if (PagePageset1(page)) {
- if (is_high)
- high_direct++;
- else
- low_direct++;
- } else {
- if (is_high)
- highallocd++;
- else
- lowallocd++;
- }
- } while (--i);
-
- high_needed -= high_direct;
- low_needed -= low_direct;
-
- /*
- * Do we need to use some lowmem pages for the copies of highmem
- * pages?
- */
- if (high_needed > highallocd) {
- low_pages_for_highmem = high_needed - highallocd;
- high_needed -= low_pages_for_highmem;
- low_needed += low_pages_for_highmem;
- }
-
- /*
- * Now generate our pbes (which will be used for the atomic restore),
- * and free unneeded pages.
- */
- memory_bm_position_reset(pageset1_copy_map);
- for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP;
- pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) {
- int is_high;
- page = pfn_to_page(pfn);
- is_high = PageHighMem(page);
-
- if (PagePageset1(page))
- continue;
-
- /* This page is not in pageset1, so we can use it. Add a pbe. */
- if (is_high || low_pages_for_highmem) {
- struct page *orig_page;
- high_pbes_done++;
- if (!is_high)
- low_pages_for_highmem--;
- do {
- orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0);
- BUG_ON(orig_high_pfn == BM_END_OF_MAP);
- orig_page = pfn_to_page(orig_high_pfn);
- } while (!PageHighMem(orig_page) ||
- PagePageset1Copy(orig_page));
-
- this_high_pbe->orig_address = (void *) orig_high_pfn;
- this_high_pbe->address = page;
- this_high_pbe->next = NULL;
- toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%lu)=>%p",
- high_page, high_offset, page, orig_high_pfn, orig_page);
- if (last_high_pbe_page != high_pbe_page) {
- *last_high_pbe_ptr =
- (struct pbe *) high_pbe_page;
- if (last_high_pbe_page) {
- kunmap(last_high_pbe_page);
- high_page++;
- high_offset = 0;
- } else
- high_offset++;
- last_high_pbe_page = high_pbe_page;
- } else {
- *last_high_pbe_ptr = this_high_pbe;
- high_offset++;
- }
- last_high_pbe_ptr = &this_high_pbe->next;
- this_high_pbe = get_next_pbe(&high_pbe_page,
- this_high_pbe, 1);
- if (IS_ERR(this_high_pbe)) {
- printk(KERN_INFO
- "This high pbe is an error.\n");
- return -ENOMEM;
- }
- } else {
- struct page *orig_page;
- low_pbes_done++;
- do {
- orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0);
- BUG_ON(orig_low_pfn == BM_END_OF_MAP);
- orig_page = pfn_to_page(orig_low_pfn);
- } while (PageHighMem(orig_page) ||
- PagePageset1Copy(orig_page));
-
- this_low_pbe->orig_address = page_address(orig_page);
- this_low_pbe->address = page_address(page);
- this_low_pbe->next = NULL;
- toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%lu)=>%p",
- low_page, low_offset, this_low_pbe->orig_address,
- orig_low_pfn, this_low_pbe->address);
- TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p",
- low_page, low_offset, this_low_pbe->orig_address,
- this_low_pbe->address);
- *last_low_pbe_ptr = this_low_pbe;
- last_low_pbe_ptr = &this_low_pbe->next;
- this_low_pbe = get_next_pbe(&low_pbe_page,
- this_low_pbe, 0);
- if (low_pbe_page != last_low_pbe_page) {
- if (last_low_pbe_page) {
- low_page++;
- low_offset = 0;
- } else {
- low_offset++;
- }
- last_low_pbe_page = low_pbe_page;
- } else
- low_offset++;
- if (IS_ERR(this_low_pbe)) {
- printk(KERN_INFO "this_low_pbe is an error.\n");
- return -ENOMEM;
- }
- }
- }
-
- if (high_pbe_page)
- kunmap(high_pbe_page);
-
- if (last_high_pbe_page != high_pbe_page) {
- if (last_high_pbe_page)
- kunmap(last_high_pbe_page);
- toi__free_page(29, high_pbe_page);
- }
-
- free_conflicting_pages();
-
-out:
- return result;
-}
-
-int add_boot_kernel_data_pbe(void)
-{
- this_low_pbe->address = (char *) __toi_get_nonconflicting_page();
- if (!this_low_pbe->address) {
- printk(KERN_INFO "Failed to get bkd atomic restore buffer.");
- return -ENOMEM;
- }
-
- toi_bkd.size = sizeof(toi_bkd);
- memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd));
-
- *last_low_pbe_ptr = this_low_pbe;
- this_low_pbe->orig_address = (char *) boot_kernel_data_buffer;
- this_low_pbe->next = NULL;
- return 0;
-}
diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h
deleted file mode 100644
index 046535918..000000000
--- a/kernel/power/tuxonice_pagedir.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * kernel/power/tuxonice_pagedir.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Declarations for routines for handling pagesets.
- */
-
-#ifndef KERNEL_POWER_PAGEDIR_H
-#define KERNEL_POWER_PAGEDIR_H
-
-/* Pagedir
- *
- * Contains the metadata for a set of pages saved in the image.
- */
-
-struct pagedir {
- int id;
- unsigned long size;
-#ifdef CONFIG_HIGHMEM
- unsigned long size_high;
-#endif
-};
-
-#ifdef CONFIG_HIGHMEM
-#define get_highmem_size(pagedir) (pagedir.size_high)
-#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0)
-#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0)
-#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high)
-#else
-#define get_highmem_size(pagedir) (0)
-#define set_highmem_size(pagedir, sz) do { } while (0)
-#define inc_highmem_size(pagedir) do { } while (0)
-#define get_lowmem_size(pagedir) (pagedir.size)
-#endif
-
-extern struct pagedir pagedir1, pagedir2;
-
-extern void toi_copy_pageset1(void);
-
-extern int toi_get_pageset1_load_addresses(void);
-
-extern unsigned long __toi_get_nonconflicting_page(void);
-struct page *___toi_get_nonconflicting_page(int can_be_highmem);
-
-extern void toi_reset_alt_image_pageset2_pfn(void);
-extern int add_boot_kernel_data_pbe(void);
-#endif
diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c
deleted file mode 100644
index 0fe92edd7..000000000
--- a/kernel/power/tuxonice_pageflags.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * kernel/power/tuxonice_pageflags.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for serialising and relocating pageflags in which we
- * store our image metadata.
- */
-
-#include "tuxonice_pageflags.h"
-#include "power.h"
-
-int toi_pageflags_space_needed(void)
-{
- return memory_bm_space_needed(pageset1_map);
-}
diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h
deleted file mode 100644
index ddeeaf1e7..000000000
--- a/kernel/power/tuxonice_pageflags.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * kernel/power/tuxonice_pageflags.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H
-#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H
-
-struct memory_bitmap;
-void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
-void memory_bm_clear(struct memory_bitmap *bm);
-
-int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn);
-void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
-void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
-unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index);
-void memory_bm_position_reset(struct memory_bitmap *bm);
-int toi_alloc_bitmap(struct memory_bitmap **bm);
-void toi_free_bitmap(struct memory_bitmap **bm);
-
-struct toi_module_ops;
-int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
-int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
-int memory_bm_space_needed(struct memory_bitmap *bm);
-
-extern struct memory_bitmap *pageset1_map;
-extern struct memory_bitmap *pageset1_copy_map;
-extern struct memory_bitmap *pageset2_map;
-extern struct memory_bitmap *page_resave_map;
-extern struct memory_bitmap *io_map;
-extern struct memory_bitmap *nosave_map;
-extern struct memory_bitmap *free_map;
-extern struct memory_bitmap *compare_map;
-
-#define PagePageset1(page) \
- (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPagePageset1(page) \
- (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPagePageset1(page) \
- (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PagePageset1Copy(page) \
- (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPagePageset1Copy(page) \
- (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPagePageset1Copy(page) \
- (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PagePageset2(page) \
- (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPagePageset2(page) \
- (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPagePageset2(page) \
- (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageWasRW(page) \
- (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPageWasRW(page) \
- (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageWasRW(page) \
- (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-
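-/*
- * Note that the WasRW accessors above are defined in terms of
- * pageset2_map rather than a bitmap of their own, so the WasRW and
- * Pageset2 flags share state.
- */
-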
-#define PageResave(page) (page_resave_map ? \
- memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageResave(page) \
- (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageResave(page) \
- (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageNosave(page) (nosave_map ? \
- memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageNosave(page) \
- (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageNosave(page) \
- (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageNosaveFree(page) (free_map ? \
- memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageNosaveFree(page) \
- (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageNosaveFree(page) \
- (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageCompareChanged(page) (compare_map ? \
- memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageCompareChanged(page) \
- (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageCompareChanged(page) \
- (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
-
-extern void save_pageflags(struct memory_bitmap *pagemap);
-extern int load_pageflags(struct memory_bitmap *pagemap);
-extern int toi_pageflags_space_needed(void);
-#endif
diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c
deleted file mode 100644
index 7c78773cf..000000000
--- a/kernel/power/tuxonice_power_off.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * kernel/power/tuxonice_power_off.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Support for powering down.
- */
-
-#include <linux/device.h>
-#include <linux/suspend.h>
-#include <linux/mm.h>
-#include <linux/pm.h>
-#include <linux/reboot.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
-#include <linux/fs.h>
-#include "tuxonice.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_io.h"
-
-unsigned long toi_poweroff_method; /* 0 - Kernel power off */
-
-static int wake_delay;
-static char lid_state_file[256], wake_alarm_dir[256];
-static struct file *lid_file, *alarm_file, *epoch_file;
-static int post_wake_state = -1;
-
-static int did_suspend_to_both;
-
-/*
- * __toi_power_down
- * Functionality : Powers down or reboots the computer once the image
- * has been written to disk.
- * Key Assumptions : Able to reboot/power down via code called or that
- * the warning emitted if the calls fail will be visible
- * to the user (ie printk resumes devices).
- */
-
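-/*
- * The method numbering used below (and exposed via the
- * powerdown_method sysfs entry) appears to be:
- * 0 - kernel power off;
- * 3 - suspend to RAM, powering off only if that fails;
- * 4 - platform (e.g. ACPI) power off;
- * 5 - historic entry, no longer used.
- * Other values fall through to the alternate power off path.
- */
-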
-static void __toi_power_down(int method)
-{
- int error;
-
- toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." :
- "Powering down.");
-
- if (test_result_state(TOI_ABORTED))
- goto out;
-
- if (test_action_state(TOI_REBOOT))
- kernel_restart(NULL);
-
- switch (method) {
- case 0:
- break;
- case 3:
- /*
- * Re-read the overwritten part of pageset2 to make post-resume
- * faster.
- */
- if (read_pageset2(1))
- panic("Attempt to reload pagedir 2 failed. "
- "Try rebooting.");
-
- pm_prepare_console();
-
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
- if (!error) {
- pm_restore_gfp_mask();
- error = suspend_devices_and_enter(PM_SUSPEND_MEM);
- pm_restrict_gfp_mask();
- if (!error)
- did_suspend_to_both = 1;
- }
- pm_notifier_call_chain(PM_POST_SUSPEND);
- pm_restore_console();
-
- /* Success - we're now post-resume-from-ram */
- if (did_suspend_to_both)
- return;
-
- /* Failed to suspend to ram - do normal power off */
- break;
- case 4:
- /*
- * If this succeeds, it doesn't return. If it fails, do a
- * simple powerdown.
- */
- hibernation_platform_enter();
- break;
- case 5:
- /* Historic entry only now */
- break;
- }
-
- if (method && method != 5)
- toi_cond_pause(1,
- "Falling back to alternate power off method.");
-
- if (test_result_state(TOI_ABORTED))
- goto out;
-
- if (pm_power_off)
- kernel_power_off();
- kernel_halt();
- toi_cond_pause(1, "Powerdown failed.");
- while (1)
- cpu_relax();
-
-out:
- if (read_pageset2(1))
- panic("Attempt to reload pagedir 2 failed. Try rebooting.");
- return;
-}
-
-#define CLOSE_FILE(file) \
- do { \
- if (file) { \
- filp_close(file, NULL); file = NULL; \
- } \
- } while (0)
-
-static void powerdown_cleanup(int toi_or_resume)
-{
- if (!toi_or_resume)
- return;
-
- CLOSE_FILE(lid_file);
- CLOSE_FILE(alarm_file);
- CLOSE_FILE(epoch_file);
-}
-
-static void open_file(char *format, char *arg, struct file **var, int mode,
- char *desc)
-{
- char buf[256];
-
- if (strlen(arg)) {
- snprintf(buf, sizeof(buf), format, arg);
- *var = filp_open(buf, mode, 0);
- if (IS_ERR(*var) || !*var) {
- printk(KERN_INFO "Failed to open %s file '%s' (%p).\n",
- desc, buf, *var);
- *var = NULL;
- }
- }
-}
-
-static int powerdown_init(int toi_or_resume)
-{
- if (!toi_or_resume)
- return 0;
-
- did_suspend_to_both = 0;
-
- open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file,
- O_RDONLY, "lid");
-
- if (strlen(wake_alarm_dir)) {
- open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir,
- &alarm_file, O_WRONLY, "alarm");
-
- open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir,
- &epoch_file, O_RDONLY, "epoch");
- }
-
- return 0;
-}
-
-static int lid_closed(void)
-{
- char array[25];
- ssize_t size;
- loff_t pos = 0;
-
- if (!lid_file)
- return 0;
-
- size = vfs_read(lid_file, (char __user *) array,
- sizeof(array) - 1, &pos);
- if ((int) size < 1) {
- printk(KERN_INFO "Failed to read lid state file (%d).\n",
- (int) size);
- return 0;
- }
-
- array[size] = '\0';
- if (!strcmp(array, "state: closed\n"))
- return 1;
-
- return 0;
-}
-
-static void write_alarm_file(int value)
-{
- ssize_t size;
- char buf[40];
- loff_t pos = 0;
-
- if (!alarm_file)
- return;
-
- sprintf(buf, "%d\n", value);
-
- size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos);
-
- if (size < 0)
- printk(KERN_INFO "Error %d writing alarm value %s.\n",
- (int) size, buf);
-}
-
-/**
- * toi_check_resleep: See whether to powerdown again after waking.
- *
- * After waking, check whether we should powerdown again in a (usually
- * different) way. We only do this if the lid switch is still closed.
- */
-void toi_check_resleep(void)
-{
- /* __toi_power_down only returns if we suspended to ram and woke. */
- if (lid_closed() && post_wake_state >= 0)
- __toi_power_down(post_wake_state);
-}
-
-void toi_power_down(void)
-{
- if (alarm_file && wake_delay) {
- char array[25];
- loff_t pos = 0;
- ssize_t size = vfs_read(epoch_file, (char __user *) array,
- sizeof(array) - 1, &pos);
-
- if (((int) size) < 1)
- printk(KERN_INFO "Failed to read epoch file (%d).\n",
- (int) size);
- else {
- unsigned long since_epoch;
-
- array[size] = '\0';
- if (!kstrtoul(array, 0, &since_epoch)) {
- /* Clear any wakeup time. */
- write_alarm_file(0);
-
- /* Set new wakeup time. */
- write_alarm_file(since_epoch + wake_delay);
- }
- }
- }
-
- __toi_power_down(toi_poweroff_method);
-
- toi_check_resleep();
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
-#if defined(CONFIG_ACPI)
- SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL),
- SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL),
- SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL),
- SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0,
- NULL),
- SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0),
- SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both,
- 0, 0, 0, NULL)
-#endif
-};
-
-static struct toi_module_ops powerdown_ops = {
- .type = MISC_HIDDEN_MODULE,
- .name = "poweroff",
- .initialise = powerdown_init,
- .cleanup = powerdown_cleanup,
- .directory = "[ROOT]",
- .module = THIS_MODULE,
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-int toi_poweroff_init(void)
-{
- return toi_register_module(&powerdown_ops);
-}
-
-void toi_poweroff_exit(void)
-{
- toi_unregister_module(&powerdown_ops);
-}
diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h
deleted file mode 100644
index 6e1d8bb39..000000000
--- a/kernel/power/tuxonice_power_off.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * kernel/power/tuxonice_power_off.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Support for powering down.
- */
-
-int toi_pm_state_finish(void);
-void toi_power_down(void);
-extern unsigned long toi_poweroff_method;
-int toi_poweroff_init(void);
-void toi_poweroff_exit(void);
-void toi_check_resleep(void);
-
-extern int platform_begin(int platform_mode);
-extern int platform_pre_snapshot(int platform_mode);
-extern void platform_leave(int platform_mode);
-extern void platform_end(int platform_mode);
-extern void platform_finish(int platform_mode);
-extern int platform_pre_restore(int platform_mode);
-extern void platform_restore_cleanup(int platform_mode);
diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c
deleted file mode 100644
index a10d62080..000000000
--- a/kernel/power/tuxonice_prepare_image.c
+++ /dev/null
@@ -1,1080 +0,0 @@
-/*
- * kernel/power/tuxonice_prepare_image.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * We need to eat memory until we can:
- * 1. Perform the save without changing anything (RAM_NEEDED < #pages)
- * 2. Fit it all in available space (toiActiveAllocator->available_space() >=
- * main_storage_needed())
- * 3. Reload the pagedir and pageset1 to places that don't collide with their
- * final destinations, not knowing to what extent the resumed kernel will
- * overlap with the one loaded at boot time. I think the resumed kernel
- * should overlap completely, but I don't want to rely on this as it is
- * an unproven assumption. We therefore assume there will be no overlap at
- * all (worst case).
- * 4. Meet the user's requested limit (if any) on the size of the image.
- * The limit is in MB, so pages/256 (assuming 4K pages).
- *
- */
-
-#include <linux/highmem.h>
-#include <linux/freezer.h>
-#include <linux/hardirq.h>
-#include <linux/mmzone.h>
-#include <linux/console.h>
-#include <linux/tuxonice.h>
-
-#include "tuxonice_pageflags.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_atomic_copy.h"
-#include "tuxonice_builtin.h"
-
-static unsigned long num_nosave, main_storage_allocated, storage_limit,
- header_storage_needed;
-unsigned long extra_pd1_pages_allowance =
- CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE;
-long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT;
-static int no_ps2_needed;
-
-struct attention_list {
- struct task_struct *task;
- struct attention_list *next;
-};
-
-static struct attention_list *attention_list;
-
-#define PAGESET1 0
-#define PAGESET2 1
-
-void free_attention_list(void)
-{
- struct attention_list *last = NULL;
-
- while (attention_list) {
- last = attention_list;
- attention_list = attention_list->next;
- toi_kfree(6, last, sizeof(*last));
- }
-}
-
-static int build_attention_list(void)
-{
- int i, task_count = 0;
- struct task_struct *p;
- struct attention_list *next;
-
- /*
- * Count all processes marked PF_NOFREEZE, plus the current task.
- */
- toi_read_lock_tasklist();
- for_each_process(p)
- if ((p->flags & PF_NOFREEZE) || p == current)
- task_count++;
- toi_read_unlock_tasklist();
-
- /*
- * Allocate attention list structs.
- */
- for (i = 0; i < task_count; i++) {
- struct attention_list *this =
- toi_kzalloc(6, sizeof(struct attention_list),
- TOI_WAIT_GFP);
- if (!this) {
- printk(KERN_INFO "Failed to allocate slab for "
- "attention list.\n");
- free_attention_list();
- return 1;
- }
- this->next = NULL;
- if (attention_list)
- this->next = attention_list;
- attention_list = this;
- }
-
- next = attention_list;
- toi_read_lock_tasklist();
- for_each_process(p)
- if ((p->flags & PF_NOFREEZE) || p == current) {
- next->task = p;
- next = next->next;
- }
- toi_read_unlock_tasklist();
- return 0;
-}
-
-static void pageset2_full(void)
-{
- struct zone *zone;
- struct page *page;
- unsigned long flags;
- int i;
-
- toi_trace_index++;
-
- for_each_populated_zone(zone) {
- spin_lock_irqsave(&zone->lru_lock, flags);
- for_each_lru(i) {
- if (!zone_page_state(zone, NR_LRU_BASE + i))
- continue;
-
- list_for_each_entry(page, &zone->lruvec.lists[i], lru) {
- struct address_space *mapping;
-
- mapping = page_mapping(page);
- if (!mapping || !mapping->host ||
- !(mapping->host->i_flags & S_ATOMIC_COPY)) {
- if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified.");
- } else {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full.");
- SetPagePageset2(page);
- }
- }
- }
- }
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- }
-}
-
-/*
- * toi_mark_task_as_pageset
- * Functionality : Marks all the saveable pages belonging to a given process
- * as belonging to a particular pageset.
- */
-
-static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2)
-{
- struct vm_area_struct *vma;
- struct mm_struct *mm;
-
- mm = t->active_mm;
-
- if (!mm || !mm->mmap)
- return;
-
- toi_trace_index++;
-
- if (!irqs_disabled())
- down_read(&mm->mmap_sem);
-
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- unsigned long posn;
-
- if (!vma->vm_start ||
- vma->vm_flags & VM_PFNMAP)
- continue;
-
- for (posn = vma->vm_start; posn < vma->vm_end;
- posn += PAGE_SIZE) {
- struct page *page = follow_page(vma, posn, 0);
- struct address_space *mapping;
-
- if (!page || !pfn_valid(page_to_pfn(page)))
- continue;
-
- mapping = page_mapping(page);
- if (mapping && mapping->host &&
- mapping->host->i_flags & S_ATOMIC_COPY && pageset2)
- continue;
-
- if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2);
- continue;
- }
-
- if (pageset2) {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2");
- SetPagePageset2(page);
- } else {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1");
- ClearPagePageset2(page);
- SetPagePageset1(page);
- }
- }
- }
-
- if (!irqs_disabled())
- up_read(&mm->mmap_sem);
-}
-
-static void mark_tasks(int pageset)
-{
- struct task_struct *p;
-
- toi_read_lock_tasklist();
- for_each_process(p) {
- if (!p->mm)
- continue;
-
- if (p->flags & PF_KTHREAD)
- continue;
-
- toi_mark_task_as_pageset(p, pageset);
- }
- toi_read_unlock_tasklist();
-
-}
-
-/* mark_pages_for_pageset2
- *
- * Description: Mark unshared pages in processes not needed for hibernate as
- * being able to be written out in a separate pagedir.
- * HighMem pages are simply marked as pageset2. They won't be
- * needed during hibernate.
- */
-
-static void toi_mark_pages_for_pageset2(void)
-{
- struct attention_list *this = attention_list;
-
- memory_bm_clear(pageset2_map);
-
- if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed)
- return;
-
- if (test_action_state(TOI_PAGESET2_FULL))
- pageset2_full();
- else
- mark_tasks(PAGESET2);
-
- /*
- * Because the tasks in attention_list are ones related to hibernating,
- * we know that they won't go away under us.
- */
-
- while (this) {
- if (!test_result_state(TOI_ABORTED))
- toi_mark_task_as_pageset(this->task, PAGESET1);
- this = this->next;
- }
-}
-
-/*
- * The atomic copy of pageset1 is stored in pageset2 pages.
- * But if pageset1 is larger (normally only just after boot),
- * we need to allocate extra pages to store the atomic copy.
- * The following data struct and functions are used to handle
- * the allocation and freeing of that memory.
- */
-
-static unsigned long extra_pages_allocated;
-
-struct extras {
- struct page *page;
- int order;
- struct extras *next;
-};
-
-static struct extras *extras_list;
-
-/* toi_free_extra_pagedir_memory
- *
- * Description: Free previously allocated extra pagedir memory.
- */
-void toi_free_extra_pagedir_memory(void)
-{
- /* Free allocated pages */
- while (extras_list) {
- struct extras *this = extras_list;
- int i;
-
- extras_list = this->next;
-
- for (i = 0; i < (1 << this->order); i++)
- ClearPageNosave(this->page + i);
-
- toi_free_pages(9, this->page, this->order);
- toi_kfree(7, this, sizeof(*this));
- }
-
- extra_pages_allocated = 0;
-}
-
-/* toi_allocate_extra_pagedir_memory
- *
- * Description: Allocate memory for making the atomic copy of pagedir1 in the
- * case where it is bigger than pagedir2.
- * Arguments: int extra_pages_needed: Total number of extra pages needed.
- * Result: int. Number of extra pages we now have allocated.
- */
-static int toi_allocate_extra_pagedir_memory(int extra_pages_needed)
-{
- int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated;
- gfp_t flags = TOI_ATOMIC_GFP;
-
- if (num_to_alloc < 1)
- return 0;
-
- order = fls(num_to_alloc);
- if (order >= MAX_ORDER)
- order = MAX_ORDER - 1;
-
- while (num_to_alloc) {
- struct page *newpage;
- unsigned long virt;
- struct extras *extras_entry;
-
- while ((1 << order) > num_to_alloc)
- order--;
-
- extras_entry = (struct extras *) toi_kzalloc(7,
- sizeof(struct extras), TOI_ATOMIC_GFP);
-
- if (!extras_entry)
- return extra_pages_allocated;
-
- virt = toi_get_free_pages(9, flags, order);
- while (!virt && order) {
- order--;
- virt = toi_get_free_pages(9, flags, order);
- }
-
- if (!virt) {
- toi_kfree(7, extras_entry, sizeof(*extras_entry));
- return extra_pages_allocated;
- }
-
- newpage = virt_to_page(virt);
-
- extras_entry->page = newpage;
- extras_entry->order = order;
- extras_entry->next = extras_list;
-
- extras_list = extras_entry;
-
- for (j = 0; j < (1 << order); j++) {
- SetPageNosave(newpage + j);
- SetPagePageset1Copy(newpage + j);
- }
-
- extra_pages_allocated += (1 << order);
- num_to_alloc -= (1 << order);
- }
-
- return extra_pages_allocated;
-}
-
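-/*
- * Illustrative walk-through of the order arithmetic above: a request
- * for 13 extra pages gives order = fls(13) = 4; the inner loop trims
- * that to order 3 (8 pages), a second pass allocates order 2 (4 pages)
- * and a final pass order 0 (1 page), satisfying the request with three
- * allocations instead of thirteen.
- */
-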
-/*
- * real_nr_free_pages: Count free pages, including pcp pages, for the
- * zones selected by zone_idx_mask (pass all_zones_mask for all zones,
- * otherwise an OR of 1 << zone_idx() bits).
- */
-unsigned long real_nr_free_pages(unsigned long zone_idx_mask)
-{
- struct zone *zone;
- int result = 0, cpu;
-
- /* PCP lists */
- for_each_populated_zone(zone) {
- if (!(zone_idx_mask & (1 << zone_idx(zone))))
- continue;
-
- for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pset =
- per_cpu_ptr(zone->pageset, cpu);
- struct per_cpu_pages *pcp = &pset->pcp;
- result += pcp->count;
- }
-
- result += zone_page_state(zone, NR_FREE_PAGES);
- }
- return result;
-}
-
-/*
- * Discover how much extra memory will be required by the drivers
- * when they're asked to hibernate. We can then ensure that amount
- * of memory is available when we really want it.
- */
-static void get_extra_pd1_allowance(void)
-{
- unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final;
-
- toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers.");
-
- if (toi_go_atomic(PMSG_FREEZE, 1))
- return;
-
- final = real_nr_free_pages(all_zones_mask);
- toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0);
-
- extra_pd1_pages_allowance = (orig_num_free > final) ?
- orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE :
- MIN_EXTRA_PAGES_ALLOWANCE;
-}
-
-/*
- * Amount of storage needed, possibly taking into account the
- * expected compression ratio and possibly also ignoring our
- * allowance for extra pages.
- */
-static unsigned long main_storage_needed(int use_ecr,
- int ignore_extra_pd1_allow)
-{
- return (pagedir1.size + pagedir2.size +
- (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) *
- (use_ecr ? toi_expected_compression_ratio() : 100) / 100;
-}
-
-/*
- * Storage needed for the image header, calculated in bytes and
- * converted to a number of pages on return.
- */
-unsigned long get_header_storage_needed(void)
-{
- unsigned long bytes = sizeof(struct toi_header) +
- toi_header_storage_for_modules() +
- toi_pageflags_space_needed() +
- fs_info_space_needed();
-
- return DIV_ROUND_UP(bytes, PAGE_SIZE);
-}
-
-/*
- * When freeing memory, pages from either pageset might be freed.
- *
- * When seeking to free memory to be able to hibernate, for every ps1 page
- * freed, we need 2 less pages for the atomic copy because there is one less
- * page to copy and one more page into which data can be copied.
- *
- * Freeing ps2 pages saves us nothing directly. No more memory is available
- * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but
- * that's too much work to figure out.
- *
- * => ps1_to_free functions
- *
- * Of course if we just want to reduce the image size, because of storage
- * limitations or an image size limit either ps will do.
- *
- * => any_to_free function
- */
-
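-/*
- * A concrete instance of the "2 less pages" rule above: if pageset1
- * needs 1000 pages of copy space and only 900 are available, freeing
- * DIV_ROUND_UP(100, 2) = 50 pageset1 pages closes the gap, since each
- * freed page both shrinks what must be copied by one and yields one
- * more page to copy into (see the ps1_to_free helpers below).
- */
-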
-static unsigned long lowpages_usable_for_highmem_copy(void)
-{
- unsigned long needed = get_lowmem_size(pagedir1) +
- extra_pd1_pages_allowance + MIN_FREE_RAM +
- toi_memory_for_modules(0),
- available = get_lowmem_size(pagedir2) +
- real_nr_free_low_pages() + extra_pages_allocated;
-
- return available > needed ? available - needed : 0;
-}
-
-static unsigned long highpages_ps1_to_free(void)
-{
- unsigned long need = get_highmem_size(pagedir1),
- available = get_highmem_size(pagedir2) +
- real_nr_free_high_pages() +
- lowpages_usable_for_highmem_copy();
-
- return need > available ? DIV_ROUND_UP(need - available, 2) : 0;
-}
-
-static unsigned long lowpages_ps1_to_free(void)
-{
- unsigned long needed = get_lowmem_size(pagedir1) +
- extra_pd1_pages_allowance + MIN_FREE_RAM +
- toi_memory_for_modules(0),
- available = get_lowmem_size(pagedir2) +
- real_nr_free_low_pages() + extra_pages_allocated;
-
- return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0;
-}
-
-static unsigned long current_image_size(void)
-{
- return pagedir1.size + pagedir2.size + header_storage_needed;
-}
-
-static unsigned long storage_still_required(void)
-{
- unsigned long needed = main_storage_needed(1, 1);
- return needed > storage_limit ? needed - storage_limit : 0;
-}
-
-static unsigned long ram_still_required(void)
-{
- unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) +
- 2 * extra_pd1_pages_allowance,
- available = real_nr_free_low_pages() + extra_pages_allocated;
- return needed > available ? needed - available : 0;
-}
-
-unsigned long any_to_free(int use_image_size_limit)
-{
- int use_soft_limit = use_image_size_limit && image_size_limit > 0;
- unsigned long current_size = current_image_size(),
- soft_limit = use_soft_limit ? (image_size_limit << 8) : 0,
- to_free = use_soft_limit ? (current_size > soft_limit ?
- current_size - soft_limit : 0) : 0,
- storage_limit = storage_still_required(),
- ram_limit = ram_still_required(),
- first_max = max(to_free, storage_limit);
-
- return max(first_max, ram_limit);
-}
-
-static int need_pageset2(void)
-{
- return (real_nr_free_low_pages() + extra_pages_allocated -
- 2 * extra_pd1_pages_allowance - MIN_FREE_RAM -
- toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size;
-}
-
-/* amount_needed
- *
- * Calculates the amount by which the image size needs to be reduced to meet
- * our constraints.
- */
-static unsigned long amount_needed(int use_image_size_limit)
-{
- return max(highpages_ps1_to_free() + lowpages_ps1_to_free(),
- any_to_free(use_image_size_limit));
-}
-
-static int image_not_ready(int use_image_size_limit)
-{
- toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
- "Amount still needed (%lu) > 0:%u,"
- " Storage allocd: %lu < %lu: %u.\n",
- amount_needed(use_image_size_limit),
- (amount_needed(use_image_size_limit) > 0),
- main_storage_allocated,
- main_storage_needed(1, 1),
- main_storage_allocated < main_storage_needed(1, 1));
-
- toi_cond_pause(0, NULL);
-
- return (amount_needed(use_image_size_limit) > 0) ||
- main_storage_allocated < main_storage_needed(1, 1);
-}
-
-static void display_failure_reason(int tries_exceeded)
-{
- unsigned long storage_required = storage_still_required(),
- ram_required = ram_still_required(),
- high_ps1 = highpages_ps1_to_free(),
- low_ps1 = lowpages_ps1_to_free();
-
- printk(KERN_INFO "Failed to prepare the image because...\n");
-
- if (!storage_limit) {
- printk(KERN_INFO "- You need some storage available to be "
- "able to hibernate.\n");
- return;
- }
-
- if (tries_exceeded)
- printk(KERN_INFO "- The maximum number of iterations was "
- "reached without successfully preparing the "
- "image.\n");
-
- if (storage_required) {
- printk(KERN_INFO " - We need at least %lu pages of storage "
- "(ignoring the header), but only have %lu.\n",
- main_storage_needed(1, 1),
- main_storage_allocated);
- set_abort_result(TOI_INSUFFICIENT_STORAGE);
- }
-
- if (ram_required) {
- printk(KERN_INFO " - We need %lu more free pages of low "
- "memory.\n", ram_required);
- printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM);
- printk(KERN_INFO " + Reqd. by modules : %8lu\n",
- toi_memory_for_modules(0));
- printk(KERN_INFO " + 2 * extra allow : %8lu\n",
- 2 * extra_pd1_pages_allowance);
- printk(KERN_INFO " - Currently free : %8lu\n",
- real_nr_free_low_pages());
- printk(KERN_INFO " - Pages allocd : %8lu\n",
- extra_pages_allocated);
- printk(KERN_INFO " : ========\n");
- printk(KERN_INFO " Still needed : %8lu\n",
- ram_required);
-
- /* Print breakdown of memory needed for modules */
- toi_memory_for_modules(1);
- set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
- }
-
- if (high_ps1) {
- printk(KERN_INFO "- We need to free %lu highmem pageset 1 "
- "pages.\n", high_ps1);
- set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
- }
-
- if (low_ps1) {
- printk(KERN_INFO " - We need to free %ld lowmem pageset 1 "
- "pages.\n", low_ps1);
- set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
- }
-}
-
-static void display_stats(int always, int sub_extra_pd1_allow)
-{
- char buffer[255];
- snprintf(buffer, 254,
- "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). "
- "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). "
- "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n",
-
- /* Free */
- real_nr_free_pages(all_zones_mask),
- real_nr_free_low_pages(),
-
- /* Sets */
- pagedir1.size, pagedir1.size - get_highmem_size(pagedir1),
- pagedir2.size, pagedir2.size - get_highmem_size(pagedir2),
-
- /* Nosave */
- num_nosave, extra_pages_allocated,
- num_nosave - extra_pages_allocated,
-
- /* Storage */
- main_storage_allocated,
- storage_limit,
- main_storage_needed(1, sub_extra_pd1_allow),
- main_storage_needed(1, 1),
-
- /* Needed */
- lowpages_ps1_to_free(), highpages_ps1_to_free(),
- any_to_free(1),
- MIN_FREE_RAM, toi_memory_for_modules(0),
- extra_pd1_pages_allowance,
- image_size_limit,
-
- need_pageset2() ? "yes" : "no");
-
- if (always)
- printk("%s", buffer);
- else
- toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer);
-}
-
-/* flag_image_pages
- *
- * This routine generates our lists of pages to be stored in each
- * pageset. Since we store the data using extents, and adding new
- * extents might allocate a new extent page, this routine may well
- * be called more than once.
- */
-static void flag_image_pages(int atomic_copy)
-{
- int num_free = 0, num_unmodified = 0;
- unsigned long loop;
- struct zone *zone;
-
- pagedir1.size = 0;
- pagedir2.size = 0;
-
- set_highmem_size(pagedir1, 0);
- set_highmem_size(pagedir2, 0);
-
- num_nosave = 0;
- toi_trace_index++;
-
- memory_bm_clear(pageset1_map);
-
- toi_generate_free_page_map();
-
- /*
- * Pages not to be saved are marked Nosave irrespective of being
- * reserved.
- */
- for_each_populated_zone(zone) {
- int highmem = is_highmem(zone);
-
- for (loop = 0; loop < zone->spanned_pages; loop++) {
- unsigned long pfn = zone->zone_start_pfn + loop;
- struct page *page;
- int chunk_size;
-
- if (!pfn_valid(pfn)) {
- TOI_TRACE_DEBUG(pfn, "_Flag Invalid");
- continue;
- }
-
- chunk_size = toi_size_of_free_region(zone, pfn);
- if (chunk_size) {
- unsigned long y;
- for (y = pfn; y < pfn + chunk_size; y++) {
- page = pfn_to_page(y);
- TOI_TRACE_DEBUG(y, "_Flag Free");
- ClearPagePageset1(page);
- ClearPagePageset2(page);
- }
- num_free += chunk_size;
- loop += chunk_size - 1;
- continue;
- }
-
- page = pfn_to_page(pfn);
-
- if (PageNosave(page)) {
- char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave";
- TOI_TRACE_DEBUG(pfn, "_Flag %s", desc);
- num_nosave++;
- continue;
- }
-
- page = highmem ? saveable_highmem_page(zone, pfn) :
- saveable_page(zone, pfn);
-
- if (!page) {
- TOI_TRACE_DEBUG(pfn, "_Flag Nosave2");
- num_nosave++;
- continue;
- }
-
- if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
- TOI_TRACE_DEBUG(pfn, "_Unmodified");
- num_unmodified++;
- continue;
- }
-
- if (PagePageset2(page)) {
- pagedir2.size++;
- TOI_TRACE_DEBUG(pfn, "_Flag PS2");
- if (PageHighMem(page))
- inc_highmem_size(pagedir2);
- else
- SetPagePageset1Copy(page);
- if (PageResave(page)) {
- SetPagePageset1(page);
- ClearPagePageset1Copy(page);
- pagedir1.size++;
- if (PageHighMem(page))
- inc_highmem_size(pagedir1);
- }
- } else {
- pagedir1.size++;
- TOI_TRACE_DEBUG(pfn, "_Flag PS1");
- SetPagePageset1(page);
- if (PageHighMem(page))
- inc_highmem_size(pagedir1);
- }
- }
- }
-
- if (!atomic_copy)
- toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0,
- "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)"
- " + Unmodified (%d) + NumFree (%d) = %d.\n",
- pagedir1.size, pagedir2.size, num_nosave, num_unmodified,
- num_free, pagedir1.size + pagedir2.size + num_nosave + num_free);
-}
-
-void toi_recalculate_image_contents(int atomic_copy)
-{
- memory_bm_clear(pageset1_map);
- if (!atomic_copy) {
- unsigned long pfn;
- memory_bm_position_reset(pageset2_map);
- for (pfn = memory_bm_next_pfn(pageset2_map, 0);
- pfn != BM_END_OF_MAP;
- pfn = memory_bm_next_pfn(pageset2_map, 0))
- ClearPagePageset1Copy(pfn_to_page(pfn));
- /* Need to call this before getting pageset1_size! */
- toi_mark_pages_for_pageset2();
- }
- memory_bm_position_reset(pageset2_map);
- flag_image_pages(atomic_copy);
-
- if (!atomic_copy) {
- storage_limit = toiActiveAllocator->storage_available();
- display_stats(0, 0);
- }
-}
-
-int try_allocate_extra_memory(void)
-{
- unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance -
- get_lowmem_size(pagedir2);
- if (wanted > extra_pages_allocated) {
- unsigned long got = toi_allocate_extra_pagedir_memory(wanted);
- if (got < wanted) {
- toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
- "Want %lu extra pages for pageset1, got %lu.\n",
- wanted, got);
- return 1;
- }
- }
- return 0;
-}
-
-/* update_image
- *
- * Allocate [more] memory and storage for the image.
- */
-static void update_image(int ps2_recalc)
-{
- int old_header_req;
- unsigned long seek;
-
- if (try_allocate_extra_memory())
- return;
-
- if (ps2_recalc)
- goto recalc;
-
- thaw_kernel_threads();
-
- /*
- * Allocate remaining storage space, if possible, up to the
- * maximum we know we'll need. It's okay to allocate the
- * maximum if the writer is the swapwriter, but
- * we don't want to grab all available space on an NFS share.
- * We therefore ignore the expected compression ratio here,
- * thereby trying to allocate the maximum image size we could
- * need (assuming compression doesn't expand the image), but
- * don't complain if we can't get the full amount we're after.
- */
-
- do {
- int result;
-
- old_header_req = header_storage_needed;
- toiActiveAllocator->reserve_header_space(header_storage_needed);
-
- /* How much storage is free with the reservation applied? */
- storage_limit = toiActiveAllocator->storage_available();
- seek = min(storage_limit, main_storage_needed(0, 0));
-
- result = toiActiveAllocator->allocate_storage(seek);
- if (result)
- printk("Failed to allocate storage (%d).\n", result);
-
- main_storage_allocated =
- toiActiveAllocator->storage_allocated();
-
- /* Need more header because more storage allocated? */
- header_storage_needed = get_header_storage_needed();
-
- } while (header_storage_needed > old_header_req);
-
- if (freeze_kernel_threads())
- set_abort_result(TOI_FREEZING_FAILED);
-
-recalc:
- toi_recalculate_image_contents(0);
-}
-
-/* attempt_to_freeze
- *
- * Try to freeze processes.
- */
-
-static int attempt_to_freeze(void)
-{
- int result;
-
- /* Stop processes before checking again */
- toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
- "filesystems.");
- result = freeze_processes();
-
- if (result) {
- set_abort_result(TOI_FREEZING_FAILED);
- return result;
- }
-
- result = freeze_kernel_threads();
-
- if (result)
- set_abort_result(TOI_FREEZING_FAILED);
-
- return result;
-}
-
-/* eat_memory
- *
- * Try to free some memory, either to meet hard or soft constraints on the image
- * characteristics.
- *
- * Hard constraints:
- * - Pageset1 must be < half of memory;
- * - We must have enough memory free at resume time to have pageset1
- * be able to be loaded in pages that don't conflict with where it has to
- * be restored.
- * Soft constraints
- * - User-specified image size limit.
- */
-static void eat_memory(void)
-{
- unsigned long amount_wanted = 0;
- int did_eat_memory = 0;
-
- /*
- * Note that if we have enough storage space and enough free memory, we
- * may exit without eating anything. We give up when the last 10
- * iterations ate no extra pages because we're not going to get much
- * more anyway, but the few pages we get will take a lot of time.
- *
- * We freeze processes before beginning, and then unfreeze them if we
- * need to eat memory until we think we have enough. If our attempts
- * to freeze fail, we give up and abort.
- */
-
- amount_wanted = amount_needed(1);
-
- switch (image_size_limit) {
- case -1: /* Don't eat any memory */
- if (amount_wanted > 0) {
- set_abort_result(TOI_WOULD_EAT_MEMORY);
- return;
- }
- break;
- case -2: /* Free caches only */
- drop_pagecache();
- toi_recalculate_image_contents(0);
- amount_wanted = amount_needed(1);
- break;
- default:
- break;
- }
-
- if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) &&
- image_size_limit != -1) {
- unsigned long request = amount_wanted;
- unsigned long high_req = max(highpages_ps1_to_free(),
- any_to_free(1));
- unsigned long low_req = lowpages_ps1_to_free();
- unsigned long got = 0;
-
- toi_prepare_status(CLEAR_BAR,
- "Seeking to free %ldMB of memory.",
- MB(amount_wanted));
-
- thaw_kernel_threads();
-
- /*
- * Ask for too many because shrink_memory_mask doesn't
- * currently return enough most of the time.
- */
-
- if (low_req)
- got = shrink_memory_mask(low_req, GFP_KERNEL);
- if (high_req)
- shrink_memory_mask(high_req - got, GFP_HIGHUSER);
-
- did_eat_memory = 1;
-
- toi_recalculate_image_contents(0);
-
- amount_wanted = amount_needed(1);
-
- printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &"
- " %ld pages from anywhere, got %ld.\n",
- high_req, low_req,
- request - amount_wanted);
-
- toi_cond_pause(0, NULL);
-
- if (freeze_kernel_threads())
- set_abort_result(TOI_FREEZING_FAILED);
- }
-
- if (did_eat_memory)
- toi_recalculate_image_contents(0);
-}
-
-/* toi_prepare_image
- *
- * Entry point to the whole image preparation section.
- *
- * We do four things:
- * - Freeze processes;
- * - Ensure image size constraints are met;
- * - Complete all the preparation for saving the image,
- * including allocation of storage. The only memory
- * that should be needed when we're finished is that
- * for actually storing the image (and we know how
- * much is needed for that because the modules tell
- * us).
- * - Make sure that all dirty buffers are written out.
- */
-#define MAX_TRIES 2
-int toi_prepare_image(void)
-{
- int result = 1, tries = 1;
-
- main_storage_allocated = 0;
- no_ps2_needed = 0;
-
- if (attempt_to_freeze())
- return 1;
-
- lock_device_hotplug();
- set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
-
- if (!extra_pd1_pages_allowance)
- get_extra_pd1_allowance();
-
- storage_limit = toiActiveAllocator->storage_available();
-
- if (!storage_limit) {
- printk(KERN_INFO "No storage available. Didn't try to prepare "
- "an image.\n");
- display_failure_reason(0);
- set_abort_result(TOI_NOSTORAGE_AVAILABLE);
- return 1;
- }
-
- if (build_attention_list()) {
- abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
- "Unable to successfully prepare the image.\n");
- return 1;
- }
-
- toi_recalculate_image_contents(0);
-
- do {
- toi_prepare_status(CLEAR_BAR,
- "Preparing Image. Try %d.", tries);
-
- eat_memory();
-
- if (test_result_state(TOI_ABORTED))
- break;
-
- update_image(0);
-
- tries++;
-
- } while (image_not_ready(1) && tries <= MAX_TRIES &&
- !test_result_state(TOI_ABORTED));
-
- result = image_not_ready(0);
-
- /* TODO: Handle case where need to remove existing image and resave
- * instead of adding to incremental image. */
-
- if (!test_result_state(TOI_ABORTED)) {
- if (result) {
- display_stats(1, 0);
- display_failure_reason(tries > MAX_TRIES);
- abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
- "Unable to successfully prepare the image.\n");
- } else {
- /* Pageset 2 needed? */
- if (!need_pageset2() &&
- test_action_state(TOI_NO_PS2_IF_UNNEEDED)) {
- no_ps2_needed = 1;
- toi_recalculate_image_contents(0);
- update_image(1);
- }
-
- toi_cond_pause(1, "Image preparation complete.");
- }
- }
-
- return result ? result : allocate_checksum_pages();
-}
diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h
deleted file mode 100644
index c1508975c..000000000
--- a/kernel/power/tuxonice_prepare_image.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * kernel/power/tuxonice_prepare_image.h
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <asm/sections.h>
-
-extern int toi_prepare_image(void);
-extern void toi_recalculate_image_contents(int atomic_copy);
-extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask);
-extern long image_size_limit;
-extern void toi_free_extra_pagedir_memory(void);
-extern unsigned long extra_pd1_pages_allowance;
-extern void free_attention_list(void);
-
-#define MIN_FREE_RAM 100
-#define MIN_EXTRA_PAGES_ALLOWANCE 500
-
-#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1))
-#ifdef CONFIG_HIGHMEM
-#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM))
-#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \
- (1 << ZONE_HIGHMEM)))
-#else
-#define real_nr_free_high_pages() (0)
-#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask))
-
-/* For eat_memory function */
-#define ZONE_HIGHMEM (MAX_NR_ZONES + 1)
-#endif
-
-unsigned long get_header_storage_needed(void);
-unsigned long any_to_free(int use_image_size_limit);
-int try_allocate_extra_memory(void);
diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c
deleted file mode 100644
index 5bc56d3a1..000000000
--- a/kernel/power/tuxonice_prune.c
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * kernel/power/tuxonice_prune.c
- *
- * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file implements a TuxOnIce module that seeks to prune the
- * amount of data written to disk. It builds a table of hashes
- * of the uncompressed data, and writes the pfn of the previous page
- * with the same contents instead of repeating the data when a match
- * is found.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-#include <linux/crypto.h>
-#include <linux/scatterlist.h>
-#include <crypto/hash.h>
-
-#include "tuxonice_builtin.h"
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-
-/*
- * We never write a page bigger than PAGE_SIZE, so use a large number
- * to indicate that data is a PFN.
- */
-#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100)
-
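-/*
- * A deduplicating write would then put the pfn of the earlier copy in
- * the buffer and pass PRUNE_DATA_IS_PFN as the length - a sketch, as
- * this step is not wired up in the write path below:
- *
- * *(unsigned long *) buffer = previous_pfn;
- * ret = next_driver->write_page(index, TOI_PAGE, buffer,
- *         PRUNE_DATA_IS_PFN);
- */
-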
-static unsigned long toi_pruned_pages;
-
-static struct toi_module_ops toi_prune_ops;
-static struct toi_module_ops *next_driver;
-
-static char toi_prune_hash_algo_name[32] = "sha1";
-
-static DEFINE_MUTEX(stats_lock);
-
-struct cpu_context {
- struct shash_desc desc;
- char *digest;
-};
-
-#define OUT_BUF_SIZE (2 * PAGE_SIZE)
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-
-/*
- * toi_crypto_prepare
- *
- * Prepare to do some work by allocating buffers and transforms.
- */
-static int toi_prune_crypto_prepare(void)
-{
- int cpu, ret, digestsize = 0;
-
- if (!*toi_prune_hash_algo_name) {
- printk(KERN_INFO "TuxOnIce: Pruning enabled but no "
- "hash algorithm set.\n");
- return 1;
- }
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0);
- if (IS_ERR(this->desc.tfm)) {
- printk(KERN_INFO "TuxOnIce: Failed to allocate the "
- "%s prune hash algorithm.\n",
- toi_prune_hash_algo_name);
- this->desc.tfm = NULL;
- return 1;
- }
-
- if (!digestsize)
- digestsize = crypto_shash_digestsize(this->desc.tfm);
-
- this->digest = kmalloc(digestsize, GFP_KERNEL);
- if (!this->digest) {
- printk(KERN_INFO "TuxOnIce: Failed to allocate space "
- "for digest output.\n");
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- return 1;
- }
-
- this->desc.flags = 0;
-
- ret = crypto_shash_init(&this->desc);
- if (ret < 0) {
- printk(KERN_INFO "TuxOnIce: Failed to initialise the "
- "%s prune hash algorithm.\n",
- toi_prune_hash_algo_name);
- kfree(this->digest);
- this->digest = NULL;
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- return 1;
- }
- }
-
- return 0;
-}
-
-static int toi_prune_rw_cleanup(int writing)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- if (this->desc.tfm) {
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- }
-
- if (this->digest) {
- kfree(this->digest);
- this->digest = NULL;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_prune_init
- */
-
-static int toi_prune_init(int toi_or_resume)
-{
- if (!toi_or_resume)
- return 0;
-
- toi_pruned_pages = 0;
-
- next_driver = toi_get_next_filter(&toi_prune_ops);
-
- return next_driver ? 0 : -ECHILD;
-}
-
-/*
- * toi_prune_rw_init()
- */
-
-static int toi_prune_rw_init(int rw, int stream_number)
-{
- if (toi_prune_crypto_prepare()) {
- printk(KERN_ERR "Failed to initialise prune "
- "algorithm.\n");
- if (rw == READ) {
- printk(KERN_INFO "Unable to read the image.\n");
- return -ENODEV;
- } else {
- printk(KERN_INFO "Continuing without "
- "pruning the image.\n");
- toi_prune_ops.enabled = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_prune_write_page()
- *
- * Hash a page of data so duplicates can be recognised, then pass the
- * page on to the next module in the pipeline.
- *
- * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing
- * data to be checked.
- *
- * Returns: 0 on success. Otherwise the error is that returned by
- * later modules.
- */
-static int toi_prune_write_page(unsigned long index, int buf_type,
- void *buffer_page, unsigned int buf_size)
-{
- int ret = 0, cpu = smp_processor_id();
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
- u8 *output_buffer = buffer_page;
- int output_len = buf_size;
- int out_buf_type = buf_type;
- void *buffer_start;
-
- if (ctx->desc.tfm) {
- buffer_start = TOI_MAP(buf_type, buffer_page);
-
- ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size,
- ctx->digest);
- if (ret) {
- printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret);
- } else {
- mutex_lock(&stats_lock);
- toi_pruned_pages++;
- mutex_unlock(&stats_lock);
- }
-
- TOI_UNMAP(buf_type, buffer_page);
- }
-
- return next_driver->write_page(index, out_buf_type,
- output_buffer, output_len);
-}
-
-/*
- * toi_prune_read_page()
- * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
- *
- * Retrieve data from later modules or from a previously loaded page and
- * fill the input buffer.
- * Returns: Zero if successful, or an error from this module or from downstream.
- */
-static int toi_prune_read_page(unsigned long *index, int buf_type,
- void *buffer_page, unsigned int *buf_size)
-{
- int ret, cpu = smp_processor_id();
- unsigned int len;
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
- if (!ctx->desc.tfm)
- return next_driver->read_page(index, TOI_PAGE, buffer_page,
- buf_size);
-
- /*
- * All our reads must be synchronous - we can't handle
- * data that hasn't been read yet.
- */
-
- ret = next_driver->read_page(index, buf_type, buffer_page, &len);
-
- if (len == PRUNE_DATA_IS_PFN) {
- /*
- * The buffer holds the pfn of an identical page written
- * earlier; resolving that reference back into page contents
- * was never implemented in this version of the module.
- */
- }
-
- *buf_size = len;
- return ret;
-}
-
-/*
- * toi_prune_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Print information to be recorded for debugging purposes into a buffer.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_prune_print_debug_stats(char *buffer, int size)
-{
- int len;
-
- /* Output the hash algorithm and the number of pages pruned. */
- if (*toi_prune_hash_algo_name)
- len = scnprintf(buffer, size, "- Hash algorithm is '%s'.\n",
- toi_prune_hash_algo_name);
- else
- len = scnprintf(buffer, size, "- Hash algorithm is not set.\n");
-
- if (toi_pruned_pages)
- len += scnprintf(buffer + len, size - len, " Pruned "
- "%lu pages.\n",
- toi_pruned_pages);
- return len;
-}
-
-/*
- * toi_prune_memory_needed
- *
- * Tell the caller how much memory we need to operate during hibernate/resume.
- * Returns: Unsigned long. Maximum number of bytes of memory required for
- * operation.
- */
-static int toi_prune_memory_needed(void)
-{
- return 2 * PAGE_SIZE;
-}
-
-static int toi_prune_storage_needed(void)
-{
- return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
- strlen(toi_prune_hash_algo_name) + 1;
-}
-
-/*
- * toi_prune_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save information needed when reloading the image at resume time.
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_prune_save_config_info(char *buffer)
-{
- int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0;
-
- *((unsigned long *) buffer) = toi_pruned_pages;
- offset += sizeof(unsigned long);
- *((int *) (buffer + offset)) = len;
- offset += sizeof(int);
- strncpy(buffer + offset, toi_prune_hash_algo_name, len);
- return offset + len;
-}
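
The save/load pair above round-trips a small length-prefixed record. For
clarity, the layout it writes and parses is:

	/*
	 * [unsigned long toi_pruned_pages][int len][len bytes: algo + NUL]
	 *
	 * toi_prune_save_config_info() emits the fields in this order and
	 * toi_prune_load_config_info() consumes them the same way; note
	 * that toi_prune_storage_needed() reserves two of each fixed-size
	 * field, i.e. slightly more than is actually written.
	 */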
-
-/* toi_prune_load_config_info
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * Description: Reload information needed for passing back to the
- * resumed kernel.
- */
-static void toi_prune_load_config_info(char *buffer, int size)
-{
- int len, offset = 0;
-
- toi_pruned_pages = *((unsigned long *) buffer);
- offset += sizeof(unsigned long);
- len = *((int *) (buffer + offset));
- offset += sizeof(int);
- strncpy(toi_prune_hash_algo_name, buffer + offset, len);
-}
-
-static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
- bkd->pruned_pages = toi_pruned_pages;
-}
-
-static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
- toi_pruned_pages = bkd->pruned_pages;
-}
-
-/*
- * toi_expected_ratio
- *
- * Description: Returns the expected ratio between data passed into this module
- * and the amount of data output when writing.
- * Returns: 100 - we have no idea how many pages will be pruned.
- */
-
-static int toi_prune_expected_ratio(void)
-{
- return 100;
-}
-
-/*
- * data for our sysfs entries.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0,
- NULL),
- SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL),
-};
-
-/*
- * Ops structure.
- */
-static struct toi_module_ops toi_prune_ops = {
- .type = FILTER_MODULE,
- .name = "prune",
- .directory = "prune",
- .module = THIS_MODULE,
- .initialise = toi_prune_init,
- .memory_needed = toi_prune_memory_needed,
- .print_debug_info = toi_prune_print_debug_stats,
- .save_config_info = toi_prune_save_config_info,
- .load_config_info = toi_prune_load_config_info,
- .storage_needed = toi_prune_storage_needed,
- .expected_compression = toi_prune_expected_ratio,
-
- .pre_atomic_restore = toi_prune_pre_atomic_restore,
- .post_atomic_restore = toi_prune_post_atomic_restore,
-
- .rw_init = toi_prune_rw_init,
- .rw_cleanup = toi_prune_rw_cleanup,
-
- .write_page = toi_prune_write_page,
- .read_page = toi_prune_read_page,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-
-static __init int toi_prune_load(void)
-{
- return toi_register_module(&toi_prune_ops);
-}
-
-late_initcall(toi_prune_load);
diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c
deleted file mode 100644
index d8539c275..000000000
--- a/kernel/power/tuxonice_storage.c
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * kernel/power/tuxonice_storage.c
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for talking to a userspace program that manages storage.
- *
- * The kernel side:
- * - starts the userspace program;
- * - sends messages telling it when to open and close the connection;
- * - tells it when to quit;
- *
- * The user space side:
- * - passes messages regarding status;
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/freezer.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_ui.h"
-
-static struct user_helper_data usm_helper_data;
-static struct toi_module_ops usm_ops;
-static int message_received, usm_prepare_count;
-static int storage_manager_last_action, storage_manager_action;
-
-static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
- int type;
- int *data;
-
- type = nlh->nlmsg_type;
-
-	/* A control message: ignore it */
- if (type < NETLINK_MSG_BASE)
- return 0;
-
- /* Unknown message: reply with EINVAL */
- if (type >= USM_MSG_MAX)
- return -EINVAL;
-
- /* All operations require privileges, even GET */
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- /* Only allow one task to receive NOFREEZE privileges */
- if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1)
- return -EBUSY;
-
- data = (int *) NLMSG_DATA(nlh);
-
- switch (type) {
- case USM_MSG_SUCCESS:
- case USM_MSG_FAILED:
- message_received = type;
- complete(&usm_helper_data.wait_for_process);
- break;
- default:
- printk(KERN_INFO "Storage manager doesn't recognise "
- "message %d.\n", type);
- }
-
- return 1;
-}
-
-#ifdef CONFIG_NET
-static int activations;
-
-int toi_activate_storage(int force)
-{
- int tries = 1;
-
- if (usm_helper_data.pid == -1 || !usm_ops.enabled)
- return 0;
-
- message_received = 0;
- activations++;
-
- if (activations > 1 && !force)
- return 0;
-
- while ((!message_received || message_received == USM_MSG_FAILED) &&
- tries < 2) {
- toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt "
- "%d.\n", tries);
-
- init_completion(&usm_helper_data.wait_for_process);
-
- toi_send_netlink_message(&usm_helper_data,
- USM_MSG_CONNECT,
- NULL, 0);
-
- /* Wait 2 seconds for the userspace process to make contact */
- wait_for_completion_timeout(&usm_helper_data.wait_for_process,
- 2*HZ);
-
- tries++;
- }
-
- return 0;
-}
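
Both the activate and deactivate paths above follow the same handshake: the
kernel re-arms a completion, sends an asynchronous netlink message, then
sleeps until the userspace reply (seen in usm_user_rcv_msg()) completes it
or a two-second timeout fires. A generic sketch of that pattern, with
hypothetical names:

	static DECLARE_COMPLETION(example_ack);
	static int example_status;

	/* Called from the netlink receive handler on reply. */
	static void example_on_reply(int status)
	{
		example_status = status;
		complete(&example_ack);
	}

	static int example_request(void (*send_msg)(void))
	{
		reinit_completion(&example_ack);
		send_msg();	/* e.g. toi_send_netlink_message(...) */
		if (!wait_for_completion_timeout(&example_ack, 2 * HZ))
			return -ETIMEDOUT;
		return example_status;
	}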
-
-int toi_deactivate_storage(int force)
-{
- if (usm_helper_data.pid == -1 || !usm_ops.enabled)
- return 0;
-
- message_received = 0;
- activations--;
-
- if (activations && !force)
- return 0;
-
- init_completion(&usm_helper_data.wait_for_process);
-
- toi_send_netlink_message(&usm_helper_data,
- USM_MSG_DISCONNECT,
- NULL, 0);
-
- wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
-
- if (!message_received || message_received == USM_MSG_FAILED) {
- printk(KERN_INFO "Returning failure disconnecting storage.\n");
- return 1;
- }
-
- return 0;
-}
-#endif
-
-static void storage_manager_simulate(void)
-{
- printk(KERN_INFO "--- Storage manager simulate ---\n");
- toi_prepare_usm();
- schedule();
- printk(KERN_INFO "--- Activate storage 1 ---\n");
- toi_activate_storage(1);
- schedule();
- printk(KERN_INFO "--- Deactivate storage 1 ---\n");
- toi_deactivate_storage(1);
- schedule();
- printk(KERN_INFO "--- Cleanup usm ---\n");
- toi_cleanup_usm();
- schedule();
- printk(KERN_INFO "--- Storage manager simulate ends ---\n");
-}
-
-static int usm_storage_needed(void)
-{
- return sizeof(int) + strlen(usm_helper_data.program) + 1;
-}
-
-static int usm_save_config_info(char *buf)
-{
-	int len = strlen(usm_helper_data.program);
-
-	*((int *) buf) = len + 1;
-	memcpy(buf + sizeof(int), usm_helper_data.program, len + 1);
-	return sizeof(int) + len + 1;
-}
-
-static void usm_load_config_info(char *buf, int size)
-{
- /* Don't load the saved path if one has already been set */
- if (usm_helper_data.program[0])
- return;
-
- memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf));
-}
-
-static int usm_memory_needed(void)
-{
-	/* ballpark figure of 32 pages */
- return 32 * PAGE_SIZE;
-}
-
-/* toi_prepare_usm
- */
-int toi_prepare_usm(void)
-{
- usm_prepare_count++;
-
- if (usm_prepare_count > 1 || !usm_ops.enabled)
- return 0;
-
- usm_helper_data.pid = -1;
-
- if (!*usm_helper_data.program)
- return 0;
-
- toi_netlink_setup(&usm_helper_data);
-
- if (usm_helper_data.pid == -1)
- printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't"
- " start it.\n");
-
- toi_activate_storage(0);
-
- return usm_helper_data.pid != -1;
-}
-
-void toi_cleanup_usm(void)
-{
- usm_prepare_count--;
-
- if (usm_helper_data.pid > -1 && !usm_prepare_count) {
- toi_deactivate_storage(0);
- toi_netlink_close(&usm_helper_data);
- }
-}
-
-static void storage_manager_activate(void)
-{
- if (storage_manager_action == storage_manager_last_action)
- return;
-
- if (storage_manager_action)
- toi_prepare_usm();
- else
- toi_cleanup_usm();
-
- storage_manager_last_action = storage_manager_action;
-}
-
-/*
- * User interface specific /sys/power/tuxonice entries.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate),
- SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL),
- SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0,
- NULL),
-	SYSFS_INT("activate_storage", SYSFS_RW, &storage_manager_action, 0, 1,
- 0, storage_manager_activate)
-};
-
-static struct toi_module_ops usm_ops = {
- .type = MISC_MODULE,
- .name = "usm",
- .directory = "storage_manager",
- .module = THIS_MODULE,
- .storage_needed = usm_storage_needed,
- .save_config_info = usm_save_config_info,
- .load_config_info = usm_load_config_info,
- .memory_needed = usm_memory_needed,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* toi_usm_init
- * Description: Boot time initialisation for the storage manager.
- */
-int toi_usm_init(void)
-{
- usm_helper_data.nl = NULL;
- usm_helper_data.program[0] = '\0';
- usm_helper_data.pid = -1;
- usm_helper_data.skb_size = 0;
- usm_helper_data.pool_limit = 6;
- usm_helper_data.netlink_id = NETLINK_TOI_USM;
- usm_helper_data.name = "userspace storage manager";
- usm_helper_data.rcv_msg = usm_user_rcv_msg;
- usm_helper_data.interface_version = 2;
- usm_helper_data.must_init = 0;
- init_completion(&usm_helper_data.wait_for_process);
-
- return toi_register_module(&usm_ops);
-}
-
-void toi_usm_exit(void)
-{
- toi_netlink_close_complete(&usm_helper_data);
- toi_unregister_module(&usm_ops);
-}
diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h
deleted file mode 100644
index 0189c8888..000000000
--- a/kernel/power/tuxonice_storage.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * kernel/power/tuxonice_storage.h
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifdef CONFIG_NET
-int toi_prepare_usm(void);
-void toi_cleanup_usm(void);
-
-int toi_activate_storage(int force);
-int toi_deactivate_storage(int force);
-extern int toi_usm_init(void);
-extern void toi_usm_exit(void);
-#else
-static inline int toi_usm_init(void) { return 0; }
-static inline void toi_usm_exit(void) { }
-
-static inline int toi_activate_storage(int force)
-{
- return 0;
-}
-
-static inline int toi_deactivate_storage(int force)
-{
- return 0;
-}
-
-static inline int toi_prepare_usm(void) { return 0; }
-static inline void toi_cleanup_usm(void) { }
-#endif
-
-enum {
- USM_MSG_BASE = 0x10,
-
- /* Kernel -> Userspace */
- USM_MSG_CONNECT = 0x30,
- USM_MSG_DISCONNECT = 0x31,
- USM_MSG_SUCCESS = 0x40,
- USM_MSG_FAILED = 0x41,
-
- USM_MSG_MAX,
-};
diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c
deleted file mode 100644
index 9f555c932..000000000
--- a/kernel/power/tuxonice_swap.c
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
- * kernel/power/tuxonice_swap.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file encapsulates functions for usage of swap space as a
- * backing store.
- */
-
-#include <linux/suspend.h>
-#include <linux/blkdev.h>
-#include <linux/swapops.h>
-#include <linux/swap.h>
-#include <linux/syscalls.h>
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_builtin.h"
-
-static struct toi_module_ops toi_swapops;
-
-/* For swapfile automatically swapon/off'd. */
-static char swapfilename[255] = "";
-static int toi_swapon_status;
-
-/* Swap Pages */
-static unsigned long swap_allocated;
-
-static struct sysinfo swapinfo;
-
-static int is_ram_backed(struct swap_info_struct *si)
-{
- if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) ||
- !strncmp(si->bdev->bd_disk->disk_name, "zram", 4))
- return 1;
-
- return 0;
-}
-
-/**
- * enable_swapfile: Swapon the user specified swapfile prior to hibernating.
- *
- * Activate the given swapfile if it wasn't already enabled. Remember whether
- * we really did swapon it for swapoffing later.
- */
-static void enable_swapfile(void)
-{
- int activateswapresult = -EINVAL;
-
- if (swapfilename[0]) {
- /* Attempt to swap on with maximum priority */
- activateswapresult = sys_swapon(swapfilename, 0xFFFF);
- if (activateswapresult && activateswapresult != -EBUSY)
- printk(KERN_ERR "TuxOnIce: The swapfile/partition "
- "specified by /sys/power/tuxonice/swap/swapfile"
- " (%s) could not be turned on (error %d). "
- "Attempting to continue.\n",
- swapfilename, activateswapresult);
- if (!activateswapresult)
- toi_swapon_status = 1;
- }
-}
-
-/**
- * disable_swapfile: Swapoff any file swaponed at the start of the cycle.
- *
- * If we did successfully swapon a file at the start of the cycle, swapoff
- * it now (finishing up).
- */
-static void disable_swapfile(void)
-{
- if (!toi_swapon_status)
- return;
-
- sys_swapoff(swapfilename);
- toi_swapon_status = 0;
-}
-
-static int add_blocks_to_extent_chain(struct toi_bdev_info *chain,
- unsigned long start, unsigned long end)
-{
- if (test_action_state(TOI_TEST_BIO))
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to "
- "chain %p.", start << chain->bmap_shift,
- end << chain->bmap_shift, chain);
-
- return toi_add_to_extent_chain(&chain->blocks, start, end);
-}
-
-
-static int get_main_pool_phys_params(struct toi_bdev_info *chain)
-{
- struct hibernate_extent *extentpointer = NULL;
- unsigned long address, extent_min = 0, extent_max = 0;
- int empty = 1;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for "
- "chain %d.", chain->allocator_index);
-
- if (!chain->allocations.first)
- return 0;
-
- if (chain->blocks.first)
- toi_put_extent_chain(&chain->blocks);
-
- toi_extent_for_each(&chain->allocations, extentpointer, address) {
- swp_entry_t swap_address = (swp_entry_t) { address };
- struct block_device *bdev;
- sector_t new_sector = map_swap_entry(swap_address, &bdev);
-
- if (empty) {
- empty = 0;
- extent_min = extent_max = new_sector;
- continue;
- }
-
- if (new_sector == extent_max + 1) {
- extent_max++;
- continue;
- }
-
- if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
- printk(KERN_ERR "Out of memory while making block "
- "chains.\n");
- return -ENOMEM;
- }
-
- extent_min = new_sector;
- extent_max = new_sector;
- }
-
- if (!empty &&
- add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
- printk(KERN_ERR "Out of memory while making block chains.\n");
- return -ENOMEM;
- }
-
- return 0;
-}
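
The loop above is a straightforward run-merging pass: sectors are visited in
allocation order, consecutive values extend the current extent, and any gap
flushes it and starts a new one. The same idea in isolation, with a
hypothetical emit callback:

	/* Collapse a sector list into (min, max) extents of consecutive runs. */
	static void example_merge_runs(const sector_t *sectors, int count,
				void (*emit)(sector_t min, sector_t max))
	{
		sector_t min, max;
		int i;

		if (!count)
			return;

		min = max = sectors[0];
		for (i = 1; i < count; i++) {
			if (sectors[i] == max + 1) {
				max++;
				continue;
			}
			emit(min, max);
			min = max = sectors[i];
		}
		emit(min, max);
	}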
-
-/*
- * Like si_swapinfo, except that we don't include ram backed swap (compcache!)
- * and don't need to use the spinlocks (userspace is stopped when this
- * function is called).
- */
-void si_swapinfo_no_compcache(void)
-{
- unsigned int i;
-
- si_swapinfo(&swapinfo);
- swapinfo.freeswap = 0;
- swapinfo.totalswap = 0;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- struct swap_info_struct *si = get_swap_info_struct(i);
- if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) {
- swapinfo.totalswap += si->inuse_pages;
- swapinfo.freeswap += si->pages - si->inuse_pages;
- }
- }
-}
-/*
- * We can't just remember the value from allocation time, because other
- * processes might have allocated swap in the mean time.
- */
-static unsigned long toi_swap_storage_available(void)
-{
- toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available.");
- si_swapinfo_no_compcache();
- return swapinfo.freeswap + swap_allocated;
-}
-
-static int toi_swap_initialise(int starting_cycle)
-{
- if (!starting_cycle)
- return 0;
-
- enable_swapfile();
- return 0;
-}
-
-static void toi_swap_cleanup(int ending_cycle)
-{
- if (!ending_cycle)
- return;
-
- disable_swapfile();
-}
-
-static void toi_swap_free_storage(struct toi_bdev_info *chain)
-{
- /* Free swap entries */
- struct hibernate_extent *extentpointer;
- unsigned long extentvalue;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.",
- chain);
-
- swap_allocated -= chain->allocations.size;
- toi_extent_for_each(&chain->allocations, extentpointer, extentvalue)
- swap_free((swp_entry_t) { extentvalue });
-
- toi_put_extent_chain(&chain->allocations);
-}
-
-static void free_swap_range(unsigned long min, unsigned long max)
-{
- int j;
-
- for (j = min; j <= max; j++)
- swap_free((swp_entry_t) { j });
- swap_allocated -= (max - min + 1);
-}
-
-/*
- * Allocation of a single swap type. Swap priorities are handled at the higher
- * level.
- */
-static int toi_swap_allocate_storage(struct toi_bdev_info *chain,
- unsigned long request)
-{
- unsigned long gotten = 0;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to"
- " allocate %lu pages from device %d.", request,
- chain->allocator_index);
-
- while (gotten < request) {
- swp_entry_t start, end;
- if (0) {
- /* Broken at the moment for SSDs */
- get_swap_range_of_type(chain->allocator_index, &start, &end,
- request - gotten + 1);
- } else {
- start = end = get_swap_page_of_type(chain->allocator_index);
- }
- if (start.val) {
- int added = end.val - start.val + 1;
- if (toi_add_to_extent_chain(&chain->allocations,
- start.val, end.val)) {
- printk(KERN_INFO "Failed to allocate extent for "
- "%lu-%lu.\n", start.val, end.val);
- free_swap_range(start.val, end.val);
- break;
- }
- gotten += added;
- swap_allocated += added;
- } else
- break;
- }
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten);
- return gotten;
-}
-
-static int toi_swap_register_storage(void)
-{
- int i, result = 0;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage.");
- for (i = 0; i < MAX_SWAPFILES; i++) {
- struct swap_info_struct *si = get_swap_info_struct(i);
- struct toi_bdev_info *devinfo;
- unsigned char *p;
- unsigned char buf[256];
- struct fs_info *fs_info;
-
- if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si))
- continue;
-
- devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info),
- GFP_ATOMIC);
- if (!devinfo) {
-			printk(KERN_ERR "Failed to allocate devinfo struct "
-					"for swap device %d.\n", i);
- return -ENOMEM;
- }
-
- devinfo->bdev = si->bdev;
- devinfo->allocator = &toi_swapops;
- devinfo->allocator_index = i;
-
- fs_info = fs_info_from_block_dev(si->bdev);
-		if (!IS_ERR_OR_NULL(fs_info)) {
-			memcpy(devinfo->uuid, &fs_info->uuid, 16);
-			free_fs_info(fs_info);
-		} else {
-			result = (int) PTR_ERR(fs_info);
-			printk(KERN_ERR "fs_info from block dev returned "
-					"%d.\n", result);
-		}
- devinfo->dev_t = si->bdev->bd_dev;
- devinfo->prio = si->prio;
- devinfo->bmap_shift = 3;
- devinfo->blocks_per_page = 1;
-
- p = d_path(&si->swap_file->f_path, buf, sizeof(buf));
- sprintf(devinfo->name, "swap on %s", p);
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:"
- " Device %d (%lx), prio %d.", i,
- (unsigned long) devinfo->dev_t, devinfo->prio);
- toi_bio_ops.register_storage(devinfo);
- }
-
- return 0;
-}
-
-static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used)
-{
- struct hibernate_extent *extentpointer = NULL;
- unsigned long extentvalue;
- unsigned long i = 0, first_freed = 0;
-
- toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) {
- i++;
- if (i > used) {
- swap_free((swp_entry_t) { extentvalue });
- if (!first_freed)
- first_freed = extentvalue;
- }
- }
-
- return first_freed;
-}
-
-/*
- * toi_swap_memory_needed
- *
- * Description:
- * Returns the number of bytes of RAM needed for this
- * code to do its work. (Used when calculating whether
- * we have enough memory to be able to hibernate & resume).
- *
- */
-static int toi_swap_memory_needed(void)
-{
- return 1;
-}
-
-/*
- * toi_swap_print_debug_stats
- *
- * Description: Print status of the swap allocator for the debug log.
- */
-static int toi_swap_print_debug_stats(char *buffer, int size)
-{
- int len = 0;
-
- len = scnprintf(buffer, size, "- Swap Allocator enabled.\n");
- if (swapfilename[0])
- len += scnprintf(buffer+len, size-len,
- " Attempting to automatically swapon: %s.\n",
- swapfilename);
-
- si_swapinfo_no_compcache();
-
- len += scnprintf(buffer+len, size-len,
- " Swap available for image: %lu pages.\n",
- swapinfo.freeswap + swap_allocated);
-
- return len;
-}
-
-static int header_locations_read_sysfs(const char *page, int count)
-{
- int i, printedpartitionsmessage = 0, len = 0, haveswap = 0;
- struct inode *swapf = NULL;
- int zone;
- char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL);
- char *path, *output = (char *) page;
- int path_len;
-
- if (!page)
- return 0;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- struct swap_info_struct *si = get_swap_info_struct(i);
-
- if (!si || !(si->flags & SWP_WRITEOK))
- continue;
-
- if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) {
- haveswap = 1;
- if (!printedpartitionsmessage) {
- len += sprintf(output + len,
- "For swap partitions, simply use the "
- "format: resume=swap:/dev/hda1.\n");
- printedpartitionsmessage = 1;
- }
- } else {
-			path = d_path(&si->swap_file->f_path, path_page,
-					PAGE_SIZE);
-			/*
-			 * d_path returns a pointer into path_page, so move
-			 * (not snprintf over) the overlapping string.
-			 */
-			path_len = strlen(path);
-			memmove(path_page, path, path_len + 1);
-
- haveswap = 1;
- swapf = si->swap_file->f_mapping->host;
- zone = bmap(swapf, 0);
- if (!zone) {
-				len += sprintf(output + len,
-					"Swapfile %s has been corrupted. Rerun"
-					" mkswap on it and try again.\n",
-					path_page);
- } else {
- char name_buffer[BDEVNAME_SIZE];
- len += sprintf(output + len,
- "For swapfile `%s`,"
- " use resume=swap:/dev/%s:0x%x.\n",
- path_page,
- bdevname(si->bdev, name_buffer),
- zone << (swapf->i_blkbits - 9));
- }
- }
- }
-
- if (!haveswap)
- len = sprintf(output, "You need to turn on swap partitions "
- "before examining this file.\n");
-
- toi_free_page(10, (unsigned long) path_page);
- return len;
-}
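
The offset printed for a swapfile is its first block rescaled to 512-byte
sectors: bmap() reports filesystem blocks, and shifting by (i_blkbits - 9)
converts them. A worked example with made-up numbers:

	/*
	 * With 4 KiB filesystem blocks (i_blkbits == 12) and
	 * bmap(swapf, 0) == 0x12345:
	 *
	 *	sector = 0x12345 << (12 - 9) = 0x91a28
	 *
	 * giving a boot parameter of resume=swap:/dev/sda2:0x91a28.
	 */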
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL),
- SYSFS_CUSTOM("headerlocations", SYSFS_READONLY,
- header_locations_read_sysfs, NULL, 0, NULL),
- SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0,
- attempt_to_parse_resume_device2),
-};
-
-static struct toi_bio_allocator_ops toi_bio_swapops = {
- .register_storage = toi_swap_register_storage,
- .storage_available = toi_swap_storage_available,
- .allocate_storage = toi_swap_allocate_storage,
- .bmap = get_main_pool_phys_params,
- .free_storage = toi_swap_free_storage,
- .free_unused_storage = toi_swap_free_unused_storage,
-};
-
-static struct toi_module_ops toi_swapops = {
- .type = BIO_ALLOCATOR_MODULE,
- .name = "swap storage",
- .directory = "swap",
- .module = THIS_MODULE,
- .memory_needed = toi_swap_memory_needed,
- .print_debug_info = toi_swap_print_debug_stats,
- .initialise = toi_swap_initialise,
- .cleanup = toi_swap_cleanup,
- .bio_allocator_ops = &toi_bio_swapops,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-static __init int toi_swap_load(void)
-{
- return toi_register_module(&toi_swapops);
-}
-
-late_initcall(toi_swap_load);
diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c
deleted file mode 100644
index 77f36dbeb..000000000
--- a/kernel/power/tuxonice_sysfs.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * kernel/power/tuxonice_sysfs.c
- *
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains support for sysfs entries for tuning TuxOnIce.
- *
- * We have a generic handler that deals with the most common cases, and
- * hooks for special handlers to use.
- */
-
-#include <linux/suspend.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_alloc.h"
-
-static int toi_sysfs_initialised;
-
-static void toi_initialise_sysfs(void);
-
-static struct toi_sysfs_data sysfs_params[];
-
-#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr)
-
-static void toi_main_wrapper(void)
-{
- toi_try_hibernate();
-}
-
-static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr,
- char *page)
-{
- struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
- int len = 0;
- int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ;
-
- if (full_prep && toi_start_anything(0))
- return -EBUSY;
-
- if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
- toi_prepare_usm();
-
- switch (sysfs_data->type) {
- case TOI_SYSFS_DATA_CUSTOM:
- len = (sysfs_data->data.special.read_sysfs) ?
- (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE)
- : 0;
- break;
- case TOI_SYSFS_DATA_BIT:
- len = sprintf(page, "%d\n",
- -test_bit(sysfs_data->data.bit.bit,
- sysfs_data->data.bit.bit_vector));
- break;
- case TOI_SYSFS_DATA_INTEGER:
- len = sprintf(page, "%d\n",
- *(sysfs_data->data.integer.variable));
- break;
- case TOI_SYSFS_DATA_LONG:
- len = sprintf(page, "%ld\n",
- *(sysfs_data->data.a_long.variable));
- break;
- case TOI_SYSFS_DATA_UL:
- len = sprintf(page, "%lu\n",
- *(sysfs_data->data.ul.variable));
- break;
- case TOI_SYSFS_DATA_STRING:
- len = sprintf(page, "%s\n",
- sysfs_data->data.string.variable);
- break;
- }
-
- if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
- toi_cleanup_usm();
-
- if (full_prep)
- toi_finish_anything(0);
-
- return len;
-}
-
-#define BOUND(_variable, _type) do { \
- if (*_variable < sysfs_data->data._type.minimum) \
- *_variable = sysfs_data->data._type.minimum; \
- else if (*_variable > sysfs_data->data._type.maximum) \
- *_variable = sysfs_data->data._type.maximum; \
-} while (0)
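
BOUND() clamps a freshly parsed value against the limits stored in the
matching arm of the sysfs_data union. For an integer entry, for instance,
BOUND(ptr, integer) expands to roughly:

	if (*ptr < sysfs_data->data.integer.minimum)
		*ptr = sysfs_data->data.integer.minimum;
	else if (*ptr > sysfs_data->data.integer.maximum)
		*ptr = sysfs_data->data.integer.maximum;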
-
-static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr,
- const char *my_buf, size_t count)
-{
- int assigned_temp_buffer = 0, result = count;
- struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
-
- if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME)))
- return -EBUSY;
-
- ((char *) my_buf)[count] = 0;
-
- if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
- toi_prepare_usm();
-
- switch (sysfs_data->type) {
- case TOI_SYSFS_DATA_CUSTOM:
- if (sysfs_data->data.special.write_sysfs)
- result = (sysfs_data->data.special.write_sysfs)(my_buf,
- count);
- break;
- case TOI_SYSFS_DATA_BIT:
- {
- unsigned long value;
- result = kstrtoul(my_buf, 0, &value);
- if (result)
- break;
- if (value)
- set_bit(sysfs_data->data.bit.bit,
- (sysfs_data->data.bit.bit_vector));
- else
- clear_bit(sysfs_data->data.bit.bit,
- (sysfs_data->data.bit.bit_vector));
- }
- break;
- case TOI_SYSFS_DATA_INTEGER:
- {
- long temp;
- result = kstrtol(my_buf, 0, &temp);
- if (result)
- break;
- *(sysfs_data->data.integer.variable) = (int) temp;
- BOUND(sysfs_data->data.integer.variable, integer);
- break;
- }
- case TOI_SYSFS_DATA_LONG:
- {
- long *variable =
- sysfs_data->data.a_long.variable;
- result = kstrtol(my_buf, 0, variable);
- if (result)
- break;
- BOUND(variable, a_long);
- break;
- }
- case TOI_SYSFS_DATA_UL:
- {
- unsigned long *variable =
- sysfs_data->data.ul.variable;
- result = kstrtoul(my_buf, 0, variable);
- if (result)
- break;
- BOUND(variable, ul);
- break;
- }
- case TOI_SYSFS_DATA_STRING:
- {
- int copy_len = count;
- char *variable =
- sysfs_data->data.string.variable;
-
- if (sysfs_data->data.string.max_length &&
- (copy_len > sysfs_data->data.string.max_length))
- copy_len = sysfs_data->data.string.max_length;
-
- if (!variable) {
- variable = (char *) toi_get_zeroed_page(31,
- TOI_ATOMIC_GFP);
- sysfs_data->data.string.variable = variable;
- assigned_temp_buffer = 1;
- }
-			strncpy(variable, my_buf, copy_len);
-			if (copy_len && my_buf[copy_len - 1] == '\n')
-				variable[copy_len - 1] = 0;
-			variable[copy_len] = 0;
- }
- break;
- }
-
- if (!result)
- result = count;
-
- /* Side effect routine? */
- if (result == count && sysfs_data->write_side_effect)
- sysfs_data->write_side_effect();
-
- /* Free temporary buffers */
- if (assigned_temp_buffer) {
- toi_free_page(31,
- (unsigned long) sysfs_data->data.string.variable);
- sysfs_data->data.string.variable = NULL;
- }
-
- if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
- toi_cleanup_usm();
-
- toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME);
-
- return result;
-}
-
-static struct sysfs_ops toi_sysfs_ops = {
- .show = &toi_attr_show,
- .store = &toi_attr_store,
-};
-
-static struct kobj_type toi_ktype = {
- .sysfs_ops = &toi_sysfs_ops,
-};
-
-struct kobject *tuxonice_kobj;
-
-/* Non-module sysfs entries.
- *
- * This array contains entries that are automatically registered at
- * boot. Modules and the console code register their own entries separately.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL,
- SYSFS_HIBERNATING, toi_main_wrapper),
- SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL,
- SYSFS_RESUMING, toi_try_resume)
-};
-
-void remove_toi_sysdir(struct kobject *kobj)
-{
- if (!kobj)
- return;
-
- kobject_put(kobj);
-}
-
-struct kobject *make_toi_sysdir(char *name)
-{
- struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj);
-
- if (!kobj) {
- printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs "
- "dir!\n");
- return NULL;
- }
-
- kobj->ktype = &toi_ktype;
-
- return kobj;
-}
-
-/* toi_register_sysfs_file
- *
- * Helper for registering a new /sys/power/tuxonice entry.
- */
-
-int toi_register_sysfs_file(
- struct kobject *kobj,
- struct toi_sysfs_data *toi_sysfs_data)
-{
- int result;
-
- if (!toi_sysfs_initialised)
- toi_initialise_sysfs();
-
- result = sysfs_create_file(kobj, &toi_sysfs_data->attr);
- if (result)
- printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s "
- "returned %d.\n",
- toi_sysfs_data->attr.name, result);
- kobj->ktype = &toi_ktype;
-
- return result;
-}
-
-/* toi_unregister_sysfs_file
- *
- * Helper for removing unwanted /sys/power/tuxonice entries.
- *
- */
-void toi_unregister_sysfs_file(struct kobject *kobj,
- struct toi_sysfs_data *toi_sysfs_data)
-{
- sysfs_remove_file(kobj, &toi_sysfs_data->attr);
-}
-
-void toi_cleanup_sysfs(void)
-{
- int i,
- numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
-
- if (!toi_sysfs_initialised)
- return;
-
- for (i = 0; i < numfiles; i++)
- toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-
- kobject_put(tuxonice_kobj);
- toi_sysfs_initialised = 0;
-}
-
-/* toi_initialise_sysfs
- *
- * Initialise the /sys/power/tuxonice directory.
- */
-
-static void toi_initialise_sysfs(void)
-{
- int i;
- int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
-
- if (toi_sysfs_initialised)
- return;
-
- /* Make our TuxOnIce directory a child of /sys/power */
- tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj);
- if (!tuxonice_kobj)
- return;
-
- toi_sysfs_initialised = 1;
-
- for (i = 0; i < numfiles; i++)
- toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-}
-
-int toi_sysfs_init(void)
-{
- toi_initialise_sysfs();
- return 0;
-}
-
-void toi_sysfs_exit(void)
-{
- toi_cleanup_sysfs();
-}
diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h
deleted file mode 100644
index 1de954ce1..000000000
--- a/kernel/power/tuxonice_sysfs.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * kernel/power/tuxonice_sysfs.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/sysfs.h>
-
-struct toi_sysfs_data {
- struct attribute attr;
- int type;
- int flags;
- union {
- struct {
- unsigned long *bit_vector;
- int bit;
- } bit;
- struct {
- int *variable;
- int minimum;
- int maximum;
- } integer;
- struct {
- long *variable;
- long minimum;
- long maximum;
- } a_long;
- struct {
- unsigned long *variable;
- unsigned long minimum;
- unsigned long maximum;
- } ul;
- struct {
- char *variable;
- int max_length;
- } string;
- struct {
- int (*read_sysfs) (const char *buffer, int count);
- int (*write_sysfs) (const char *buffer, int count);
- void *data;
- } special;
- } data;
-
-	/* Side-effect routine. Used, e.g., for reparsing the
-	 * resume= entry when it changes */
- void (*write_side_effect) (void);
- struct list_head sysfs_data_list;
-};
-
-enum {
- TOI_SYSFS_DATA_NONE = 1,
- TOI_SYSFS_DATA_CUSTOM,
- TOI_SYSFS_DATA_BIT,
- TOI_SYSFS_DATA_INTEGER,
- TOI_SYSFS_DATA_UL,
- TOI_SYSFS_DATA_LONG,
- TOI_SYSFS_DATA_STRING
-};
-
-#define SYSFS_WRITEONLY 0200
-#define SYSFS_READONLY 0444
-#define SYSFS_RW 0644
-
-#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_BIT, \
- .flags = _flags, \
- .data = { .bit = { .bit_vector = _ul, .bit = _bit } } }
-
-#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_INTEGER, \
- .flags = _flags, \
- .data = { .integer = { .variable = _int, .minimum = _min, \
- .maximum = _max } }, \
- .write_side_effect = _wse }
-
-#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_UL, \
- .flags = _flags, \
- .data = { .ul = { .variable = _ul, .minimum = _min, \
- .maximum = _max } } }
-
-#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_LONG, \
- .flags = _flags, \
- .data = { .a_long = { .variable = _long, .minimum = _min, \
- .maximum = _max } } }
-
-#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_STRING, \
- .flags = _flags, \
- .data = { .string = { .variable = _string, .max_length = _max_len } }, \
- .write_side_effect = _wse }
-
-#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_CUSTOM, \
- .flags = _flags, \
- .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \
- .write_side_effect = _wse }
-
-#define SYSFS_NONE(_name, _wse) { \
- .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \
- .type = TOI_SYSFS_DATA_NONE, \
- .write_side_effect = _wse, \
-}
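
A module typically declares its entries as a static array of these
initialisers and points its toi_module_ops at it, as the deleted modules
above do. A small hypothetical example:

	static int example_threshold = 50;
	static char example_name[32];

	static struct toi_sysfs_data example_params[] = {
		SYSFS_INT("threshold", SYSFS_RW, &example_threshold,
			0, 100, 0, NULL),
		SYSFS_STRING("name", SYSFS_RW, example_name, 31, 0, NULL),
	};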
-
-/* Flags */
-#define SYSFS_NEEDS_SM_FOR_READ 1
-#define SYSFS_NEEDS_SM_FOR_WRITE 2
-#define SYSFS_HIBERNATE 4
-#define SYSFS_RESUME 8
-#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME)
-#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE)
-#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE)
-#define SYSFS_NEEDS_SM_FOR_BOTH \
- (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE)
-
-int toi_register_sysfs_file(struct kobject *kobj,
- struct toi_sysfs_data *toi_sysfs_data);
-void toi_unregister_sysfs_file(struct kobject *kobj,
- struct toi_sysfs_data *toi_sysfs_data);
-
-extern struct kobject *tuxonice_kobj;
-
-struct kobject *make_toi_sysdir(char *name);
-void remove_toi_sysdir(struct kobject *obj);
-extern void toi_cleanup_sysfs(void);
-
-extern int toi_sysfs_init(void);
-extern void toi_sysfs_exit(void);
diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c
deleted file mode 100644
index 76152f3ff..000000000
--- a/kernel/power/tuxonice_ui.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * kernel/power/tuxonice_ui.c
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for TuxOnIce's user interface.
- *
- * The user interface code talks to a userspace program via a
- * netlink socket.
- *
- * The kernel side:
- * - starts the userui program;
- * - sends text messages and progress bar status;
- *
- * The user space side:
- * - passes messages regarding user requests (abort, toggle reboot etc)
- *
- */
-
-#define __KERNEL_SYSCALLS__
-
-#include <linux/reboot.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_builtin.h"
-
-static char local_printf_buf[1024]; /* Same as printk - should be safe */
-struct ui_ops *toi_current_ui;
-
-/**
- * toi_wait_for_keypress - Wait for keypress via userui or /dev/console.
- *
- * @timeout: Maximum time to wait.
- *
- * Wait for a keypress, either from userui or /dev/console if userui isn't
- * available. The non-userui path is particularly for at boot-time, prior
- * to userui being started, when we have an important warning to give to
- * the user.
- */
-static char toi_wait_for_keypress(int timeout)
-{
- if (toi_current_ui && toi_current_ui->wait_for_key(timeout))
- return ' ';
-
- return toi_wait_for_keypress_dev_console(timeout);
-}
-
-/* toi_early_boot_message()
- * Description: Handle errors early in the process of booting.
- * The user may press C to continue booting, perhaps
- * invalidating the image, or space to reboot.
- * This works from either the serial console or normally
- * attached keyboard.
- *
- * Note that we come in here from init, while the kernel is
- * locked. If we want to get events from the serial console,
- * we need to temporarily unlock the kernel.
- *
- * toi_early_boot_message may also be called post-boot.
- * In this case, it simply printks the message and returns.
- *
- * Arguments: int Whether we are able to erase the image.
- * int default_answer. What to do when we timeout. This
- * will normally be continue, but the user might
- * provide command line options (__setup) to override
- * particular cases.
- * Char *. Pointer to a string explaining why we're moaning.
- */
-
-#define say(message, a...) printk(KERN_EMERG message, ##a)
-
-void toi_early_boot_message(int message_detail, int default_answer,
- char *warning_reason, ...)
-{
-	unsigned long orig_state = get_toi_state(), continue_req = 0;
-#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
-	unsigned long orig_loglevel = console_loglevel;
-	int can_ask = 1;
-#else
-	int can_ask = 0;
-#endif
-
- va_list args;
- int printed_len;
-
- if (!toi_wait) {
- set_toi_state(TOI_CONTINUE_REQ);
- can_ask = 0;
- }
-
- if (warning_reason) {
- va_start(args, warning_reason);
- printed_len = vsnprintf(local_printf_buf,
- sizeof(local_printf_buf),
- warning_reason,
- args);
- va_end(args);
- }
-
- if (!test_toi_state(TOI_BOOT_TIME)) {
-		printk(KERN_INFO "TuxOnIce: %s\n", local_printf_buf);
- return;
- }
-
- if (!can_ask) {
- continue_req = !!default_answer;
- goto post_ask;
- }
-
-#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
- console_loglevel = 7;
-
- say("=== TuxOnIce ===\n\n");
- if (warning_reason) {
- say("BIG FAT WARNING!! %s\n\n", local_printf_buf);
- switch (message_detail) {
- case 0:
-			say("If you continue booting, note that any image "
-			    "WILL NOT BE REMOVED.\nTuxOnIce is unable to do so "
- "because the appropriate modules aren't\n"
- "loaded. You should manually remove the image "
- "to avoid any\npossibility of corrupting your "
- "filesystem(s) later.\n");
- break;
- case 1:
- say("If you want to use the current TuxOnIce image, "
- "reboot and try\nagain with the same kernel "
- "that you hibernated from. If you want\n"
- "to forget that image, continue and the image "
- "will be erased.\n");
- break;
- }
- say("Press SPACE to reboot or C to continue booting with "
- "this kernel\n\n");
- if (toi_wait > 0)
- say("Default action if you don't select one in %d "
- "seconds is: %s.\n",
- toi_wait,
- default_answer == TOI_CONTINUE_REQ ?
- "continue booting" : "reboot");
- } else {
- say("BIG FAT WARNING!!\n\n"
- "You have tried to resume from this image before.\n"
- "If it failed once, it may well fail again.\n"
- "Would you like to remove the image and boot "
- "normally?\nThis will be equivalent to entering "
- "noresume on the\nkernel command line.\n\n"
- "Press SPACE to remove the image or C to continue "
- "resuming.\n\n");
- if (toi_wait > 0)
- say("Default action if you don't select one in %d "
- "seconds is: %s.\n", toi_wait,
- !!default_answer ?
- "continue resuming" : "remove the image");
- }
- console_loglevel = orig_loglevel;
-
- set_toi_state(TOI_SANITY_CHECK_PROMPT);
- clear_toi_state(TOI_CONTINUE_REQ);
-
- if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */
- continue_req = !!default_answer;
- else
- continue_req = test_toi_state(TOI_CONTINUE_REQ);
-
-#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */
-
-post_ask:
- if ((warning_reason) && (!continue_req))
- kernel_restart(NULL);
-
- restore_toi_state(orig_state);
- if (continue_req)
- set_toi_state(TOI_CONTINUE_REQ);
-}
-
-#undef say
-
-/*
- * User interface specific /sys/power/tuxonice entries.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
-#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
- SYSFS_INT("default_console_level", SYSFS_RW,
- &toi_bkd.toi_default_console_level, 0, 7, 0, NULL),
- SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0,
- 1 << 30, 0),
- SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL,
- 0)
-#endif
-};
-
-static struct toi_module_ops userui_ops = {
- .type = MISC_HIDDEN_MODULE,
- .name = "printk ui",
- .directory = "user_interface",
- .module = THIS_MODULE,
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-int toi_register_ui_ops(struct ui_ops *this_ui)
-{
- if (toi_current_ui) {
- printk(KERN_INFO "Only one TuxOnIce user interface module can "
- "be loaded at a time.");
- return -EBUSY;
- }
-
- toi_current_ui = this_ui;
-
- return 0;
-}
-
-void toi_remove_ui_ops(struct ui_ops *this_ui)
-{
- if (toi_current_ui != this_ui)
- return;
-
- toi_current_ui = NULL;
-}
-
-/* toi_console_sysfs_init
- * Description: Boot time initialisation for user interface.
- */
-
-int toi_ui_init(void)
-{
- return toi_register_module(&userui_ops);
-}
-
-void toi_ui_exit(void)
-{
- toi_unregister_module(&userui_ops);
-}
diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h
deleted file mode 100644
index 4934e3a91..000000000
--- a/kernel/power/tuxonice_ui.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * kernel/power/tuxonice_ui.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- */
-
-enum {
- DONT_CLEAR_BAR,
- CLEAR_BAR
-};
-
-enum {
- /* Userspace -> Kernel */
- USERUI_MSG_ABORT = 0x11,
- USERUI_MSG_SET_STATE = 0x12,
- USERUI_MSG_GET_STATE = 0x13,
- USERUI_MSG_GET_DEBUG_STATE = 0x14,
- USERUI_MSG_SET_DEBUG_STATE = 0x15,
- USERUI_MSG_SPACE = 0x18,
- USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A,
- USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B,
- USERUI_MSG_GET_LOGLEVEL = 0x1C,
- USERUI_MSG_SET_LOGLEVEL = 0x1D,
- USERUI_MSG_PRINTK = 0x1E,
-
- /* Kernel -> Userspace */
- USERUI_MSG_MESSAGE = 0x21,
- USERUI_MSG_PROGRESS = 0x22,
- USERUI_MSG_POST_ATOMIC_RESTORE = 0x25,
-
- USERUI_MSG_MAX,
-};
-
-struct userui_msg_params {
- u32 a, b, c, d;
- char text[255];
-};
-
-struct ui_ops {
- char (*wait_for_key) (int timeout);
- u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...);
- void (*prepare_status) (int clearbar, const char *fmt, ...);
- void (*cond_pause) (int pause, char *message);
- void (*abort)(int result_code, const char *fmt, ...);
- void (*prepare)(void);
- void (*cleanup)(void);
- void (*message)(u32 section, u32 level, u32 normally_logged,
- const char *fmt, ...);
-};
-
-extern struct ui_ops *toi_current_ui;
-
-#define toi_update_status(val, max, fmt, args...) \
- (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \
- max)
-
-#define toi_prepare_console(void) \
- do { if (toi_current_ui) \
- (toi_current_ui->prepare)(); \
- } while (0)
-
-#define toi_cleanup_console(void) \
- do { if (toi_current_ui) \
- (toi_current_ui->cleanup)(); \
- } while (0)
-
-#define abort_hibernate(result, fmt, args...) \
- do { if (toi_current_ui) \
- (toi_current_ui->abort)(result, fmt, ##args); \
- else { \
- set_abort_result(result); \
- } \
- } while (0)
-
-#define toi_cond_pause(pause, message) \
- do { if (toi_current_ui) \
- (toi_current_ui->cond_pause)(pause, message); \
- } while (0)
-
-#define toi_prepare_status(clear, fmt, args...) \
- do { if (toi_current_ui) \
- (toi_current_ui->prepare_status)(clear, fmt, ##args); \
- else \
- printk(KERN_INFO fmt "%s", ##args, "\n"); \
- } while (0)
-
-#define toi_message(sn, lev, log, fmt, a...) \
-do { \
- if (toi_current_ui && (!sn || test_debug_state(sn))) \
- toi_current_ui->message(sn, lev, log, fmt, ##a); \
-} while (0)
-
-__exit void toi_ui_cleanup(void);
-extern int toi_ui_init(void);
-extern void toi_ui_exit(void);
-extern int toi_register_ui_ops(struct ui_ops *this_ui);
-extern void toi_remove_ui_ops(struct ui_ops *this_ui);
diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c
deleted file mode 100644
index 6aa5ac3eb..000000000
--- a/kernel/power/tuxonice_userui.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * kernel/power/user_ui.c
- *
- * Copyright (C) 2005-2007 Bernard Blackham
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for TuxOnIce's user interface.
- *
- * The user interface code talks to a userspace program via a
- * netlink socket.
- *
- * The kernel side:
- * - starts the userui program;
- * - sends text messages and progress bar status;
- *
- * The user space side:
- * - passes messages regarding user requests (abort, toggle reboot etc)
- *
- */
-
-#define __KERNEL_SYSCALLS__
-
-#include <linux/suspend.h>
-#include <linux/freezer.h>
-#include <linux/console.h>
-#include <linux/ctype.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>
-#include <linux/reboot.h>
-#include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/vt.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_power_off.h"
-
-static char local_printf_buf[1024]; /* Same as printk - should be safe */
-
-static struct user_helper_data ui_helper_data;
-static struct toi_module_ops userui_ops;
-static int orig_kmsg;
-
-static char lastheader[512];
-static int lastheader_message_len;
-static int ui_helper_changed; /* Used at resume-time so don't overwrite value
- set from initrd/ramfs. */
-
-/* Number of distinct progress amounts that userspace can display */
-static int progress_granularity = 30;
-
-static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key);
-static int userui_wait_should_wake;
-
-#define toi_stop_waiting_for_userui_key() \
-{ \
- userui_wait_should_wake = true; \
- wake_up_interruptible(&userui_wait_for_key); \
-}
-
-/**
- * ui_nl_set_state - Update toi_action based on a message from userui.
- *
- * @n: The bit (1 << bit) to set.
- */
-static void ui_nl_set_state(int n)
-{
- /* Only let them change certain settings */
- static const u32 toi_action_mask =
- (1 << TOI_REBOOT) | (1 << TOI_PAUSE) |
- (1 << TOI_LOGALL) |
- (1 << TOI_SINGLESTEP) |
- (1 << TOI_PAUSE_NEAR_PAGESET_END);
- static unsigned long new_action;
-
- new_action = (toi_bkd.toi_action & (~toi_action_mask)) |
- (n & toi_action_mask);
-
- printk(KERN_DEBUG "n is %x. Action flags being changed from %lx "
- "to %lx.", n, toi_bkd.toi_action, new_action);
- toi_bkd.toi_action = new_action;
-
- if (!test_action_state(TOI_PAUSE) &&
- !test_action_state(TOI_SINGLESTEP))
- toi_stop_waiting_for_userui_key();
-}
-
-/**
- * userui_post_atomic_restore - Tell userui that atomic restore just happened.
- *
- * Tell userui that atomic restore just occurred, so that it can do things like
- * redrawing the screen, re-getting settings and so on.
- */
-static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0);
-}
-
-/**
- * userui_storage_needed - Report how much memory in image header is needed.
- */
-static int userui_storage_needed(void)
-{
- return sizeof(ui_helper_data.program) + 1 + sizeof(int);
-}
-
-/**
- * userui_save_config_info - Fill buffer with config info for image header.
- *
- * @buf: Buffer into which to put the config info we want to save.
- */
-static int userui_save_config_info(char *buf)
-{
- *((int *) buf) = progress_granularity;
- memcpy(buf + sizeof(int), ui_helper_data.program,
- sizeof(ui_helper_data.program));
- return sizeof(ui_helper_data.program) + sizeof(int) + 1;
-}
-
-/**
- * userui_load_config_info - Restore config info from buffer.
- *
- * @buf: Buffer containing header info loaded.
- * @size: Size of data loaded for this module.
- */
-static void userui_load_config_info(char *buf, int size)
-{
- progress_granularity = *((int *) buf);
- size -= sizeof(int);
-
- /* Don't load the saved path if one has already been set */
- if (ui_helper_changed)
- return;
-
- if (size > sizeof(ui_helper_data.program))
- size = sizeof(ui_helper_data.program);
-
- memcpy(ui_helper_data.program, buf + sizeof(int), size);
- ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0';
-}
-
-/**
- * set_ui_program_set: Record that userui program was changed.
- *
- * Side effect routine for when the userui program is set. In an initrd or
- * ramfs, the user may set a location for the userui program. If this happens,
- * we don't want to reload the value that was saved in the image header. This
- * routine allows us to flag that we shouldn't restore the program name from
- * the image header.
- */
-static void set_ui_program_set(void)
-{
- ui_helper_changed = 1;
-}
-
-/**
- * userui_memory_needed - Tell core how much memory to reserve for us.
- */
-static int userui_memory_needed(void)
-{
-	/* ballpark figure of 128 pages */
- return 128 * PAGE_SIZE;
-}
-
-/**
- * userui_update_status - Update the progress bar and (if on) in-bar message.
- *
- * @value: Current progress percentage numerator.
- * @maximum: Current progress percentage denominator.
- * @fmt: Message to be displayed in the middle of the progress bar.
- *
- * Note that a NULL message does not mean that any previous message is erased!
- * For that, you need toi_prepare_status with clearbar on.
- *
- * Returns an unsigned long, being the next numerator (as determined by the
- * maximum and progress granularity) where status needs to be updated.
- * This is to reduce unnecessary calls to update_status.
- */
-static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...)
-{
- static u32 last_step = 9999;
- struct userui_msg_params msg;
- u32 this_step, next_update;
- int bitshift;
-
- if (ui_helper_data.pid == -1)
- return 0;
-
- if ((!maximum) || (!progress_granularity))
- return maximum;
-
- if (value > maximum)
- value = maximum;
-
- /* Try to avoid math problems - we can't do 64 bit math here
- * (and shouldn't need it - anyone got screen resolution
- * of 65536 pixels or more?) */
- bitshift = fls(maximum) - 16;
- if (bitshift > 0) {
- u32 temp_maximum = maximum >> bitshift;
- u32 temp_value = value >> bitshift;
- this_step = (u32)
- (temp_value * progress_granularity / temp_maximum);
- next_update = (((this_step + 1) * temp_maximum /
- progress_granularity) + 1) << bitshift;
- } else {
- this_step = (u32) (value * progress_granularity / maximum);
- next_update = ((this_step + 1) * maximum /
- progress_granularity) + 1;
- }
-
- if (this_step == last_step)
- return next_update;
-
- memset(&msg, 0, sizeof(msg));
-
- msg.a = this_step;
- msg.b = progress_granularity;
-
- if (fmt) {
- va_list args;
- va_start(args, fmt);
- vsnprintf(msg.text, sizeof(msg.text), fmt, args);
- va_end(args);
- msg.text[sizeof(msg.text)-1] = '\0';
- }
-
- toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS,
- &msg, sizeof(msg));
- last_step = this_step;
-
- return next_update;
-}
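
The pre-shifting above keeps value * progress_granularity within 32 bits
without resorting to 64-bit division. A worked example with made-up numbers:

	/*
	 * maximum = 1,000,000 pages, value = 500,000,
	 * progress_granularity = 30:
	 *
	 *	fls(1000000) = 20, so bitshift = 4
	 *	temp_maximum = 1000000 >> 4 = 62500
	 *	temp_value   =  500000 >> 4 = 31250
	 *	this_step    = 31250 * 30 / 62500 = 15
	 *	next_update  = ((16 * 62500 / 30) + 1) << 4 = 533344
	 *
	 * so the caller needn't report again until ~533k pages.
	 */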
-
-/**
- * userui_message - Display a message without necessarily logging it.
- *
- * @section: Type of message. Messages can be filtered by type.
- * @level: Degree of importance of the message. Lower values = higher priority.
- * @normally_logged: Whether logged even if log_everything is off.
- * @fmt: Message (and parameters).
- *
- * This function is intended to do the same job as printk, but without normally
- * logging what is printed. The point is to be able to get debugging info on
- * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M"
- *
- * It may be called from an interrupt context - can't sleep!
- */
-static void userui_message(u32 section, u32 level, u32 normally_logged,
- const char *fmt, ...)
-{
- struct userui_msg_params msg;
-
- if ((level) && (level > console_loglevel))
- return;
-
- memset(&msg, 0, sizeof(msg));
-
- msg.a = section;
- msg.b = level;
- msg.c = normally_logged;
-
- if (fmt) {
- va_list args;
- va_start(args, fmt);
- vsnprintf(msg.text, sizeof(msg.text), fmt, args);
- va_end(args);
- msg.text[sizeof(msg.text)-1] = '\0';
- }
-
- if (test_action_state(TOI_LOGALL))
- printk(KERN_INFO "%s\n", msg.text);
-
- toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE,
- &msg, sizeof(msg));
-}
-
-/**
- * wait_for_key_via_userui - Wait for userui to receive a keypress.
- */
-static void wait_for_key_via_userui(void)
-{
-	/* wait_event_interruptible() manages the waitqueue entry and the
-	 * task state itself, so no manual add_wait_queue() dance is needed. */
-	wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake);
-	userui_wait_should_wake = false;
-}
-
-/**
- * userui_prepare_status - Display high level messages.
- *
- * @clearbar: Whether to clear the progress bar.
- * @fmt...: New message for the title.
- *
- * Prepare the 'nice display', drawing the header and version, along with the
- * current action and perhaps also resetting the progress bar.
- */
-static void userui_prepare_status(int clearbar, const char *fmt, ...)
-{
- va_list args;
-
- if (fmt) {
- va_start(args, fmt);
- lastheader_message_len = vsnprintf(lastheader, 512, fmt, args);
- va_end(args);
- }
-
- if (clearbar)
- toi_update_status(0, 1, NULL);
-
- if (ui_helper_data.pid == -1)
- printk(KERN_EMERG "%s\n", lastheader);
- else
- toi_message(0, TOI_STATUS, 1, lastheader, NULL);
-}
-
-/**
- * toi_wait_for_keypress - Wait for keypress via userui.
- *
- * @timeout: Maximum time to wait.
- *
- * Wait for a keypress from userui.
- *
- * FIXME: Implement timeout?
- */
-static char userui_wait_for_keypress(int timeout)
-{
- char key = '\0';
-
- if (ui_helper_data.pid != -1) {
- wait_for_key_via_userui();
- key = ' ';
- }
-
- return key;
-}
-
-/**
- * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it.
- *
- * @result_code: Reason why we're aborting (1 << bit).
- * @fmt: Message to display if telling the user what's going on.
- *
- * Abort a cycle. If this wasn't at the user's request (and we're displaying
- * output), tell the user why and wait for them to acknowledge the message.
- */
-static void userui_abort_hibernate(int result_code, const char *fmt, ...)
-{
- va_list args;
- int printed_len = 0;
-
- set_result_state(result_code);
-
- if (test_result_state(TOI_ABORTED))
- return;
-
- set_result_state(TOI_ABORTED);
-
- if (test_result_state(TOI_ABORT_REQUESTED))
- return;
-
- va_start(args, fmt);
- printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf),
- fmt, args);
- va_end(args);
- if (ui_helper_data.pid != -1)
- printed_len = sprintf(local_printf_buf + printed_len,
- " (Press SPACE to continue)");
-
- toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf);
-
- if (ui_helper_data.pid != -1)
- userui_wait_for_keypress(0);
-}
-
-/**
- * request_abort_hibernate - Abort hibernating or resuming at user request.
- *
- * Handle the user requesting the cancellation of a hibernation or resume by
- * pressing escape.
- */
-static void request_abort_hibernate(void)
-{
- if (test_result_state(TOI_ABORT_REQUESTED) ||
- !test_action_state(TOI_CAN_CANCEL))
- return;
-
- if (test_toi_state(TOI_NOW_RESUMING)) {
- toi_prepare_status(CLEAR_BAR, "Escape pressed. "
- "Powering down again.");
- set_toi_state(TOI_STOP_RESUME);
- while (!test_toi_state(TOI_IO_STOPPED))
- schedule();
- if (toiActiveAllocator->mark_resume_attempted)
- toiActiveAllocator->mark_resume_attempted(0);
- toi_power_down();
- }
-
- toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :"
- " ABORTING HIBERNATION ---");
- set_abort_result(TOI_ABORT_REQUESTED);
- toi_stop_waiting_for_userui_key();
-}
-
-/**
- * userui_user_rcv_msg - Receive a netlink message from userui.
- *
- * @skb: skb received.
- * @nlh: Netlink header received.
- */
-static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
- int type;
- int *data;
-
- type = nlh->nlmsg_type;
-
-	/* A control message: ignore it */
- if (type < NETLINK_MSG_BASE)
- return 0;
-
- /* Unknown message: reply with EINVAL */
- if (type >= USERUI_MSG_MAX)
- return -EINVAL;
-
- /* All operations require privileges, even GET */
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- /* Only allow one task to receive NOFREEZE privileges */
- if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) {
- printk(KERN_INFO "Got NOFREEZE_ME request when "
- "ui_helper_data.pid is %d.\n", ui_helper_data.pid);
- return -EBUSY;
- }
-
- data = (int *) NLMSG_DATA(nlh);
-
- switch (type) {
- case USERUI_MSG_ABORT:
- request_abort_hibernate();
- return 0;
- case USERUI_MSG_GET_STATE:
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_GET_STATE, &toi_bkd.toi_action,
- sizeof(toi_bkd.toi_action));
- return 0;
- case USERUI_MSG_GET_DEBUG_STATE:
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_GET_DEBUG_STATE,
- &toi_bkd.toi_debug_state,
- sizeof(toi_bkd.toi_debug_state));
- return 0;
- case USERUI_MSG_SET_STATE:
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
- return -EINVAL;
- ui_nl_set_state(*data);
- return 0;
- case USERUI_MSG_SET_DEBUG_STATE:
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
- return -EINVAL;
- toi_bkd.toi_debug_state = (*data);
- return 0;
- case USERUI_MSG_SPACE:
- toi_stop_waiting_for_userui_key();
- return 0;
- case USERUI_MSG_GET_POWERDOWN_METHOD:
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_GET_POWERDOWN_METHOD,
- &toi_poweroff_method,
- sizeof(toi_poweroff_method));
- return 0;
- case USERUI_MSG_SET_POWERDOWN_METHOD:
- if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char)))
- return -EINVAL;
- toi_poweroff_method = (unsigned long)(*data);
- return 0;
- case USERUI_MSG_GET_LOGLEVEL:
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_GET_LOGLEVEL,
- &toi_bkd.toi_default_console_level,
- sizeof(toi_bkd.toi_default_console_level));
- return 0;
- case USERUI_MSG_SET_LOGLEVEL:
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
- return -EINVAL;
- toi_bkd.toi_default_console_level = (*data);
- return 0;
- case USERUI_MSG_PRINTK:
- printk(KERN_INFO "%s", (char *) data);
- return 0;
- }
-
- /* Unhandled here */
- return 1;
-}
-
-/**
- * userui_cond_pause - Possibly pause at user request.
- *
- * @pause: Whether to pause or just display the message.
- * @message: Message to display at the start of pausing.
- *
- * Potentially pause and wait for the user to tell us to continue. We normally
- * only pause when @pause is set. While paused, the user can do things such as
- * changing the loglevel or toggling the display of debugging sections.
- */
-static void userui_cond_pause(int pause, char *message)
-{
- int displayed_message = 0, last_key = 0;
-
- while (last_key != 32 &&
- ui_helper_data.pid != -1 &&
- ((test_action_state(TOI_PAUSE) && pause) ||
- (test_action_state(TOI_SINGLESTEP)))) {
- if (!displayed_message) {
- toi_prepare_status(DONT_CLEAR_BAR,
- "%s Press SPACE to continue.%s",
- message ? message : "",
- (test_action_state(TOI_SINGLESTEP)) ?
- " Single step on." : "");
- displayed_message = 1;
- }
- last_key = userui_wait_for_keypress(0);
- }
- schedule();
-}
-
-/**
- * userui_prepare_console - Prepare the console for use.
- *
- * Prepare a console for use, saving current kmsg settings and attempting to
- * start userui. Console loglevel changes are handled by userui.
- */
-static void userui_prepare_console(void)
-{
- orig_kmsg = vt_kmsg_redirect(fg_console + 1);
-
- ui_helper_data.pid = -1;
-
- if (!userui_ops.enabled) {
- printk(KERN_INFO "TuxOnIce: Userui disabled.\n");
- return;
- }
-
- if (*ui_helper_data.program)
- toi_netlink_setup(&ui_helper_data);
- else
- printk(KERN_INFO "TuxOnIce: Userui program not configured.\n");
-}
-
-/**
- * userui_cleanup_console - Clean up after a cycle.
- *
- * Tell userui to clean up, and restore kmsg_redirect to its original value.
- */
-
-static void userui_cleanup_console(void)
-{
- if (ui_helper_data.pid > -1)
- toi_netlink_close(&ui_helper_data);
-
- vt_kmsg_redirect(orig_kmsg);
-}
-
-/*
- * User interface specific /sys/power/tuxonice entries.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
-#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
- SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action,
- TOI_CAN_CANCEL, 0),
- SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action,
- TOI_PAUSE, 0),
- SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL),
- SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1,
- 2048, 0, NULL),
- SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0,
- set_ui_program_set),
- SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL)
-#endif
-};
-
-static struct toi_module_ops userui_ops = {
- .type = MISC_MODULE,
- .name = "userui",
- .shared_directory = "user_interface",
- .module = THIS_MODULE,
- .storage_needed = userui_storage_needed,
- .save_config_info = userui_save_config_info,
- .load_config_info = userui_load_config_info,
- .memory_needed = userui_memory_needed,
- .post_atomic_restore = userui_post_atomic_restore,
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-static struct ui_ops my_ui_ops = {
- .update_status = userui_update_status,
- .message = userui_message,
- .prepare_status = userui_prepare_status,
- .abort = userui_abort_hibernate,
- .cond_pause = userui_cond_pause,
- .prepare = userui_prepare_console,
- .cleanup = userui_cleanup_console,
- .wait_for_key = userui_wait_for_keypress,
-};
-
-/**
- * toi_user_ui_init - Boot time initialisation for user interface.
- *
- * Invoked from the core init routine.
- */
-static __init int toi_user_ui_init(void)
-{
- int result;
-
- ui_helper_data.nl = NULL;
- strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255);
- ui_helper_data.pid = -1;
- ui_helper_data.skb_size = sizeof(struct userui_msg_params);
- ui_helper_data.pool_limit = 6;
- ui_helper_data.netlink_id = NETLINK_TOI_USERUI;
- ui_helper_data.name = "userspace ui";
- ui_helper_data.rcv_msg = userui_user_rcv_msg;
- ui_helper_data.interface_version = 8;
- ui_helper_data.must_init = 0;
- ui_helper_data.not_ready = userui_cleanup_console;
- init_completion(&ui_helper_data.wait_for_process);
- result = toi_register_module(&userui_ops);
- if (!result) {
- result = toi_register_ui_ops(&my_ui_ops);
- if (result)
- toi_unregister_module(&userui_ops);
- }
-
- return result;
-}
-
-late_initcall(toi_user_ui_init);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8362f1979..af4e6968c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -33,7 +33,6 @@
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
-#include <linux/suspend.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
#include <linux/ratelimit.h>
@@ -49,6 +48,7 @@
#include <linux/uio.h>
#include <asm/uaccess.h>
+#include <asm-generic/sections.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -233,7 +233,11 @@ struct printk_log {
u8 facility; /* syslog facility */
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
-};
+}
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+__packed __aligned(4)
+#endif
+;
/*
* The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
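The annotation added above does not change the record layout: the fields of
struct printk_log (one u64, three u16s, a u8, and a 5+3-bit packed u8) already
fill exactly 16 bytes. What __packed __aligned(4) changes is the structure's
required alignment, from 8 down to 4 on CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
targets, which is what allows the LOG_ALIGN simplification in the next hunk.
A compile-time sketch (illustrative only; assumes a typical 64-bit ABI and
models the two bitfields as a single u8):

    #include <stdint.h>

    struct log_natural {
            uint64_t ts_nsec;
            uint16_t len, text_len, dict_len;
            uint8_t  facility, flags_level;         /* flags:5 + level:3 */
    };

    struct log_packed {
            uint64_t ts_nsec;
            uint16_t len, text_len, dict_len;
            uint8_t  facility, flags_level;
    } __attribute__((packed, aligned(4)));

    _Static_assert(sizeof(struct log_natural) == 16, "no padding to remove");
    _Static_assert(sizeof(struct log_packed) == 16, "size is unchanged");
    _Static_assert(_Alignof(struct log_natural) == 8, "u64 forces 8");
    _Static_assert(_Alignof(struct log_packed) == 4, "records now 4-aligned");

Because ring-buffer records are rounded up to LOG_ALIGN, the smaller alignment
wastes less space between variable-length records.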
@@ -274,30 +278,12 @@ static u32 clear_idx;
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-#define LOG_ALIGN 4
-#else
#define LOG_ALIGN __alignof__(struct printk_log)
-#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
-#ifdef CONFIG_TOI_INCREMENTAL
-void toi_set_logbuf_untracked(void)
-{
- int i;
- struct page *log_buf_start_page = virt_to_page(__log_buf);
-
- printk("Not protecting kernel printk log buffer (%p-%p).\n",
- __log_buf, __log_buf + __LOG_BUF_LEN);
-
- for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++)
- SetPageTOI_Untracked(log_buf_start_page + i);
-}
-#endif
-
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -1675,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *dict, size_t dictlen,
const char *fmt, va_list args)
{
- static int recursion_bug;
+ static bool recursion_bug;
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
size_t text_len = 0;
@@ -1711,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* it can be printed at the next appropriate moment:
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
- recursion_bug = 1;
+ recursion_bug = true;
local_irq_restore(flags);
return 0;
}
@@ -1726,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level,
static const char recursion_msg[] =
"BUG: recent printk recursion!";
- recursion_bug = 0;
+ recursion_bug = false;
/* emit KERN_CRIT message */
printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
NULL, 0, recursion_msg,
@@ -2706,13 +2692,36 @@ int unregister_console(struct console *console)
}
EXPORT_SYMBOL(unregister_console);
+/*
+ * Some boot consoles access data that is in the init section and which will
+ * be discarded after the initcalls have been run. To make sure that no code
+ * will access this data, unregister the boot consoles in a late initcall.
+ *
+ * If for some reason, such as deferred probe or the driver being a loadable
+ * module, the real console hasn't registered yet at this point, there will
+ * be a brief interval in which no messages are logged to the console, which
+ * makes it difficult to diagnose problems that occur during this time.
+ *
+ * To mitigate this problem somewhat, only unregister consoles whose memory
+ * intersects with the init section. Note that code exists elsewhere to get
+ * rid of the boot console as soon as the proper console shows up, so there
+ * won't be side-effects from postponing the removal.
+ */
static int __init printk_late_init(void)
{
struct console *con;
for_each_console(con) {
if (!keep_bootcon && con->flags & CON_BOOT) {
- unregister_console(con);
+ /*
+ * Make sure to unregister boot consoles whose data
+ * resides in the init section before the init section
+ * is discarded. Boot consoles whose data will stick
+ * around will automatically be unregistered when the
+ * proper console replaces them.
+ */
+ if (init_section_intersects(con, sizeof(*con)))
+ unregister_console(con);
}
}
hotcpu_notifier(console_cpu_notify, 0);
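For reference, init_section_intersects() comes from <asm-generic/sections.h>,
newly included at the top of this file's changes, and is in essence an overlap
test against the [__init_begin, __init_end) region. A sketch of the idea, not
the exact in-tree helper:

    extern char __init_begin[], __init_end[];

    /* True if any byte of the object falls inside the init section. */
    static inline bool intersects_init_sketch(const void *virt, size_t size)
    {
            const char *p = virt;

            return p < __init_end && p + size > __init_begin;
    }

Applied to a boot console, the object tested is the struct console itself: one
living in __initdata must be unregistered before init memory is freed, while
one in ordinary data can safely wait for the real console to replace it.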
diff --git a/kernel/profile.c b/kernel/profile.c
index 99513e116..513696974 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -59,6 +59,7 @@ int profile_setup(char *str)
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
+ force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;
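Background for the one-line addition: SLEEP_PROFILING samples the time tasks
spend in uninterruptible sleep, which is only recorded when schedstats are
enabled, so the profiler now forces them on instead of silently collecting
empty data. Usage is unchanged; for example, booting with

    profile=sleep,2

enables sleep profiling, with the optional granularity argument parsed just
after the lines shown.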
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 3189e51db..2341efe7f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -387,8 +387,14 @@ unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
if (!retval) {
- wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
- TASK_UNINTERRUPTIBLE);
+ /*
+ * We do not bother to change retval or clear JOBCTL_TRAPPING
+ * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+ * not return to user-mode, it will exit and clear this bit in
+ * __ptrace_unlink() if it wasn't already cleared by the tracee;
+ * and until then nobody can ptrace this task.
+ */
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
proc_ptrace_connector(task, PTRACE_ATTACH);
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d89328e26..d2988d047 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -162,6 +162,27 @@ static int rcu_torture_writer_state;
#define RTWS_SYNC 7
#define RTWS_STUTTER 8
#define RTWS_STOPPING 9
+static const char * const rcu_torture_writer_state_names[] = {
+ "RTWS_FIXED_DELAY",
+ "RTWS_DELAY",
+ "RTWS_REPLACE",
+ "RTWS_DEF_FREE",
+ "RTWS_EXP_SYNC",
+ "RTWS_COND_GET",
+ "RTWS_COND_SYNC",
+ "RTWS_SYNC",
+ "RTWS_STUTTER",
+ "RTWS_STOPPING",
+};
+
+static const char *rcu_torture_writer_state_getname(void)
+{
+ unsigned int i = READ_ONCE(rcu_torture_writer_state);
+
+ if (i >= ARRAY_SIZE(rcu_torture_writer_state_names))
+ return "???";
+ return rcu_torture_writer_state_names[i];
+}
#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
#define RCUTORTURE_RUNNABLE_INIT 1
@@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void)
rcutorture_get_gp_data(cur_ops->ttype,
&flags, &gpnum, &completed);
- pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n",
+ rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
gpnum, completed, flags);
show_rcu_gp_kthreads();
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index a63a1ea5a..9b9cdd549 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, rcu_gp_is_expedited()
+ __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
: SYNCHRONIZE_SRCU_TRYCOUNT);
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f07343b54..9fd5b628a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree");
/* Data structures. */
-static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
-static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
-static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
-
/*
* In order to export the rcu_state name to the tracing tools, it
* needs to be added in the __tracepoint_string section.
@@ -246,24 +242,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/
void rcu_sched_qs(void)
{
- unsigned long flags;
-
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
- trace_rcu_grace_period(TPS("rcu_sched"),
- __this_cpu_read(rcu_sched_data.gpnum),
- TPS("cpuqs"));
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
- return;
- local_irq_save(flags);
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(&rcu_sched_state,
- this_cpu_ptr(&rcu_sched_data),
- true);
- }
- local_irq_restore(flags);
- }
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
+ return;
+ trace_rcu_grace_period(TPS("rcu_sched"),
+ __this_cpu_read(rcu_sched_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+ return;
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data), true);
}
void rcu_bh_qs(void)
@@ -300,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
* We inform the RCU core by emulating a zero-duration dyntick-idle
* period, which we in turn do by incrementing the ->dynticks counter
* by two.
+ *
+ * The caller must have disabled interrupts.
*/
static void rcu_momentary_dyntick_idle(void)
{
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp;
int resched_mask;
struct rcu_state *rsp;
- local_irq_save(flags);
-
/*
* Yes, we can lose flag-setting operations. This is OK, because
* the flag will be set again after some delay.
@@ -340,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void)
smp_mb__after_atomic(); /* Later stuff after QS. */
break;
}
- local_irq_restore(flags);
}
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
- * The caller must have disabled preemption.
+ * The caller must have disabled interrupts.
*/
void rcu_note_context_switch(void)
{
@@ -376,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
*/
void rcu_all_qs(void)
{
+ unsigned long flags;
+
barrier(); /* Avoid RCU read-side critical sections leaking down. */
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
+ local_irq_save(flags);
rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ }
this_cpu_inc(rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
@@ -605,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
* The caller must have disabled interrupts to prevent races with
* normal callback registry.
*/
-static int
+static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
int i;
if (rcu_gp_in_progress(rsp))
- return 0; /* No, a grace period is already in progress. */
+ return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
- return 1; /* Yes, a no-CBs CPU needs one. */
+ return true; /* Yes, a no-CBs CPU needs one. */
if (!rdp->nxttail[RCU_NEXT_TAIL])
- return 0; /* No, this is a no-CBs (or offline) CPU. */
+ return false; /* No, this is a no-CBs (or offline) CPU. */
if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
- return 1; /* Yes, this CPU has newly registered callbacks. */
+ return true; /* Yes, CPU has newly registered callbacks. */
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
ULONG_CMP_LT(READ_ONCE(rsp->completed),
rdp->nxtcompleted[i]))
- return 1; /* Yes, CBs for future grace period. */
- return 0; /* No grace period needed. */
+ return true; /* Yes, CBs for future grace period. */
+ return false; /* No grace period needed. */
}
/*
@@ -740,7 +732,7 @@ void rcu_user_enter(void)
*
* Exit from an interrupt handler, which might possibly result in entering
* idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur.
+ * sections can occur. The caller must have disabled interrupts.
*
* This code assumes that the idle loop never does anything that might
* result in unbalanced calls to irq_enter() and irq_exit(). If your
@@ -753,11 +745,10 @@ void rcu_user_enter(void)
*/
void rcu_irq_exit(void)
{
- unsigned long flags;
long long oldval;
struct rcu_dynticks *rdtp;
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
@@ -768,6 +759,17 @@ void rcu_irq_exit(void)
else
rcu_eqs_enter_common(oldval, true);
rcu_sysidle_enter(1);
+}
+
+/*
+ * Wrapper for rcu_irq_exit() where interrupts are enabled.
+ */
+void rcu_irq_exit_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_irq_exit();
local_irq_restore(flags);
}
@@ -865,7 +867,7 @@ void rcu_user_exit(void)
*
* Enter an interrupt handler, which might possibly result in exiting
* idle mode, in other words, entering the mode in which read-side critical
- * sections can occur.
+ * sections can occur. The caller must have disabled interrupts.
*
* Note that the Linux kernel is fully capable of entering an interrupt
* handler that it never exits, for example when doing upcalls to
@@ -881,11 +883,10 @@ void rcu_user_exit(void)
*/
void rcu_irq_enter(void)
{
- unsigned long flags;
struct rcu_dynticks *rdtp;
long long oldval;
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
@@ -896,6 +897,17 @@ void rcu_irq_enter(void)
else
rcu_eqs_exit_common(oldval, true);
rcu_sysidle_exit(1);
+}
+
+/*
+ * Wrapper for rcu_irq_enter() where interrupts are enabled.
+ */
+void rcu_irq_enter_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_irq_enter();
local_irq_restore(flags);
}
@@ -1187,6 +1199,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
}
/*
+ * Convert a ->gp_state value to a character string.
+ */
+static const char *gp_state_getname(short gs)
+{
+ if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
+ return "???";
+ return gp_state_names[gs];
+}
+
+/*
* Complain about starvation of grace-period kthread.
*/
static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
@@ -1196,12 +1218,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
- if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
+ if (j - gpa > 2 * HZ) {
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
rsp->name, j - gpa,
rsp->gpnum, rsp->completed,
- rsp->gp_flags, rsp->gp_state,
- rsp->gp_kthread ? rsp->gp_kthread->state : 0);
+ rsp->gp_flags,
+ gp_state_getname(rsp->gp_state), rsp->gp_state,
+ rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
+ if (rsp->gp_kthread)
+ sched_show_task(rsp->gp_kthread);
+ }
}
/*
@@ -1214,7 +1240,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask != 0) {
for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
if (rnp->qsmask & (1UL << cpu))
@@ -1237,7 +1263,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
/* Only let one CPU complain about others per time interval. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
delta = jiffies - READ_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1256,7 +1282,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
rsp->name);
print_cpu_stall_info_begin();
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
ndetected += rcu_print_task_stall(rnp);
if (rnp->qsmask != 0) {
for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -1327,7 +1353,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
rcu_dump_cpu_stacks(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
WRITE_ONCE(rsp->jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
@@ -1534,10 +1560,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* hold it, acquire the root rcu_node structure's lock in order to
* start one (if needed).
*/
- if (rnp != rnp_root) {
- raw_spin_lock(&rnp_root->lock);
- smp_mb__after_unlock_lock();
- }
+ if (rnp != rnp_root)
+ raw_spin_lock_rcu_node(rnp_root);
/*
* Get a new grace-period number. If there really is no grace
@@ -1590,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
int needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- rcu_nocb_gp_cleanup(rsp, rnp);
rnp->need_future_gp[c & 0x1] = 0;
needmore = rnp->need_future_gp[(c + 1) & 0x1];
trace_rcu_future_gp(rnp, rdp, c,
@@ -1611,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
!READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
- wake_up(&rsp->gp_wq);
+ swake_up(&rsp->gp_wq);
}
/*
@@ -1786,11 +1809,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
rdp->completed == READ_ONCE(rnp->completed) &&
!unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
- !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+ !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
local_irq_restore(flags);
return;
}
- smp_mb__after_unlock_lock();
needwake = __note_gp_changes(rsp, rnp, rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (needwake)
@@ -1805,21 +1827,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay)
}
/*
- * Initialize a new grace period. Return 0 if no grace period required.
+ * Initialize a new grace period. Return false if no grace period required.
*/
-static int rcu_gp_init(struct rcu_state *rsp)
+static bool rcu_gp_init(struct rcu_state *rsp)
{
unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
if (!READ_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq(&rnp->lock);
- return 0;
+ return false;
}
WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
@@ -1829,7 +1850,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
* Not supposed to be able to happen.
*/
raw_spin_unlock_irq(&rnp->lock);
- return 0;
+ return false;
}
/* Advance to a new grace period and initialize state. */
@@ -1847,8 +1868,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
*/
rcu_for_each_leaf_node(rsp, rnp) {
rcu_gp_slow(rsp, gp_preinit_delay);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
@@ -1904,8 +1924,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
rcu_gp_slow(rsp, gp_init_delay);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
@@ -1923,7 +1942,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
WRITE_ONCE(rsp->gp_activity, jiffies);
}
- return 1;
+ return true;
}
/*
@@ -1973,8 +1992,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
}
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
WRITE_ONCE(rsp->gp_flags,
READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq(&rnp->lock);
@@ -1991,10 +2009,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
+ struct swait_queue_head *sq;
WRITE_ONCE(rsp->gp_activity, jiffies);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
gp_duration = jiffies - rsp->gp_start;
if (gp_duration > rsp->gp_max)
rsp->gp_max = gp_duration;
@@ -2019,8 +2037,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
* grace period is recorded in any of the rcu_node structures.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->completed, rsp->gpnum);
@@ -2029,14 +2046,15 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
+ sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq(&rnp->lock);
+ rcu_nocb_gp_cleanup(sq);
cond_resched_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
rcu_gp_slow(rsp, gp_cleanup_delay);
}
rnp = rcu_get_root(rsp);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
+ raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
rcu_nocb_gp_set(rnp, nocb);
/* Declare grace period done. */
@@ -2076,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- wait_event_interruptible(rsp->gp_wq,
+ swait_event_interruptible(rsp->gp_wq,
READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
@@ -2106,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = wait_event_interruptible_timeout(rsp->gp_wq,
+ ret = swait_event_interruptible_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
@@ -2230,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
- rcu_gp_kthread_wake(rsp);
+ swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
}
/*
@@ -2284,8 +2302,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
raw_spin_unlock_irqrestore(&rnp->lock, flags);
rnp_c = rnp;
rnp = rnp->parent;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
oldmask = rnp_c->qsmask;
}
@@ -2332,8 +2349,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
gps = rnp->gpnum;
mask = rnp->grpmask;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
}
@@ -2355,8 +2371,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
struct rcu_node *rnp;
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if ((rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
@@ -2582,8 +2597,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
rnp = rnp->parent;
if (!rnp)
break;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock(); /* GP memory ordering. */
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
rnp->qsmaskinit &= ~mask;
rnp->qsmask &= ~mask;
if (rnp->qsmaskinit) {
@@ -2611,8 +2625,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rnp->qsmaskinitnext &= ~mask;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -2809,8 +2822,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
rcu_for_each_leaf_node(rsp, rnp) {
cond_resched_rcu_qs();
mask = 0;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
if (rcu_state_p == &rcu_sched_state ||
rsp != rcu_state_p ||
@@ -2881,8 +2893,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
/* rnp_old == rcu_get_root(rsp), rnp == NULL. */
/* Reached the root of the rcu_node tree, acquire lock. */
- raw_spin_lock_irqsave(&rnp_old->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
raw_spin_unlock(&rnp_old->fqslock);
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
rsp->n_force_qs_lh++;
@@ -2891,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
}
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
- rcu_gp_kthread_wake(rsp);
+ swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
}
/*
@@ -2914,7 +2925,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
/* Does this CPU require a not-yet-started grace period? */
local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
+ raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
needwake = rcu_start_gp(rsp);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
if (needwake)
@@ -3005,8 +3016,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
if (!rcu_gp_in_progress(rsp)) {
struct rcu_node *rnp_root = rcu_get_root(rsp);
- raw_spin_lock(&rnp_root->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp_root);
needwake = rcu_start_gp(rsp);
raw_spin_unlock(&rnp_root->lock);
if (needwake)
@@ -3365,7 +3375,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp)
{
unsigned long s;
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
s = (READ_ONCE(*sp) + 3) & ~0x1;
smp_mb(); /* Above access must not bleed into critical section. */
return s;
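A worked example of the snapshot rule above, using the convention that the
counter's bottom bit is set while a grace period is in progress (the counter
increments once at GP start and once at GP end):

    /*
     *  *sp == 4 (even, idle):      s = (4 + 3) & ~0x1 = 6
     *      -> the very next GP (5 -> 6) must start after the snapshot,
     *         so completion at 6 suffices.
     *  *sp == 5 (odd, GP running): s = (5 + 3) & ~0x1 = 8
     *      -> the current GP (ending at 6) may have begun too early,
     *         so wait for the following one (7 -> 8).
     */

A later done-check then needs only ULONG_CMP_GE(*sp, s), as in the
rcu_exp_gp_seq_done() shown below.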
@@ -3392,6 +3401,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
}
static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
{
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
return rcu_seq_snap(&rsp->expedited_sequence);
}
static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
@@ -3426,8 +3436,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
* CPUs for the current rcu_node structure up the rcu_node tree.
*/
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmaskinit == rnp->expmaskinitnext) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
continue; /* No new CPUs, nothing to do. */
@@ -3447,8 +3456,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
rnp_up = rnp->parent;
done = false;
while (rnp_up) {
- raw_spin_lock_irqsave(&rnp_up->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
if (rnp_up->expmaskinit)
done = true;
rnp_up->expmaskinit |= mask;
@@ -3472,8 +3480,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
sync_exp_reset_tree_hotplug(rsp);
rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
rnp->expmask = rnp->expmaskinit;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -3524,15 +3531,14 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
- wake_up(&rsp->expedited_wq);
+ swake_up(&rsp->expedited_wq);
}
break;
}
mask = rnp->grpmask;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
rnp = rnp->parent;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
WARN_ON_ONCE(!(rnp->expmask & mask));
rnp->expmask &= ~mask;
}
@@ -3549,8 +3555,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
{
unsigned long flags;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
__rcu_report_exp_rnp(rsp, rnp, wake, flags);
}
@@ -3564,8 +3569,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
{
unsigned long flags;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!(rnp->expmask & mask)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
@@ -3609,7 +3613,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
*/
static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
{
- struct rcu_data *rdp;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
struct rcu_node *rnp0;
struct rcu_node *rnp1 = NULL;
@@ -3623,7 +3627,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
if (sync_exp_work_done(rsp, rnp0, NULL,
- &rsp->expedited_workdone0, s))
+ &rdp->expedited_workdone0, s))
return NULL;
return rnp0;
}
@@ -3637,14 +3641,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
* can be inexact, as it is just promoting locality and is not
* strictly needed for correctness.
*/
- rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
- if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
+ if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s))
return NULL;
mutex_lock(&rdp->exp_funnel_mutex);
rnp0 = rdp->mynode;
for (; rnp0 != NULL; rnp0 = rnp0->parent) {
if (sync_exp_work_done(rsp, rnp1, rdp,
- &rsp->expedited_workdone2, s))
+ &rdp->expedited_workdone2, s))
return NULL;
mutex_lock(&rnp0->exp_funnel_mutex);
if (rnp1)
@@ -3654,7 +3657,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
rnp1 = rnp0;
}
if (sync_exp_work_done(rsp, rnp1, rdp,
- &rsp->expedited_workdone3, s))
+ &rdp->expedited_workdone3, s))
return NULL;
return rnp1;
}
@@ -3708,8 +3711,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
sync_exp_reset_tree(rsp);
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
/* Each pass checks a CPU for identity, offline, and idle. */
mask_ofl_test = 0;
@@ -3741,24 +3743,22 @@ retry_ipi:
ret = smp_call_function_single(cpu, func, rsp, 0);
if (!ret) {
mask_ofl_ipi &= ~mask;
- } else {
- /* Failed, raced with offline. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask)) {
- raw_spin_unlock_irqrestore(&rnp->lock,
- flags);
- schedule_timeout_uninterruptible(1);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask))
- goto retry_ipi;
- raw_spin_lock_irqsave(&rnp->lock,
- flags);
- }
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
+ continue;
+ }
+ /* Failed, raced with offline. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (cpu_online(cpu) &&
+ (rnp->expmask & mask)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ schedule_timeout_uninterruptible(1);
+ if (cpu_online(cpu) &&
+ (rnp->expmask & mask))
+ goto retry_ipi;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
+ if (!(rnp->expmask & mask))
+ mask_ofl_ipi &= ~mask;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/* Report quiescent states for those that went offline. */
mask_ofl_test |= mask_ofl_ipi;
@@ -3773,6 +3773,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
unsigned long jiffies_stall;
unsigned long jiffies_start;
unsigned long mask;
+ int ndetected;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root(rsp);
int ret;
@@ -3781,28 +3782,30 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
jiffies_start = jiffies;
for (;;) {
- ret = wait_event_interruptible_timeout(
+ ret = swait_event_timeout(
rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root),
jiffies_stall);
- if (ret > 0)
+ if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
return;
if (ret < 0) {
/* Hit a signal, disable CPU stall warnings. */
- wait_event(rsp->expedited_wq,
+ swait_event(rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root));
return;
}
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rsp->name);
+ ndetected = 0;
rcu_for_each_leaf_node(rsp, rnp) {
- (void)rcu_print_task_exp_stall(rnp);
+ ndetected = rcu_print_task_exp_stall(rnp);
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
struct rcu_data *rdp;
if (!(rnp->expmask & mask))
continue;
+ ndetected++;
rdp = per_cpu_ptr(rsp->rda, cpu);
pr_cont(" %d-%c%c%c", cpu,
"O."[cpu_online(cpu)],
@@ -3811,8 +3814,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
}
mask <<= 1;
}
- pr_cont(" } %lu jiffies s: %lu\n",
- jiffies - jiffies_start, rsp->expedited_sequence);
+ pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+ jiffies - jiffies_start, rsp->expedited_sequence,
+ rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+ if (!ndetected) {
+ pr_err("blocking rcu_node structures:");
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (rnp == rnp_root)
+ continue; /* printed unconditionally */
+ if (sync_rcu_preempt_exp_done(rnp))
+ continue;
+ pr_cont(" l=%u:%d-%d:%#lx/%c",
+ rnp->level, rnp->grplo, rnp->grphi,
+ rnp->expmask,
+ ".T"[!!rnp->exp_tasks]);
+ }
+ pr_cont("\n");
+ }
rcu_for_each_leaf_node(rsp, rnp) {
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
@@ -3847,6 +3865,16 @@ void synchronize_sched_expedited(void)
struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
+ /* If only one CPU, this is automatically a grace period. */
+ if (rcu_blocking_is_gp())
+ return;
+
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu_sched);
+ return;
+ }
+
/* Take a snapshot of the sequence number. */
s = rcu_exp_gp_seq_snap(rsp);
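Together with the earlier srcu.c hunk, the fallback above gives
rcu_gp_is_normal() strict precedence over rcu_gp_is_expedited(). An
illustrative condensation of the resulting policy (not an in-tree function):

    /* Sketch: rcu_normal vetoes expediting; rcu_expedited merely requests it. */
    static bool use_expedited_gp(void)
    {
            if (rcu_gp_is_normal())
                    return false;           /* forced normal: never expedite */
            return rcu_gp_is_expedited();   /* expedite only when requested */
    }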
@@ -4135,7 +4163,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
rnp = rnp->parent;
if (rnp == NULL)
return;
- raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+ raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
rnp->qsmaskinit |= mask;
raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
}
@@ -4152,7 +4180,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
@@ -4179,7 +4207,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
@@ -4198,8 +4226,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
*/
rnp = rdp->mynode;
mask = rdp->grpmask;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
rnp->qsmaskinitnext |= mask;
rnp->expmaskinitnext |= mask;
if (!rdp->beenonline)
@@ -4327,14 +4354,14 @@ static int __init rcu_spawn_gp_kthread(void)
t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
BUG_ON(IS_ERR(t));
rnp = rcu_get_root(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rsp->gp_kthread = t;
if (kthread_prio) {
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
- wake_up_process(t);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ wake_up_process(t);
}
rcu_spawn_nocb_kthreads();
rcu_spawn_boost_kthreads();
@@ -4385,12 +4412,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
/*
* Helper function for rcu_init() that initializes one rcu_state structure.
*/
-static void __init rcu_init_one(struct rcu_state *rsp,
- struct rcu_data __percpu *rda)
+static void __init rcu_init_one(struct rcu_state *rsp)
{
static const char * const buf[] = RCU_NODE_NAME_INIT;
static const char * const fqs[] = RCU_FQS_NAME_INIT;
static const char * const exp[] = RCU_EXP_NAME_INIT;
+ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+ static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
static u8 fl_mask = 0x1;
int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@@ -4455,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
}
- init_waitqueue_head(&rsp->gp_wq);
- init_waitqueue_head(&rsp->expedited_wq);
+ init_swait_queue_head(&rsp->gp_wq);
+ init_swait_queue_head(&rsp->expedited_wq);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
@@ -4576,8 +4605,8 @@ void __init rcu_init(void)
rcu_bootup_announce();
rcu_init_geometry();
- rcu_init_one(&rcu_bh_state, &rcu_bh_data);
- rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ rcu_init_one(&rcu_bh_state);
+ rcu_init_one(&rcu_sched_state);
if (dump_tree)
rcu_dump_rcu_node_tree(&rcu_sched_state);
__rcu_init_preempt();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9fb4e238d..bbd235d0e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/swait.h>
#include <linux/stop_machine.h>
/*
@@ -178,6 +179,8 @@ struct rcu_node {
/* beginning of each expedited GP. */
unsigned long expmaskinitnext;
/* Online CPUs for next expedited GP. */
+ /* Any CPU that has ever been online will */
+ /* have its bit set. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
@@ -241,7 +244,7 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#ifdef CONFIG_RCU_NOCB_CPU
- wait_queue_head_t nocb_gp_wq[2];
+ struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
int need_future_gp[2];
@@ -384,6 +387,10 @@ struct rcu_data {
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
struct mutex exp_funnel_mutex;
+ atomic_long_t expedited_workdone0; /* # done by others #0. */
+ atomic_long_t expedited_workdone1; /* # done by others #1. */
+ atomic_long_t expedited_workdone2; /* # done by others #2. */
+ atomic_long_t expedited_workdone3; /* # done by others #3. */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -393,7 +400,7 @@ struct rcu_data {
atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
struct rcu_head **nocb_follower_tail;
- wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -472,7 +479,7 @@ struct rcu_state {
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */
- wait_queue_head_t gp_wq; /* Where GP task waits. */
+ struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */
@@ -498,13 +505,9 @@ struct rcu_state {
/* End of fields guarded by barrier_mutex. */
unsigned long expedited_sequence; /* Take a ticket. */
- atomic_long_t expedited_workdone0; /* # done by others #0. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_workdone3; /* # done by others #3. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
- wait_queue_head_t expedited_wq; /* Wait for check-ins. */
+ struct swait_queue_head expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -545,6 +548,18 @@ struct rcu_state {
#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
+#ifndef RCU_TREE_NONCORE
+static const char * const gp_state_names[] = {
+ "RCU_GP_IDLE",
+ "RCU_GP_WAIT_GPS",
+ "RCU_GP_DONE_GPS",
+ "RCU_GP_WAIT_FQS",
+ "RCU_GP_DOING_FQS",
+ "RCU_GP_CLEANUP",
+ "RCU_GP_CLEANED",
+};
+#endif /* #ifndef RCU_TREE_NONCORE */
+
extern struct list_head rcu_struct_flavors;
/* Sequence through rcu_state structures for each RCU flavor. */
@@ -607,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags);
@@ -664,3 +680,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
#else /* #ifdef CONFIG_PPC */
#define smp_mb__after_unlock_lock() do { } while (0)
#endif /* #else #ifdef CONFIG_PPC */
+
+/*
+ * Wrappers for the rcu_node::lock acquire.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values; this in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier,
+ * and, most importantly, transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ */
+static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
+{
+ raw_spin_lock(&rnp->lock);
+ smp_mb__after_unlock_lock();
+}
+
+static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
+{
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+}
+
+#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
+do { \
+ typecheck(unsigned long, flags); \
+ raw_spin_lock_irqsave(&(rnp)->lock, flags); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
+{
+ bool locked = raw_spin_trylock(&rnp->lock);
+
+ if (locked)
+ smp_mb__after_unlock_lock();
+ return locked;
+}
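Call sites converted throughout tree.c above use these wrappers wherever a
tree-level lock is taken with interrupts already under control; the recurring
upward walk becomes (a sketch of the pattern, not one specific function):

    /* Climb towards the root; ordering is re-established at each level. */
    for (; rnp != NULL; rnp = rnp->parent) {
            raw_spin_lock_rcu_node(rnp); /* lock + smp_mb__after_unlock_lock() */
            /* ... adjust this level's qsmask/expmask ... */
            raw_spin_unlock(&rnp->lock); /* irqs remain disabled throughout */
    }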
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 630c19772..080bd202d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
/*
* Check the RCU kernel configuration parameters and print informative
- * messages about anything out of the ordinary. If you like #ifdef, you
- * will love this function.
+ * messages about anything out of the ordinary.
*/
static void __init rcu_bootup_announce_oddness(void)
{
@@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void)
* the corresponding expedited grace period will also be the end of the
* normal grace period.
*/
-static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long flags) __releases(rnp->lock)
+static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
+ __releases(rnp->lock) /* But leaves rrupts disabled. */
{
int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
(rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
@@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
- raw_spin_unlock(&rnp->lock);
+ raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */
/*
* Report the quiescent state for the expedited GP. This expedited
@@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
} else {
WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
}
- local_irq_restore(flags);
}
/*
@@ -286,12 +284,11 @@ static void rcu_preempt_qs(void)
* predating the current grace period drain, in other words, until
* rnp->gp_tasks becomes NULL.
*
- * Caller must disable preemption.
+ * Caller must disable interrupts.
*/
static void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void)
/* Possibly blocking in an RCU read-side critical section. */
rdp = this_cpu_ptr(rcu_state_p->rda);
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp);
t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
@@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void)
(rnp->qsmask & rdp->grpmask)
? rnp->gpnum
: rnp->gpnum + 1);
- rcu_preempt_ctxt_queue(rnp, rdp, flags);
+ rcu_preempt_ctxt_queue(rnp, rdp);
} else if (t->rcu_read_lock_nesting < 0 &&
t->rcu_read_unlock_special.s) {
@@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t)
/*
* Remove this task from the list it blocked on. The task
- * now remains queued on the rcu_node corresponding to
- * the CPU it first blocked on, so the first attempt to
- * acquire the task's rcu_node's ->lock will succeed.
- * Keep the loop and add a WARN_ON() out of sheer paranoia.
+ * now remains queued on the rcu_node corresponding to the
+ * CPU it first blocked on, so there is no longer any need
+ * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia.
*/
- for (;;) {
- rnp = t->rcu_blocked_node;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- if (rnp == t->rcu_blocked_node)
- break;
- WARN_ON_ONCE(1);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
+ rnp = t->rcu_blocked_node;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ WARN_ON_ONCE(rnp != t->rcu_blocked_node);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -527,7 +516,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
unsigned long flags;
struct task_struct *t;
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
@@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void)
struct rcu_state *rsp = rcu_state_p;
unsigned long s;
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+
s = rcu_exp_gp_seq_snap(rsp);
rnp_unlock = exp_funnel_lock(rsp, s);
@@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
*/
static void __init __rcu_init_preempt(void)
{
- rcu_init_one(rcu_state_p, rcu_data_p);
+ rcu_init_one(rcu_state_p);
}
/*
@@ -989,8 +984,7 @@ static int rcu_boost(struct rcu_node *rnp)
READ_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
/*
* Recheck under the lock: all tasks in need of boosting
@@ -1176,8 +1170,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
"rcub/%d", rnp_index);
if (IS_ERR(t))
return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
sp.sched_priority = kthread_prio;
@@ -1524,7 +1517,8 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+ rcu_is_nocb_cpu(smp_processor_id()))
return;
/* Handle nohz enablement switches conservatively. */
@@ -1538,10 +1532,6 @@ static void rcu_prepare_for_idle(void)
if (!tne)
return;
- /* If this is a no-CBs CPU, no callbacks, just return. */
- if (rcu_is_nocb_cpu(smp_processor_id()))
- return;
-
/*
* If a non-lazy callback arrived at a CPU having only lazy
* callbacks, invoke RCU core for the side-effect of recalculating
@@ -1567,8 +1557,7 @@ static void rcu_prepare_for_idle(void)
if (!*rdp->nxttail[RCU_DONE_TAIL])
continue;
rnp = rdp->mynode;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
if (needwake)
@@ -1822,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
- wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+ swake_up_all(sq);
}
/*
@@ -1840,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
}
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rnp->completed & 0x1];
+}
+
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
- init_waitqueue_head(&rnp->nocb_gp_wq[0]);
- init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
#ifndef CONFIG_RCU_NOCB_CPU_ALL
@@ -1868,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
- wake_up(&rdp_leader->nocb_wq);
+ swake_up(&rdp_leader->nocb_wq);
}
}
@@ -2068,8 +2062,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
bool needwake;
struct rcu_node *rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
needwake = rcu_start_future_gp(rnp, rdp, &c);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (needwake)
@@ -2081,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
*/
trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
- wait_event_interruptible(
+ swait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
if (likely(d))
@@ -2109,7 +2102,7 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
- wait_event_interruptible(my_rdp->nocb_wq,
+ swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
@@ -2184,7 +2177,7 @@ wait_again:
* List was empty, wake up the follower.
* Memory barriers supplied by atomic_long_add().
*/
- wake_up(&rdp->nocb_wq);
+ swake_up(&rdp->nocb_wq);
}
}
@@ -2205,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"FollowerSleep");
- wait_event_interruptible(rdp->nocb_wq,
+ swait_event_interruptible(rdp->nocb_wq,
READ_ONCE(rdp->nocb_follower_head));
} else if (firsttime) {
/* Don't drown trace log with "Poll"! */
@@ -2364,7 +2357,7 @@ void __init rcu_init_nohz(void)
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
- init_waitqueue_head(&rdp->nocb_wq);
+ init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
@@ -2514,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
return false;
}
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
}
@@ -2522,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
{
}
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
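The tree_plugin.h hunks above convert the no-CBs wait queues from full wait queues to simple wait queues (swait), whose wake paths hold only a raw spinlock for a bounded amount of work and are therefore usable where full waitqueues are problematic (notably on RT). A minimal usage sketch of the swait API of this kernel generation; the demo_* names are illustrative, not part of the patch:

```c
#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);	/* illustrative */
static bool demo_ready;

static int demo_waiter(void *unused)
{
	/* Sleep until demo_ready is set; returns -ERESTARTSYS on signal. */
	return swait_event_interruptible(demo_wq, READ_ONCE(demo_ready));
}

static void demo_wake(void)
{
	WRITE_ONCE(demo_ready, true);
	swake_up(&demo_wq);		/* wake one waiter (4.5-era name) */
}
```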
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index ef7093cc9..1088e64f0 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -1,5 +1,5 @@
/*
- * Read-Copy Update tracing for classic implementation
+ * Read-Copy Update tracing for hierarchical implementation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -16,6 +16,7 @@
* http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
+ * Author: Paul E. McKenney
*
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
@@ -33,9 +34,7 @@
#include <linux/sched.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
-#include <linux/module.h>
#include <linux/completion.h>
-#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = {
static int show_rcuexp(struct seq_file *m, void *v)
{
+ int cpu;
struct rcu_state *rsp = (struct rcu_state *)m->private;
-
+ struct rcu_data *rdp;
+ unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ s0 += atomic_long_read(&rdp->expedited_workdone0);
+ s1 += atomic_long_read(&rdp->expedited_workdone1);
+ s2 += atomic_long_read(&rdp->expedited_workdone2);
+ s3 += atomic_long_read(&rdp->expedited_workdone3);
+ }
seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence,
- atomic_long_read(&rsp->expedited_workdone0),
- atomic_long_read(&rsp->expedited_workdone1),
- atomic_long_read(&rsp->expedited_workdone2),
- atomic_long_read(&rsp->expedited_workdone3),
+ rsp->expedited_sequence, s0, s1, s2, s3,
atomic_long_read(&rsp->expedited_normal),
atomic_read(&rsp->expedited_need_qs),
rsp->expedited_sequence / 2);
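The show_rcuexp() change reflects moving the expedited work-done counters from the global rcu_state structure into per-CPU rcu_data, so the tracefile must fold them back into one snapshot. The generic pattern, sketched with an illustrative counter:

```c
#include <linux/percpu.h>
#include <linux/atomic.h>

static DEFINE_PER_CPU(atomic_long_t, demo_count);	/* illustrative */

static unsigned long demo_sum(void)
{
	unsigned long sum = 0;
	int cpu;

	/* Fold every possible CPU's contribution into one snapshot. */
	for_each_possible_cpu(cpu)
		sum += atomic_long_read(per_cpu_ptr(&demo_count, cpu));
	return sum;
}
```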
@@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
unsigned long gpmax;
struct rcu_node *rnp = &rsp->node[0];
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
completed = READ_ONCE(rsp->completed);
gpnum = READ_ONCE(rsp->gpnum);
if (completed == gpnum)
@@ -487,16 +492,4 @@ free_out:
debugfs_remove_recursive(rcudir);
return 1;
}
-
-static void __exit rcutree_trace_cleanup(void)
-{
- debugfs_remove_recursive(rcudir);
-}
-
-
-module_init(rcutree_trace_init);
-module_exit(rcutree_trace_cleanup);
-
-MODULE_AUTHOR("Paul E. McKenney");
-MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
-MODULE_LICENSE("GPL");
+device_initcall(rcutree_trace_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5f748c5a4..76b94e194 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate");
#endif
#define MODULE_PARAM_PREFIX "rcupdate."
+#ifndef CONFIG_TINY_RCU
module_param(rcu_expedited, int, 0);
+module_param(rcu_normal, int, 0);
+static int rcu_normal_after_boot;
+module_param(rcu_normal_after_boot, int, 0);
+#endif /* #ifndef CONFIG_TINY_RCU */
#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
/**
@@ -113,6 +118,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
#ifndef CONFIG_TINY_RCU
+/*
+ * Should expedited grace-period primitives always fall back to their
+ * non-expedited counterparts? Intended for use within RCU. Note
+ * that if the user specifies both rcu_expedited and rcu_normal, then
+ * rcu_normal wins.
+ */
+bool rcu_gp_is_normal(void)
+{
+ return READ_ONCE(rcu_normal);
+}
+
static atomic_t rcu_expedited_nesting =
ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
@@ -157,8 +173,6 @@ void rcu_unexpedite_gp(void)
}
EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
-#endif /* #ifndef CONFIG_TINY_RCU */
-
/*
* Inform RCU of the end of the in-kernel boot sequence.
*/
@@ -166,8 +180,12 @@ void rcu_end_inkernel_boot(void)
{
if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
rcu_unexpedite_gp();
+ if (rcu_normal_after_boot)
+ WRITE_ONCE(rcu_normal, 1);
}
+#endif /* #ifndef CONFIG_TINY_RCU */
+
#ifdef CONFIG_PREEMPT_RCU
/*
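Because kernel/rcu/update.c defines MODULE_PARAM_PREFIX as "rcupdate.", the parameters added above are set from the kernel command line as rcupdate.rcu_normal=1 and rcupdate.rcu_normal_after_boot=1. A hedged sketch of the mechanism with illustrative names:

```c
#include <linux/moduleparam.h>

/* In built-in code, a prefixed module_param becomes a boot parameter. */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "demo."	/* illustrative prefix */

static int demo_flag;
module_param(demo_flag, int, 0444);	/* boot: demo.demo_flag=1 */
```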
diff --git a/kernel/relay.c b/kernel/relay.c
index 0b4570cfa..074994bcf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
if (!desc->count)
return 0;
- mutex_lock(&file_inode(filp)->i_mutex);
+ inode_lock(file_inode(filp));
do {
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
- mutex_unlock(&file_inode(filp)->i_mutex);
+ inode_unlock(file_inode(filp));
return desc->written;
}
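relay_file_read_subbufs() switches from open-coded i_mutex locking to the inode_lock()/inode_unlock() wrappers introduced in this kernel generation. At this point in history they are thin wrappers around i_mutex (sketch from memory of this era's <linux/fs.h>; later kernels switch the backing lock to i_rwsem):

```c
static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}
```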
diff --git a/kernel/resource.c b/kernel/resource.c
index 249b1eb1e..3669d1bfc 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1499,8 +1499,15 @@ int iomem_is_exclusive(u64 addr)
break;
if (p->end < addr)
continue;
- if (p->flags & IORESOURCE_BUSY &&
- p->flags & IORESOURCE_EXCLUSIVE) {
+	/*
+	 * A busy resource is exclusive if IORESOURCE_EXCLUSIVE is set
+	 * or if CONFIG_IO_STRICT_DEVMEM is enabled; a resource that is
+	 * not busy is never exclusive.
+	 */
+ if ((p->flags & IORESOURCE_BUSY) == 0)
+ continue;
+ if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
+ || p->flags & IORESOURCE_EXCLUSIVE) {
err = 1;
break;
}
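Restructured, the test reads: a resource can only be exclusive if it is busy, and then either the kernel-wide CONFIG_IO_STRICT_DEVMEM policy or the per-resource IORESOURCE_EXCLUSIVE flag makes it so. An equivalent predicate, written out for clarity (illustrative helper, not part of the patch):

```c
#include <linux/ioport.h>

static bool demo_resource_is_exclusive(const struct resource *p)
{
	if (!(p->flags & IORESOURCE_BUSY))
		return false;		/* never exclusive if not busy */
	return IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) ||
	       (p->flags & IORESOURCE_EXCLUSIVE);
}
```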
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973c..7d4cba227 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
+obj-y += wait.o swait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 750ed601d..a5d966cb8 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
ag = autogroup_task_get(p);
down_write(&ag->lock);
- err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
+ err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
if (!err)
ag->nice = nice;
up_write(&ag->lock);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index caf4041f5..bc54e8467 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
return;
sched_clock_tick();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb70592f0..05114b15b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
* Thomas Gleixner, Mike Kravetz
*/
+#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
@@ -66,12 +67,10 @@
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
-#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
-#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
@@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =
#undef SCHED_FEAT
-#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_FEAT(name, enabled) \
- #name ,
-
-static const char * const sched_feat_names[] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static int sched_feat_show(struct seq_file *m, void *v)
-{
- int i;
-
- for (i = 0; i < __SCHED_FEAT_NR; i++) {
- if (!(sysctl_sched_features & (1UL << i)))
- seq_puts(m, "NO_");
- seq_printf(m, "%s ", sched_feat_names[i]);
- }
- seq_puts(m, "\n");
-
- return 0;
-}
-
-#ifdef HAVE_JUMP_LABEL
-
-#define jump_label_key__true STATIC_KEY_INIT_TRUE
-#define jump_label_key__false STATIC_KEY_INIT_FALSE
-
-#define SCHED_FEAT(name, enabled) \
- jump_label_key__##enabled ,
-
-struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static void sched_feat_disable(int i)
-{
- static_key_disable(&sched_feat_keys[i]);
-}
-
-static void sched_feat_enable(int i)
-{
- static_key_enable(&sched_feat_keys[i]);
-}
-#else
-static void sched_feat_disable(int i) { };
-static void sched_feat_enable(int i) { };
-#endif /* HAVE_JUMP_LABEL */
-
-static int sched_feat_set(char *cmp)
-{
- int i;
- int neg = 0;
-
- if (strncmp(cmp, "NO_", 3) == 0) {
- neg = 1;
- cmp += 3;
- }
-
- for (i = 0; i < __SCHED_FEAT_NR; i++) {
- if (strcmp(cmp, sched_feat_names[i]) == 0) {
- if (neg) {
- sysctl_sched_features &= ~(1UL << i);
- sched_feat_disable(i);
- } else {
- sysctl_sched_features |= (1UL << i);
- sched_feat_enable(i);
- }
- break;
- }
- }
-
- return i;
-}
-
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[64];
- char *cmp;
- int i;
- struct inode *inode;
-
- if (cnt > 63)
- cnt = 63;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- cmp = strstrip(buf);
-
- /* Ensure the static_key remains in a consistent state */
- inode = file_inode(filp);
- mutex_lock(&inode->i_mutex);
- i = sched_feat_set(cmp);
- mutex_unlock(&inode->i_mutex);
- if (i == __SCHED_FEAT_NR)
- return -EINVAL;
-
- *ppos += cnt;
-
- return cnt;
-}
-
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, sched_feat_show, NULL);
-}
-
-static const struct file_operations sched_feat_fops = {
- .open = sched_feat_open,
- .write = sched_feat_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static __init int sched_init_debug(void)
-{
- debugfs_create_file("sched_features", 0644, NULL, NULL,
- &sched_feat_fops);
-
- return 0;
-}
-late_initcall(sched_init_debug);
-#endif /* CONFIG_SCHED_DEBUG */
-
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
@@ -731,7 +598,7 @@ bool sched_can_stop_tick(void)
if (current->policy == SCHED_RR) {
struct sched_rt_entity *rt_se = &current->rt;
- return rt_se->run_list.prev == rt_se->run_list.next;
+ return list_is_singular(&rt_se->run_list);
}
/*
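list_is_singular() makes the SCHED_RR intent explicit: the tick can be stopped only if the current task is the sole entry on its run list. The helper's definition, from <linux/list.h>:

```c
/* True iff the list contains exactly one entry. */
static inline int list_is_singular(const struct list_head *head)
{
	return !list_empty(head) && (head->next == head->prev);
}
```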
@@ -823,8 +690,8 @@ static void set_load_weight(struct task_struct *p)
return;
}
- load->weight = scale_load(prio_to_weight[prio]);
- load->inv_weight = prio_to_wmult[prio];
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1071,8 +938,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
{
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ dequeue_task(rq, p, 0);
set_task_cpu(p, new_cpu);
raw_spin_unlock(&rq->lock);
@@ -1080,8 +947,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
return rq;
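The two hunks above deliberately reorder the p->on_rq writes so a task is marked TASK_ON_RQ_MIGRATING before it is dequeued and only marked TASK_ON_RQ_QUEUED after it is enqueued; schedstats uses this window to rebase wait_start (see the warning added to set_task_cpu() below). Condensed into one sketch, with the lock juggling abbreviated:

```c
/* Condensed restatement of move_queued_task() after this patch. */
static struct rq *demo_move_queued_task(struct rq *rq,
					struct task_struct *p, int new_cpu)
{
	p->on_rq = TASK_ON_RQ_MIGRATING;	/* mark before dequeue */
	dequeue_task(rq, p, 0);
	set_task_cpu(p, new_cpu);
	raw_spin_unlock(&rq->lock);

	rq = cpu_rq(new_cpu);
	raw_spin_lock(&rq->lock);
	enqueue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_QUEUED;		/* mark after enqueue */
	return rq;
}
```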
@@ -1274,6 +1141,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);
+ /*
+	 * A migrating fair-class task must have p->on_rq set to
+	 * TASK_ON_RQ_MIGRATING, because schedstat_wait_{start,end} rebase a
+	 * migrating task's wait_start time based on p->on_rq.
+ */
+ WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1310,9 +1186,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
} else {
/*
@@ -1905,6 +1783,97 @@ static void ttwu_queue(struct task_struct *p, int cpu)
raw_spin_unlock(&rq->lock);
}
+/*
+ * Notes on Program-Order guarantees on SMP systems.
+ *
+ * MIGRATION
+ *
+ * The basic program-order guarantee on SMP systems is that when a task [t]
+ * migrates, all its activity on its old cpu [c0] happens-before any subsequent
+ * execution on its new cpu [c1].
+ *
+ * For migration (of runnable tasks) this is provided by the following means:
+ *
+ * A) UNLOCK of the rq(c0)->lock scheduling out task t
+ * B) migration for t is required to synchronize *both* rq(c0)->lock and
+ * rq(c1)->lock (if not at the same time, then in that order).
+ * C) LOCK of the rq(c1)->lock scheduling in task
+ *
+ * Transitivity guarantees that B happens after A and C after B.
+ * Note: we only require RCpc transitivity.
+ * Note: the cpu doing B need not be c0 or c1
+ *
+ * Example:
+ *
+ * CPU0 CPU1 CPU2
+ *
+ * LOCK rq(0)->lock
+ * sched-out X
+ * sched-in Y
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(0)->lock // orders against CPU0
+ * dequeue X
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(1)->lock
+ * enqueue X
+ * UNLOCK rq(1)->lock
+ *
+ * LOCK rq(1)->lock // orders against CPU2
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(1)->lock
+ *
+ *
+ * BLOCKING -- aka. SLEEP + WAKEUP
+ *
+ * For blocking we (obviously) need to provide the same guarantee as for
+ * migration. However the means are completely different as there is no lock
+ * chain to provide order. Instead we do:
+ *
+ * 1) smp_store_release(X->on_cpu, 0)
+ * 2) smp_cond_acquire(!X->on_cpu)
+ *
+ * Example:
+ *
+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
+ *
+ * LOCK rq(0)->lock LOCK X->pi_lock
+ * dequeue X
+ * sched-out X
+ * smp_store_release(X->on_cpu, 0);
+ *
+ * smp_cond_acquire(!X->on_cpu);
+ * X->state = WAKING
+ * set_task_cpu(X,2)
+ *
+ * LOCK rq(2)->lock
+ * enqueue X
+ * X->state = RUNNING
+ * UNLOCK rq(2)->lock
+ *
+ * LOCK rq(2)->lock // orders against CPU1
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(2)->lock
+ *
+ * UNLOCK X->pi_lock
+ * UNLOCK rq(0)->lock
+ *
+ *
+ * However, for wakeups there is a second guarantee we must provide, namely we
+ * must observe the state that led to our wakeup. That is, not only must our
+ * task observe its own prior state, it must also observe the stores prior to
+ * its wakeup.
+ *
+ * This means that any means of doing remote wakeups must order the CPU doing
+ * the wakeup against the CPU the task is going to end up running on. This,
+ * however, is already required for the regular Program-Order guarantee above,
+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire).
+ *
+ */
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
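smp_cond_acquire() packages the former open-coded spin loop plus smp_rmb() into one primitive: spin until the condition holds, then upgrade the resulting control dependency to ACQUIRE ordering. Its definition in this era's <linux/compiler.h> is essentially (quoted from memory):

```c
#define smp_cond_acquire(cond)	do {		\
	while (!(cond))				\
		cpu_relax();			\
	smp_rmb(); /* ctrl + rmb := acquire */	\
} while (0)
```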
@@ -1968,19 +1937,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
- */
- while (p->on_cpu)
- cpu_relax();
- /*
- * Combined with the control dependency above, we have an effective
- * smp_load_acquire() without the need for full barriers.
*
* Pairs with the smp_store_release() in finish_lock_switch().
*
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_rmb();
+ smp_cond_acquire(!p->on_cpu);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -1997,7 +1960,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
ttwu_queue(p, cpu);
stat:
- ttwu_stat(p, cpu, wake_flags);
+ if (schedstat_enabled())
+ ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2045,7 +2009,8 @@ static void try_to_wake_up_local(struct task_struct *p)
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0);
- ttwu_stat(p, smp_processor_id(), 0);
+ if (schedstat_enabled())
+ ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
}
@@ -2087,7 +2052,6 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
dl_se->dl_yielded = 0;
}
@@ -2109,7 +2073,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
+ /* Even if schedstat is disabled, there should not be garbage */
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -2118,6 +2087,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
+ p->rt.timeout = 0;
+ p->rt.time_slice = sched_rr_timeslice;
+ p->rt.on_rq = 0;
+ p->rt.on_list = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2181,6 +2154,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#endif
#endif
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+#ifdef CONFIG_SCHEDSTATS
+static void set_schedstats(bool enabled)
+{
+ if (enabled)
+ static_branch_enable(&sched_schedstats);
+ else
+ static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+ if (!schedstat_enabled()) {
+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+ static_branch_enable(&sched_schedstats);
+ }
+}
+
+static int __init setup_schedstats(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ if (!strcmp(str, "enable")) {
+ set_schedstats(true);
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ set_schedstats(false);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse schedstats=\n");
+
+ return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_schedstats);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_schedstats(state);
+ return err;
+}
+#endif
+#endif
+
/*
* fork()/clone()-time setup:
*/
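sched_schedstats is a static key, so the schedstat_enabled() checks sprinkled through this patch compile down to a runtime-patched branch that is a near no-op while stats are off. The general pattern, with illustrative names:

```c
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(demo_key);		/* default: disabled */

static inline bool demo_enabled(void)
{
	return static_branch_unlikely(&demo_key);	/* patched branch */
}

static void demo_toggle(bool on)
{
	if (on)
		static_branch_enable(&demo_key);
	else
		static_branch_disable(&demo_key);
}
```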
@@ -2910,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void)
}
#endif
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
- if (in_lock_functions(addr)) {
- addr = CALLER_ADDR2;
- if (in_lock_functions(addr))
- addr = CALLER_ADDR3;
- }
- return addr;
-}
-
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
@@ -2941,7 +2967,7 @@ void preempt_count_add(int val)
PREEMPT_MASK - 10);
#endif
if (preempt_count() == val) {
- unsigned long ip = get_parent_ip(CALLER_ADDR1);
+ unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = ip;
#endif
@@ -2968,7 +2994,7 @@ void preempt_count_sub(int val)
#endif
if (preempt_count() == val)
- trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
@@ -3109,7 +3135,6 @@ static void __sched notrace __schedule(bool preempt)
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch();
prev = rq->curr;
/*
@@ -3128,13 +3153,16 @@ static void __sched notrace __schedule(bool preempt)
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ local_irq_disable();
+ rcu_note_context_switch();
+
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock(&rq->lock);
lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -3178,7 +3206,6 @@ static void __sched notrace __schedule(bool preempt)
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
- cpu = cpu_of(rq);
} else {
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
@@ -3364,7 +3391,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+ int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq;
const struct sched_class *prev_class;
@@ -3392,11 +3419,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
+
+ if (oldprio == prio)
+ queue_flag &= ~DEQUEUE_MOVE;
+
prev_class = p->sched_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, queue_flag);
if (running)
put_prev_task(rq, p);
@@ -3414,7 +3445,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- enqueue_flag |= ENQUEUE_REPLENISH;
+ queue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
@@ -3422,7 +3453,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
- enqueue_flag |= ENQUEUE_HEAD;
+ queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
@@ -3437,7 +3468,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, enqueue_flag);
+ enqueue_task(rq, p, queue_flag);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -3793,6 +3824,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class;
struct rq *rq;
int reset_on_fork;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
@@ -3975,17 +4007,14 @@ change:
* itself.
*/
new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- if (new_effective_prio == oldprio) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
- }
+ if (new_effective_prio == oldprio)
+ queue_flags &= ~DEQUEUE_MOVE;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
@@ -3995,15 +4024,14 @@ change:
if (running)
p->sched_class->set_curr_task(rq);
if (queued) {
- int enqueue_flags = ENQUEUE_RESTORE;
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
- if (oldprio <= p->prio)
- enqueue_flags |= ENQUEUE_HEAD;
+ if (oldprio < p->prio)
+ queue_flags |= ENQUEUE_HEAD;
- enqueue_task(rq, p, enqueue_flags);
+ enqueue_task(rq, p, queue_flags);
}
check_class_changed(rq, p, prev_class, oldprio);
@@ -4994,6 +5022,8 @@ void init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+ kasan_unpoison_task_stack(idle);
+
#ifdef CONFIG_SMP
/*
* Its possible that init_idle() gets called multiple times on a task,
@@ -5303,183 +5333,6 @@ static void migrate_tasks(struct rq *dead_rq)
}
#endif /* CONFIG_HOTPLUG_CPU */
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
- {
- .procname = "sched_domain",
- .mode = 0555,
- },
- {}
-};
-
-static struct ctl_table sd_ctl_root[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = sd_ctl_dir,
- },
- {}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
- struct ctl_table *entry =
- kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
- return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
- struct ctl_table *entry;
-
- /*
- * In the intermediate directories, both the child directory and
- * procname are dynamically allocated and could fail but the mode
- * will always be set. In the lowest directory the names are
- * static strings and all have proc handlers.
- */
- for (entry = *tablep; entry->mode; entry++) {
- if (entry->child)
- sd_free_ctl_entry(&entry->child);
- if (entry->proc_handler == NULL)
- kfree(entry->procname);
- }
-
- kfree(*tablep);
- *tablep = NULL;
-}
-
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
-static void
-set_table_entry(struct ctl_table *entry,
- const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler,
- bool load_idx)
-{
- entry->procname = procname;
- entry->data = data;
- entry->maxlen = maxlen;
- entry->mode = mode;
- entry->proc_handler = proc_handler;
-
- if (load_idx) {
- entry->extra1 = &min_load_idx;
- entry->extra2 = &max_load_idx;
- }
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
-
- if (table == NULL)
- return NULL;
-
- set_table_entry(&table[0], "min_interval", &sd->min_interval,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[1], "max_interval", &sd->max_interval,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[9], "cache_nice_tries",
- &sd->cache_nice_tries,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[10], "flags", &sd->flags,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[11], "max_newidle_lb_cost",
- &sd->max_newidle_lb_cost,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[12], "name", sd->name,
- CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
-
- return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
- struct ctl_table *entry, *table;
- struct sched_domain *sd;
- int domain_num = 0, i;
- char buf[32];
-
- for_each_domain(cpu, sd)
- domain_num++;
- entry = table = sd_alloc_ctl_entry(domain_num + 1);
- if (table == NULL)
- return NULL;
-
- i = 0;
- for_each_domain(cpu, sd) {
- snprintf(buf, 32, "domain%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_domain_table(sd);
- entry++;
- i++;
- }
- return table;
-}
-
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
- int i, cpu_num = num_possible_cpus();
- struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
- char buf[32];
-
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = entry;
-
- if (entry == NULL)
- return;
-
- for_each_possible_cpu(i) {
- snprintf(buf, 32, "cpu%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_cpu_table(i);
- entry++;
- }
-
- WARN_ON(sd_sysctl_header);
- sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
-{
- unregister_sysctl_table(sd_sysctl_header);
- sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
-}
-#else
-static void register_sched_domain_sysctl(void)
-{
-}
-static void unregister_sched_domain_sysctl(void)
-{
-}
-#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
-
static void set_rq_online(struct rq *rq)
{
if (!rq->online) {
@@ -6071,11 +5924,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
+ int ret;
+
alloc_bootmem_cpumask_var(&cpu_isolated_map);
- cpulist_parse(str, cpu_isolated_map);
+ ret = cpulist_parse(str, cpu_isolated_map);
+ if (ret) {
+ pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ return 0;
+ }
return 1;
}
-
__setup("isolcpus=", isolated_cpu_setup);
struct s_data {
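isolated_cpu_setup() now checks cpulist_parse()'s return value instead of silently accepting garbage; returning 0 from a __setup handler tells the boot code the argument was not consumed. An illustrative handler following the same shape (demo_* names are assumptions):

```c
static cpumask_var_t demo_mask;

static int __init demo_cpus_setup(char *str)
{
	alloc_bootmem_cpumask_var(&demo_mask);
	if (cpulist_parse(str, demo_mask)) {	/* non-zero on bad input */
		pr_err("demo: invalid CPU list '%s'\n", str);
		return 0;			/* reject the argument */
	}
	return 1;				/* consumed */
}
__setup("demo_cpus=", demo_cpus_setup);
```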
@@ -7355,6 +7213,9 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -7412,11 +7273,12 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
-
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
@@ -7697,7 +7559,7 @@ static void free_sched_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
- kfree(tg);
+ kmem_cache_free(task_group_cache, tg);
}
/* allocate runqueue etc for a new task group */
@@ -7705,7 +7567,7 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
if (!tg)
return ERR_PTR(-ENOMEM);
@@ -7754,11 +7616,9 @@ void sched_destroy_group(struct task_group *tg)
void sched_offline_group(struct task_group *tg)
{
unsigned long flags;
- int i;
/* end participation in shares distribution */
- for_each_possible_cpu(i)
- unregister_fair_sched_group(tg, i);
+ unregister_fair_sched_group(tg);
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
@@ -7784,7 +7644,7 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE);
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
@@ -7808,7 +7668,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+ enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
task_rq_unlock(rq, tsk, &flags);
}
@@ -8236,7 +8096,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task, void *private)
+static void cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
}
@@ -8610,3 +8470,44 @@ void dump_cpu_task(int cpu)
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+const int sched_prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
+
+/*
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetics by turning divisions
+ * into multiplications:
+ */
+const u32 sched_prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
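The two tables are linked by wmult ≈ 2^32 / weight, which lets the scheduler replace a division by weight with a multiply-and-shift; adjacent nice levels differ by the ~1.25 multiplier mentioned in the comment (e.g. 1024/820 ≈ 1.249). A quick userspace check of a few entries around nice 0; the results suggest the table uses rounded rather than truncated division:

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* nice -1, 0, +1 entries copied from sched_prio_to_weight[] */
	const uint64_t w[] = { 1277, 1024, 820 };

	for (int i = 0; i < 3; i++)
		printf("weight=%4llu  2^32/weight=%llu\n",
		       (unsigned long long)w[i],
		       (unsigned long long)(((1ULL << 32) + w[i] / 2) / w[i]));
	/* prints 3363326, 4194304, 5237765 -- matching sched_prio_to_wmult[] */
	return 0;
}
```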
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 05de80b48..75f98c549 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -5,6 +5,9 @@
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include "sched.h"
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -259,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void)
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
u64 steal;
- cputime_t steal_ct;
+ unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
/*
- * cputime_t may be less precise than nsecs (eg: if it's
- * based on jiffies). Lets cast the result to cputime
+	 * steal is in nsecs but our caller expects steal
+	 * time in jiffies. Let's convert the result to jiffies
+	 * granularity and account the rest on the next rounds.
*/
- steal_ct = nsecs_to_cputime(steal);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+ steal_jiffies = nsecs_to_jiffies(steal);
+ this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
- account_steal_time(steal_ct);
- return steal_ct;
+ account_steal_time(jiffies_to_cputime(steal_jiffies));
+ return steal_jiffies;
}
#endif
return false;
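nsecs_to_jiffies() truncates, so the hunk above only charges whole jiffies of steal time and leaves the sub-jiffy remainder in prev_steal_time to be picked up on a later tick. The carrying logic, condensed into an illustrative helper:

```c
/* Condensed from the hunk above; *prev_ns accumulates consumed time. */
static unsigned long demo_consume_steal(u64 *prev_ns, u64 now_ns)
{
	u64 delta = now_ns - *prev_ns;
	unsigned long whole = nsecs_to_jiffies(delta);

	/* Advance only by the whole jiffies we charge; rest carries over. */
	*prev_ns += jiffies_to_nsecs(whole);
	return whole;
}
```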
@@ -466,7 +469,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
- if (vtime_accounting_enabled())
+ if (vtime_accounting_cpu_enabled())
return;
if (sched_clock_irqtime) {
@@ -665,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static unsigned long long vtime_delta(struct task_struct *tsk)
+static cputime_t vtime_delta(struct task_struct *tsk)
{
- unsigned long long clock;
+ unsigned long now = READ_ONCE(jiffies);
- clock = local_clock();
- if (clock < tsk->vtime_snap)
+ if (time_before(now, (unsigned long)tsk->vtime_snap))
return 0;
- return clock - tsk->vtime_snap;
+ return jiffies_to_cputime(now - tsk->vtime_snap);
}
static cputime_t get_vtime_delta(struct task_struct *tsk)
{
- unsigned long long delta = vtime_delta(tsk);
+ unsigned long now = READ_ONCE(jiffies);
+ unsigned long delta = now - tsk->vtime_snap;
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
- tsk->vtime_snap += delta;
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
+ tsk->vtime_snap = now;
- /* CHECKME: always safe to convert nsecs to cputime? */
- return nsecs_to_cputime(delta);
+ return jiffies_to_cputime(delta);
}
static void __vtime_account_system(struct task_struct *tsk)
@@ -696,37 +698,44 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ if (!vtime_delta(tsk))
+ return;
+
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
+ write_seqcount_begin(&tsk->vtime_seqcount);
+ if (vtime_delta(tsk))
+ __vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
- write_seqlock(&tsk->vtime_seqlock);
- delta_cpu = get_vtime_delta(tsk);
+ write_seqcount_begin(&tsk->vtime_seqcount);
tsk->vtime_snap_whence = VTIME_SYS;
- account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
- write_sequnlock(&tsk->vtime_seqlock);
+ if (vtime_delta(tsk)) {
+ delta_cpu = get_vtime_delta(tsk);
+ account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+ }
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_user_enter(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
+ write_seqcount_begin(&tsk->vtime_seqcount);
+ if (vtime_delta(tsk))
+ __vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_guest_enter(struct task_struct *tsk)
@@ -738,19 +747,20 @@ void vtime_guest_enter(struct task_struct *tsk)
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
+ write_seqcount_begin(&tsk->vtime_seqcount);
+ if (vtime_delta(tsk))
+ __vtime_account_system(tsk);
current->flags |= PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
current->flags &= ~PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -763,24 +773,26 @@ void vtime_account_idle(struct task_struct *tsk)
void arch_vtime_task_switch(struct task_struct *prev)
{
- write_seqlock(&prev->vtime_seqlock);
- prev->vtime_snap_whence = VTIME_SLEEPING;
- write_sequnlock(&prev->vtime_seqlock);
+ write_seqcount_begin(&prev->vtime_seqcount);
+ prev->vtime_snap_whence = VTIME_INACTIVE;
+ write_seqcount_end(&prev->vtime_seqcount);
- write_seqlock(&current->vtime_seqlock);
+ write_seqcount_begin(&current->vtime_seqcount);
current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = sched_clock_cpu(smp_processor_id());
- write_sequnlock(&current->vtime_seqlock);
+ current->vtime_snap = jiffies;
+ write_seqcount_end(&current->vtime_seqcount);
}
void vtime_init_idle(struct task_struct *t, int cpu)
{
unsigned long flags;
- write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ local_irq_save(flags);
+ write_seqcount_begin(&t->vtime_seqcount);
t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = sched_clock_cpu(cpu);
- write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+ t->vtime_snap = jiffies;
+ write_seqcount_end(&t->vtime_seqcount);
+ local_irq_restore(flags);
}
cputime_t task_gtime(struct task_struct *t)
@@ -788,17 +800,17 @@ cputime_t task_gtime(struct task_struct *t)
unsigned int seq;
cputime_t gtime;
- if (!context_tracking_is_enabled())
+ if (!vtime_accounting_enabled())
return t->gtime;
do {
- seq = read_seqbegin(&t->vtime_seqlock);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
gtime = t->gtime;
- if (t->flags & PF_VCPU)
+ if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
gtime += vtime_delta(t);
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
return gtime;
}
@@ -821,7 +833,7 @@ fetch_task_cputime(struct task_struct *t,
*udelta = 0;
*sdelta = 0;
- seq = read_seqbegin(&t->vtime_seqlock);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
if (u_dst)
*u_dst = *u_src;
@@ -829,7 +841,7 @@ fetch_task_cputime(struct task_struct *t,
*s_dst = *s_src;
/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ if (t->vtime_snap_whence == VTIME_INACTIVE ||
is_idle_task(t))
continue;
@@ -845,7 +857,7 @@ fetch_task_cputime(struct task_struct *t,
if (t->vtime_snap_whence == VTIME_SYS)
*sdelta = delta;
}
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
@@ -853,6 +865,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
cputime_t udelta, sdelta;
+ if (!vtime_accounting_enabled()) {
+ if (utime)
+ *utime = t->utime;
+ if (stime)
+ *stime = t->stime;
+ return;
+ }
+
fetch_task_cputime(t, utime, stime, &t->utime,
&t->stime, &udelta, &sdelta);
if (utime)
@@ -866,6 +886,14 @@ void task_cputime_scaled(struct task_struct *t,
{
cputime_t udelta, sdelta;
+ if (!vtime_accounting_enabled()) {
+ if (utimescaled)
+ *utimescaled = t->utimescaled;
+ if (stimescaled)
+ *stimescaled = t->stimescaled;
+ return;
+ }
+
fetch_task_cputime(t, utimescaled, stimescaled,
&t->utimescaled, &t->stimescaled, &udelta, &sdelta);
if (utimescaled)
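The vtime switch from a seqlock to a bare seqcount relies on writers already being serialized (the accounted task updates its own state), while readers stay lockless and simply retry if they race with an update. The canonical reader loop, with illustrative names:

```c
#include <linux/seqlock.h>

static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);	/* illustrative */
static u64 demo_value;

static u64 demo_read(void)
{
	unsigned int seq;
	u64 val;

	do {
		seq = read_seqcount_begin(&demo_seq);	/* snapshot */
		val = demo_value;
	} while (read_seqcount_retry(&demo_seq, seq));	/* retry on race */
	return val;
}
```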
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8b0a15e28..c7a036fac 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
}
}
- if (leftmost)
+ if (leftmost) {
dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+ dl_rq->earliest_dl.next = p->dl.deadline;
+ }
rb_link_node(&p->pushable_dl_tasks, parent, link);
rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
@@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
next_node = rb_next(&p->pushable_dl_tasks);
dl_rq->pushable_dl_tasks_leftmost = next_node;
+ if (next_node) {
+ dl_rq->earliest_dl.next = rb_entry(next_node,
+ struct task_struct, pushable_dl_tasks)->dl.deadline;
+ }
}
rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
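With these two hunks, earliest_dl.next is maintained incrementally from the pushable-tasks rbtree, whose leftmost node is by construction the next-earliest pushable deadline; that is what lets the patch delete the O(n) next_deadline()/pick_next_earliest_dl_task() rescan further down. The equivalent read, sketched:

```c
/* Sketch: next-earliest pushable deadline is the rbtree's leftmost node. */
static u64 demo_next_pushable_deadline(struct dl_rq *dl_rq)
{
	struct rb_node *leftmost = dl_rq->pushable_dl_tasks_leftmost;

	if (!leftmost)
		return 0;
	return rb_entry(leftmost, struct task_struct,
			pushable_dl_tasks)->dl.deadline;
}
```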
@@ -346,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+ WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
+
+ /*
+ * We are racing with the deadline timer. So, do nothing because
+ * the deadline timer handler will take care of properly recharging
+ * the runtime and postponing the deadline
+ */
+ if (dl_se->dl_throttled)
+ return;
/*
* We use the regular wall clock time to set deadlines in the
@@ -355,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
*/
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
- dl_se->dl_new = 0;
}
/*
@@ -393,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
dl_se->runtime = pi_se->dl_runtime;
}
+ if (dl_se->dl_yielded && dl_se->runtime > 0)
+ dl_se->runtime = 0;
+
/*
* We keep moving the deadline away until we get some
* available runtime for the entity. This ensures correct
@@ -414,7 +430,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* entity.
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
- printk_deferred_once("sched: DL replenish lagged to much\n");
+ printk_deferred_once("sched: DL replenish lagged too much\n");
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
@@ -494,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- /*
- * The arrival of a new instance needs special treatment, i.e.,
- * the actual scheduling parameters have to be "renewed".
- */
- if (dl_se->dl_new) {
- setup_new_dl_entity(dl_se, pi_se);
- return;
- }
-
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
@@ -599,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
}
/*
- * This is possible if switched_from_dl() raced against a running
- * callback that took the above !dl_task() path and we've since then
- * switched back into SCHED_DEADLINE.
- *
- * There's nothing to do except drop our task reference.
- */
- if (dl_se->dl_new)
- goto unlock;
-
- /*
* The task might have been boosted by someone else and might be in the
* boosting/deboosting path, its not throttled.
*/
@@ -729,8 +726,11 @@ static void update_curr_dl(struct rq *rq)
* approach need further study.
*/
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ if (unlikely((s64)delta_exec <= 0)) {
+ if (unlikely(dl_se->dl_yielded))
+ goto throttle;
return;
+ }
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -743,8 +743,10 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
- dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
- if (dl_runtime_exceeded(dl_se)) {
+ dl_se->runtime -= delta_exec;
+
+throttle:
+ if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
@@ -782,42 +784,14 @@ static void update_curr_dl(struct rq *rq)
#ifdef CONFIG_SMP
-static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
-
-static inline u64 next_deadline(struct rq *rq)
-{
- struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
-
- if (next && dl_prio(next->prio))
- return next->dl.deadline;
- else
- return 0;
-}
-
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
struct rq *rq = rq_of_dl_rq(dl_rq);
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
- /*
- * If the dl_rq had no -deadline tasks, or if the new task
- * has shorter deadline than the current one on dl_rq, we
- * know that the previous earliest becomes our next earliest,
- * as the new task becomes the earliest itself.
- */
- dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
- } else if (dl_rq->earliest_dl.next == 0 ||
- dl_time_before(deadline, dl_rq->earliest_dl.next)) {
- /*
- * On the other hand, if the new -deadline task has a
- * a later deadline than the earliest one on dl_rq, but
- * it is earlier than the next (if any), we must
- * recompute the next-earliest.
- */
- dl_rq->earliest_dl.next = next_deadline(rq);
}
}
@@ -839,7 +813,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
- dl_rq->earliest_dl.next = next_deadline(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
}
}
@@ -940,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
+ if (flags & ENQUEUE_WAKEUP)
update_dl_entity(dl_se, pi_se);
else if (flags & ENQUEUE_REPLENISH)
replenish_dl_entity(dl_se, pi_se);
@@ -1017,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
*/
static void yield_task_dl(struct rq *rq)
{
- struct task_struct *p = rq->curr;
-
/*
* We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it
* new scheduling parameters (thanks to dl_yielded=1).
*/
- if (p->dl.runtime > 0) {
- rq->curr->dl.dl_yielded = 1;
- p->dl.runtime = 0;
- }
+ rq->curr->dl.dl_yielded = 1;
+
update_rq_clock(rq);
update_curr_dl(rq);
/*
@@ -1274,28 +1243,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
return 0;
}
-/* Returns the second earliest -deadline task, NULL otherwise */
-static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
-{
- struct rb_node *next_node = rq->dl.rb_leftmost;
- struct sched_dl_entity *dl_se;
- struct task_struct *p = NULL;
-
-next_node:
- next_node = rb_next(next_node);
- if (next_node) {
- dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
- p = dl_task_of(dl_se);
-
- if (pick_dl_task(rq, p, cpu))
- return p;
-
- goto next_node;
- }
-
- return NULL;
-}
-
/*
* Return the earliest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise:
@@ -1767,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
+ if (dl_time_before(p->dl.deadline, rq_clock(rq)))
+ setup_new_dl_entity(&p->dl, &p->dl);
+
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
@@ -1813,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
resched_curr(rq);
#endif /* CONFIG_SMP */
- } else
- switched_to_dl(rq, p);
+ }
}
const struct sched_class dl_sched_class = {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 641511771..4fbc3bd5f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -16,6 +16,7 @@
#include <linux/kallsyms.h>
#include <linux/utsname.h>
#include <linux/mempolicy.h>
+#include <linux/debugfs.h>
#include "sched.h"
@@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
+#define SCHED_FEAT(name, enabled) \
+ #name ,
+
+static const char * const sched_feat_names[] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
+ if (!(sysctl_sched_features & (1UL << i)))
+ seq_puts(m, "NO_");
+ seq_printf(m, "%s ", sched_feat_names[i]);
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled) \
+ jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+ static_key_disable(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+ static_key_enable(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
+static int sched_feat_set(char *cmp)
+{
+ int i;
+ int neg = 0;
+
+ if (strncmp(cmp, "NO_", 3) == 0) {
+ neg = 1;
+ cmp += 3;
+ }
+
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
+ if (strcmp(cmp, sched_feat_names[i]) == 0) {
+ if (neg) {
+ sysctl_sched_features &= ~(1UL << i);
+ sched_feat_disable(i);
+ } else {
+ sysctl_sched_features |= (1UL << i);
+ sched_feat_enable(i);
+ }
+ break;
+ }
+ }
+
+ return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int i;
+ struct inode *inode;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ /* Ensure the static_key remains in a consistent state */
+ inode = file_inode(filp);
+ inode_lock(inode);
+ i = sched_feat_set(cmp);
+ inode_unlock(inode);
+ if (i == __SCHED_FEAT_NR)
+ return -EINVAL;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+ .open = sched_feat_open,
+ .write = sched_feat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int sched_init_debug(void)
+{
+ debugfs_create_file("sched_features", 0644, NULL, NULL,
+ &sched_feat_fops);
+
+ return 0;
+}
+late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+ return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+ struct ctl_table *entry;
+
+ /*
+ * In the intermediate directories, both the child directory and
+ * procname are dynamically allocated and could fail but the mode
+ * will always be set. In the lowest directory the names are
+ * static strings and all have proc handlers.
+ */
+ for (entry = *tablep; entry->mode; entry++) {
+ if (entry->child)
+ sd_free_ctl_entry(&entry->child);
+ if (entry->proc_handler == NULL)
+ kfree(entry->procname);
+ }
+
+ kfree(*tablep);
+ *tablep = NULL;
+}
+
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ umode_t mode, proc_handler *proc_handler,
+ bool load_idx)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+
+ if (load_idx) {
+ entry->extra1 = &min_load_idx;
+ entry->extra2 = &max_load_idx;
+ }
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "min_interval", &sd->min_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[9], "cache_nice_tries",
+ &sd->cache_nice_tries,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[10], "flags", &sd->flags,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[11], "max_newidle_lb_cost",
+ &sd->max_newidle_lb_cost,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+ /* &table[13] is terminator */
+
+ return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+ if (table == NULL)
+ return NULL;
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+void register_sched_domain_sysctl(void)
+{
+ int i, cpu_num = num_possible_cpus();
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ char buf[32];
+
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = entry;
+
+ if (entry == NULL)
+ return;
+
+ for_each_possible_cpu(i) {
+ snprintf(buf, 32, "cpu%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_cpu_table(i);
+ entry++;
+ }
+
+ WARN_ON(sd_sysctl_header);
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+ unregister_sysctl_table(sd_sysctl_header);
+ sd_sysctl_header = NULL;
+ if (sd_ctl_dir[0].child)
+ sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SMP */
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
@@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->vruntime);
PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
- PN(se->statistics.wait_start);
- PN(se->statistics.sleep_start);
- PN(se->statistics.block_start);
- PN(se->statistics.sleep_max);
- PN(se->statistics.block_max);
- PN(se->statistics.exec_max);
- PN(se->statistics.slice_max);
- PN(se->statistics.wait_max);
- PN(se->statistics.wait_sum);
- P(se->statistics.wait_count);
+ if (schedstat_enabled()) {
+ PN(se->statistics.wait_start);
+ PN(se->statistics.sleep_start);
+ PN(se->statistics.block_start);
+ PN(se->statistics.sleep_max);
+ PN(se->statistics.block_max);
+ PN(se->statistics.exec_max);
+ PN(se->statistics.slice_max);
+ PN(se->statistics.wait_max);
+ PN(se->statistics.wait_sum);
+ P(se->statistics.wait_count);
+ }
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
@@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
(long long)(p->nvcsw + p->nivcsw),
p->prio);
#ifdef CONFIG_SCHEDSTATS
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.statistics.wait_sum),
- SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+ if (schedstat_enabled()) {
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ SPLIT_NS(p->se.statistics.wait_sum),
+ SPLIT_NS(p->se.sum_exec_runtime),
+ SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+ }
#else
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
0LL, 0L,
@@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
{
+ struct dl_bw *dl_bw;
+
SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+#ifdef CONFIG_SMP
+ dl_bw = &cpu_rq(cpu)->rd->dl_bw;
+#else
+ dl_bw = &dl_rq->dl_bw;
+#endif
+ SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
+ SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
}
extern __read_mostly int sched_clock_running;
@@ -313,17 +630,18 @@ do { \
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
- P(yld_count);
-
- P(sched_count);
- P(sched_goidle);
#ifdef CONFIG_SMP
P64(avg_idle);
P64(max_idle_balance_cost);
#endif
- P(ttwu_count);
- P(ttwu_local);
+ if (schedstat_enabled()) {
+ P(yld_count);
+ P(sched_count);
+ P(sched_goidle);
+ P(ttwu_count);
+ P(ttwu_local);
+ }
#undef P
#undef P64
@@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
- PN(se.statistics.sum_sleep_runtime);
- PN(se.statistics.wait_start);
- PN(se.statistics.sleep_start);
- PN(se.statistics.block_start);
- PN(se.statistics.sleep_max);
- PN(se.statistics.block_max);
- PN(se.statistics.exec_max);
- PN(se.statistics.slice_max);
- PN(se.statistics.wait_max);
- PN(se.statistics.wait_sum);
- P(se.statistics.wait_count);
- PN(se.statistics.iowait_sum);
- P(se.statistics.iowait_count);
P(se.nr_migrations);
- P(se.statistics.nr_migrations_cold);
- P(se.statistics.nr_failed_migrations_affine);
- P(se.statistics.nr_failed_migrations_running);
- P(se.statistics.nr_failed_migrations_hot);
- P(se.statistics.nr_forced_migrations);
- P(se.statistics.nr_wakeups);
- P(se.statistics.nr_wakeups_sync);
- P(se.statistics.nr_wakeups_migrate);
- P(se.statistics.nr_wakeups_local);
- P(se.statistics.nr_wakeups_remote);
- P(se.statistics.nr_wakeups_affine);
- P(se.statistics.nr_wakeups_affine_attempts);
- P(se.statistics.nr_wakeups_passive);
- P(se.statistics.nr_wakeups_idle);
- {
+ if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
+ PN(se.statistics.sum_sleep_runtime);
+ PN(se.statistics.wait_start);
+ PN(se.statistics.sleep_start);
+ PN(se.statistics.block_start);
+ PN(se.statistics.sleep_max);
+ PN(se.statistics.block_max);
+ PN(se.statistics.exec_max);
+ PN(se.statistics.slice_max);
+ PN(se.statistics.wait_max);
+ PN(se.statistics.wait_sum);
+ P(se.statistics.wait_count);
+ PN(se.statistics.iowait_sum);
+ P(se.statistics.iowait_count);
+ P(se.statistics.nr_migrations_cold);
+ P(se.statistics.nr_failed_migrations_affine);
+ P(se.statistics.nr_failed_migrations_running);
+ P(se.statistics.nr_failed_migrations_hot);
+ P(se.statistics.nr_forced_migrations);
+ P(se.statistics.nr_wakeups);
+ P(se.statistics.nr_wakeups_sync);
+ P(se.statistics.nr_wakeups_migrate);
+ P(se.statistics.nr_wakeups_local);
+ P(se.statistics.nr_wakeups_remote);
+ P(se.statistics.nr_wakeups_affine);
+ P(se.statistics.nr_wakeups_affine_attempts);
+ P(se.statistics.nr_wakeups_passive);
+ P(se.statistics.nr_wakeups_idle);
+
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
avg_atom = div64_ul(avg_atom, nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 82e905862..ac7fb39c3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,8 +20,8 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-#include <linux/latencytop.h>
#include <linux/sched.h>
+#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
@@ -763,16 +763,52 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}
+#ifdef CONFIG_SCHEDSTATS
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+ u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > se->statistics.wait_start))
+ wait_start -= se->statistics.wait_start;
+
+ se->statistics.wait_start = wait_start;
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta;
+
+ delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ se->statistics.wait_start = delta;
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ se->statistics.wait_max = max(se->statistics.wait_max, delta);
+ se->statistics.wait_count++;
+ se->statistics.wait_sum += delta;
+ se->statistics.wait_start = 0;
}
/*
* Task is being enqueued - update stats:
*/
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Are we enqueueing a waiting task? (for current tasks
@@ -782,25 +818,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start(cfs_rq, se);
}
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Mark the end of the wait period if dequeueing a
@@ -808,8 +827,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
+
+ if (flags & DEQUEUE_SLEEP) {
+ if (entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+ }
+ }
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+}
+#endif
+
/*
* We are picking a new current task - update its stats:
*/
@@ -905,10 +957,11 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
+ int active_nodes;
struct rcu_head rcu;
- nodemask_t active_nodes;
unsigned long total_faults;
+ unsigned long max_faults_cpu;
/*
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
@@ -967,6 +1020,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+ return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
int maxdist, bool task)
@@ -1116,27 +1181,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
return true;
/*
- * Do not migrate if the destination is not a node that
- * is actively used by this numa group.
+ * Destination node is much more heavily used than the source
+ * node? Allow migration.
*/
- if (!node_isset(dst_nid, ng->active_nodes))
- return false;
-
- /*
- * Source is a node that is not actively used by this
- * numa group, while the destination is. Migrate.
- */
- if (!node_isset(src_nid, ng->active_nodes))
+ if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+ ACTIVE_NODE_FRACTION)
return true;
/*
- * Both source and destination are nodes in active
- * use by this numa group. Maximize memory bandwidth
- * by migrating from more heavily used groups, to less
- * heavily used ones, spreading the load around.
- * Use a 1/4 hysteresis to avoid spurious page movement.
+ * Distribute memory according to CPU & memory use on each node,
+ * with 3/4 hysteresis to avoid unnecessary memory migrations:
+ *
+ * faults_cpu(dst) 3 faults_cpu(src)
+ * --------------- * - > ---------------
+ * faults_mem(dst) 4 faults_mem(src)
*/
- return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+ return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+ group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
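The inequality in the comment is evaluated by cross-multiplying, avoiding integer division and any divide-by-zero guard. A standalone sketch of the same 3/4 hysteresis check, with invented fault counts:

	#include <stdbool.h>
	#include <stdio.h>

	/* faults_cpu(dst)/faults_mem(dst) * 3/4 > faults_cpu(src)/faults_mem(src) */
	static bool prefer_dst(unsigned long cpu_dst, unsigned long mem_dst,
			       unsigned long cpu_src, unsigned long mem_src)
	{
		return cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4;
	}

	int main(void)
	{
		/* dst: 400 CPU vs 100 mem faults (ratio 4); src: 50 vs 200 (0.25). */
		printf("migrate: %d\n", prefer_dst(400, 100, 50, 200));	/* 1 */
		return 0;
	}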
static unsigned long weighted_cpuload(const int cpu);
@@ -1218,8 +1279,6 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1287,20 +1346,30 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
+ bool assigned = false;
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
- * No need to move the exiting task, and this ensures that ->curr
- * wasn't reaped and thus get_task_struct() in task_numa_assign()
- * is safe under RCU read lock.
- * Note that rcu_read_lock() itself can't protect from the final
- * put_task_struct() after the last schedule().
+ * No need to move the exiting task or idle task.
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ else {
+ /*
+ * The task_struct must be protected here to protect the
+ * p->numa_faults access in the task_weight since the
+ * numa_faults could already be freed in the following path:
+ * finish_task_switch()
+ * --> put_task_struct()
+ * --> __put_task_struct()
+ * --> task_numa_free()
+ */
+ get_task_struct(cur);
+ }
+
raw_spin_unlock_irq(&dst_rq->lock);
/*
@@ -1384,6 +1453,7 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
+ put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1409,9 +1479,16 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
+ assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
+ /*
+ * The dst_rq->curr isn't assigned. The protection for task_struct is
+ * finished.
+ */
+ if (cur && !assigned)
+ put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -1466,7 +1543,7 @@ static int task_numa_migrate(struct task_struct *p)
.best_task = NULL,
.best_imp = 0,
- .best_cpu = -1
+ .best_cpu = -1,
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
@@ -1518,8 +1595,7 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
- if (env.best_cpu == -1 || (p->numa_group &&
- nodes_weight(p->numa_group->active_nodes) > 1)) {
+ if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -1554,12 +1630,14 @@ static int task_numa_migrate(struct task_struct *p)
* trying for a better one later. Do not set the preferred node here.
*/
if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+
if (env.best_cpu == -1)
nid = env.src_nid;
else
nid = env.dst_nid;
- if (node_isset(nid, p->numa_group->active_nodes))
+ if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
sched_setnuma(p, env.dst_nid);
}
@@ -1609,20 +1687,15 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
*/
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
unsigned long faults, max_faults = 0;
- int nid;
+ int nid, active_nodes = 0;
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
@@ -1632,12 +1705,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
- if (!node_isset(nid, numa_group->active_nodes)) {
- if (faults > max_faults * 6 / 16)
- node_set(nid, numa_group->active_nodes);
- } else if (faults < max_faults * 3 / 16)
- node_clear(nid, numa_group->active_nodes);
+ if (faults * ACTIVE_NODE_FRACTION > max_faults)
+ active_nodes++;
}
+
+ numa_group->max_faults_cpu = max_faults;
+ numa_group->active_nodes = active_nodes;
}
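A quick worked example of the new counting rule, with invented per-node fault counts; only nodes above one third of the maximum count as active:

	#include <stdio.h>

	#define ACTIVE_NODE_FRACTION 3

	int main(void)
	{
		unsigned long faults[] = { 900, 350, 250, 0 };	/* per-node CPU faults */
		unsigned long max_faults = 900;
		int nid, active_nodes = 0;

		for (nid = 0; nid < 4; nid++)
			if (faults[nid] * ACTIVE_NODE_FRACTION > max_faults)
				active_nodes++;

		/* 900 and 350 qualify (x3 exceeds 900); 250 and 0 do not. */
		printf("active_nodes = %d\n", active_nodes);	/* 2 */
		return 0;
	}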
/*
@@ -1928,7 +2001,7 @@ static void task_numa_placement(struct task_struct *p)
update_task_scan_period(p, fault_types[0], fault_types[1]);
if (p->numa_group) {
- update_numa_active_node_mask(p->numa_group);
+ numa_group_count_active_nodes(p->numa_group);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_group_nid);
}
@@ -1972,14 +2045,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
return;
atomic_set(&grp->refcount, 1);
+ grp->active_nodes = 1;
+ grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
nr_node_ids;
- node_set(task_node(current), grp->active_nodes);
-
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
@@ -2093,6 +2166,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL);
+ struct numa_group *ng;
int priv;
if (!static_branch_likely(&sched_numa_balancing))
@@ -2133,9 +2207,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
- if (!priv && !local && p->numa_group &&
- node_isset(cpu_node, p->numa_group->active_nodes) &&
- node_isset(mem_node, p->numa_group->active_nodes))
+ ng = p->numa_group;
+ if (!priv && !local && ng && ng->active_nodes > 1 &&
+ numa_is_active_node(cpu_node, ng) &&
+ numa_is_active_node(mem_node, ng))
local = 1;
task_numa_placement(p);
@@ -2180,6 +2255,7 @@ void task_numa_work(struct callback_head *work)
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ u64 runtime = p->se.sum_exec_runtime;
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2302,6 +2378,17 @@ out:
else
reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);
+
+ /*
+ * Make sure tasks use at least 32x as much time to run other code
+ * than they used here, to limit NUMA PTE scanning overhead to 3% max.
+ * Usually update_task_scan_period slows down scanning enough; on an
+ * overloaded system we need to limit overhead on a per task basis.
+ */
+ if (unlikely(p->se.sum_exec_runtime != runtime)) {
+ u64 diff = p->se.sum_exec_runtime - runtime;
+ p->node_stamp += 32 * diff;
+ }
}
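Pushing node_stamp forward by 32x the runtime consumed here means the task must accumulate 32 units of other work per unit of scanning, bounding the overhead at 1/33, roughly 3%. A back-of-the-envelope sketch:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long diff = 2000000ULL;	/* 2ms spent in task_numa_work() */
		unsigned long long defer = 32 * diff;	/* 64ms of runtime before the next scan */

		printf("overhead <= %.1f%%\n",
		       100.0 * diff / (double)(diff + defer));	/* ~3.0% */
		return 0;
	}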
/*
@@ -2695,12 +2782,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+
+ /*
+	 * We are supposed to update the task to "current" time, so that it is
+	 * up to date and ready to go to the new CPU/cfs_rq. But we have
+	 * difficulty in getting what the current time is, so simply throw away
+	 * the out-of-date time. This will result in the wakee task being less
+	 * decayed, but giving the wakee more load does not sound bad.
+ */
+ if (se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
+ se->avg.last_update_time = n_last_update_time;
+ }
+}
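The 32-bit retry loop above only works if the updater publishes the 64-bit timestamp before its copy, with a write barrier in between. A sketch of that assumed writer-side ordering (the actual update path is not part of this hunk):

	static inline void publish_last_update_time(struct cfs_rq *cfs_rq, u64 now)
	{
		cfs_rq->avg.last_update_time = now;
		smp_wmb();	/* pairs with the smp_rmb() in the reader loop */
		cfs_rq->load_last_update_time_copy = now;
	}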
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2834,48 +2973,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
-
#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
u64 last_update_time_copy;
+ u64 last_update_time;
do {
last_update_time_copy = cfs_rq->load_last_update_time_copy;
smp_rmb();
last_update_time = cfs_rq->avg.last_update_time;
} while (last_update_time != last_update_time_copy);
-#else
- last_update_time = cfs_rq->avg.last_update_time;
-#endif
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+ return last_update_time;
}
-
-/*
- * Update the rq's load with the elapsed running time before entering
- * idle. if the last scheduled task is not a CFS task, idle_enter will
- * be the only way to update the runnable statistic.
- */
-void idle_enter_fair(struct rq *this_rq)
+#else
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
+ return cfs_rq->avg.last_update_time;
}
+#endif
/*
- * Update the rq's load with the elapsed idle time before a task is
- * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
- * be the only way to update the runnable statistic.
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
*/
-void idle_exit_fair(struct rq *this_rq)
+void remove_entity_load_avg(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ /*
+ * Newly created task or never used group entity should not be removed
+ * from its (source) cfs_rq
+ */
+ if (se->avg.last_update_time == 0)
+ return;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
+ atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -3020,6 +3159,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+ if (schedstat_enabled())
+ return;
+
+ /* Force schedstat enabled if a dependent tracepoint is active */
+ if (trace_sched_stat_wait_enabled() ||
+ trace_sched_stat_sleep_enabled() ||
+ trace_sched_stat_iowait_enabled() ||
+ trace_sched_stat_blocked_enabled() ||
+ trace_sched_stat_runtime_enabled()) {
+		pr_warn_once("Scheduler tracepoints stat_wait, stat_sleep, "
+			     "stat_iowait, stat_blocked and stat_runtime "
+			     "require the kernel parameter schedstats=enable "
+			     "or kernel.sched_schedstats=1\n");
+ }
+#endif
+}
+
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -3040,11 +3199,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
- enqueue_sleeper(cfs_rq, se);
+ if (schedstat_enabled())
+ enqueue_sleeper(cfs_rq, se);
}
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
+ check_schedstat_required();
+ if (schedstat_enabled()) {
+ update_stats_enqueue(cfs_rq, se);
+ check_spread(cfs_rq, se);
+ }
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -3111,19 +3274,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);
- update_stats_dequeue(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
-
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- }
-#endif
- }
+ if (schedstat_enabled())
+ update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
@@ -3197,7 +3349,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ if (schedstat_enabled())
+ update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, 1);
}
@@ -3210,7 +3363,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
- if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
@@ -3293,9 +3446,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
+ if (schedstat_enabled()) {
+ check_spread(cfs_rq, prev);
+ if (prev->on_rq)
+ update_stats_wait_start(cfs_rq, prev);
+ }
+
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -4265,42 +4422,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ * The exact cpuload calculated at every tick would be:
+ *
+ * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
*
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ * If a cpu misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when cpu may be busy, then we have:
+ *
+ * load_n = (1 - 1/2^i)^n * load_0
+ * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
*
* decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * load' = (1 - 1/2^i)^n * load
+ *
+ * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
+ * This allows us to precompute the above in said factors, thereby allowing the
+ * reduction of an arbitrary n in O(log_2 n) steps. (See also
+ * fixed_power_int())
*
* The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
*/
#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
+
+static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 64, 32, 8, 0, 0, 0, 0, 0 },
+ { 96, 72, 40, 12, 1, 0, 0, 0 },
+ { 112, 98, 75, 43, 15, 1, 0, 0 },
+ { 120, 112, 98, 76, 45, 16, 2, 0 }
+};
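With both tables normalized to eight columns, decaying over n missed ticks walks the set bits of n, one column per power-of-two factor. A userspace sketch of that lookup using the idx-1 row (half-life of one tick), mirroring decay_load_missed():

	#include <stdio.h>

	#define DEGRADE_SHIFT 7

	/* Row for idx 1: 128 * (1/2)^(2^j), j = 0..7. */
	static const unsigned char factor[DEGRADE_SHIFT + 1] =
		{ 64, 32, 8, 0, 0, 0, 0, 0 };

	static unsigned long decay(unsigned long load, unsigned long missed)
	{
		int j = 0;

		while (missed) {
			if (missed & 1)
				load = (load * factor[j]) >> DEGRADE_SHIFT;
			missed >>= 1;
			j++;
		}
		return load;
	}

	int main(void)
	{
		/* 3 missed ticks at idx 1: 1024 * (1/2)^3 = 128. */
		printf("%lu\n", decay(1024, 3));
		return 0;
	}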
/*
* Update cpu_load for any missed ticks, due to tickless idle. The backlog
@@ -4331,14 +4483,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
return load;
}
-/*
+/**
+ * __update_cpu_load - update the rq->cpu_load[] statistics
+ * @this_rq: The rq to update statistics for
+ * @this_load: The current load
+ * @pending_updates: The number of missed updates
+ * @active: !0 for NOHZ_FULL
+ *
* Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
+ * scheduler tick (TICK_NSEC).
+ *
+ * This function computes a decaying average:
+ *
+ * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
+ *
+ * Because of NOHZ it might not get called on every tick which gives need for
+ * the @pending_updates argument.
+ *
+ * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
+ * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
+ * = A * (A * load[i]_n-2 + B) + B
+ * = A * (A * (A * load[i]_n-3 + B) + B) + B
+ * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
+ * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
+ * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
+ * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
+ *
+ * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
+ * any change in load would have resulted in the tick being turned back on.
+ *
+ * For regular NOHZ, this reduces to:
+ *
+ * load[i]_n = (1 - 1/2^i)^n * load[i]_0
+ *
+ * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
+ * term. See the @active parameter.
*/
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
+ unsigned long pending_updates, int active)
{
+ unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
int i, scale;
this_rq->nr_load_updates++;
@@ -4352,6 +4536,15 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
old_load = this_rq->cpu_load[i];
old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ if (tickless_load) {
+ old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
+ /*
+ * old_load can never be a negative value because a
+ * decayed tickless_load cannot be greater than the
+ * original tickless_load.
+ */
+ old_load += tickless_load;
+ }
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -4374,6 +4567,25 @@ static unsigned long weighted_cpuload(const int cpu)
}
#ifdef CONFIG_NO_HZ_COMMON
+static void __update_cpu_load_nohz(struct rq *this_rq,
+ unsigned long curr_jiffies,
+ unsigned long load,
+ int active)
+{
+ unsigned long pending_updates;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * In the regular NOHZ case, we were idle, this means load 0.
+ * In the NOHZ_FULL case, we were non-idle, we should consider
+ * its weighted load.
+ */
+ __update_cpu_load(this_rq, load, pending_updates, active);
+ }
+}
+
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -4391,46 +4603,31 @@ static unsigned long weighted_cpuload(const int cpu)
* Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance.
*/
-static void update_idle_cpu_load(struct rq *this_rq)
+static void update_cpu_load_idle(struct rq *this_rq)
{
- unsigned long curr_jiffies = READ_ONCE(jiffies);
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
- unsigned long pending_updates;
-
/*
* bail if there's load or we're actually up-to-date.
*/
- if (load || curr_jiffies == this_rq->last_load_update_tick)
+ if (weighted_cpuload(cpu_of(this_rq)))
return;
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- __update_cpu_load(this_rq, load, pending_updates);
+ __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
}
/*
* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
*/
-void update_cpu_load_nohz(void)
+void update_cpu_load_nohz(int active)
{
struct rq *this_rq = this_rq();
unsigned long curr_jiffies = READ_ONCE(jiffies);
- unsigned long pending_updates;
+ unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
if (curr_jiffies == this_rq->last_load_update_tick)
return;
raw_spin_lock(&this_rq->lock);
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
- */
- __update_cpu_load(this_rq, 0, pending_updates);
- }
+ __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
raw_spin_unlock(&this_rq->lock);
}
#endif /* CONFIG_NO_HZ */
@@ -4442,10 +4639,10 @@ void update_cpu_load_active(struct rq *this_rq)
{
unsigned long load = weighted_cpuload(cpu_of(this_rq));
/*
- * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1);
+ __update_cpu_load(this_rq, load, 1, 1);
}
/*
@@ -5032,8 +5229,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. However, the caller only guarantees p->pi_lock is held; no
- * other assumptions, including the state of rq->lock, should be made.
+ * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p)
{
@@ -5746,8 +5942,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
- deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, 0);
set_task_cpu(p, env->dst_cpu);
}
@@ -5880,8 +6076,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -6327,7 +6523,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
bool *overload)
{
unsigned long load;
- int i;
+ int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
@@ -6344,7 +6540,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
- if (rq->nr_running > 1)
+ nr_running = rq->nr_running;
+ if (nr_running > 1)
*overload = true;
#ifdef CONFIG_NUMA_BALANCING
@@ -6352,7 +6549,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
- if (idle_cpu(i))
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
}
@@ -7273,8 +7473,6 @@ static int idle_balance(struct rq *this_rq)
int pulled_task = 0;
u64 curr_cost = 0;
- idle_enter_fair(this_rq);
-
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -7355,10 +7553,8 @@ out:
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
- if (pulled_task) {
- idle_exit_fair(this_rq);
+ if (pulled_task)
this_rq->idle_stamp = 0;
- }
return pulled_task;
}
@@ -7737,7 +7933,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- update_idle_cpu_load(rq);
+ update_cpu_load_idle(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE);
}
@@ -8123,11 +8319,8 @@ void free_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
- if (tg->se) {
- if (tg->se[i])
- remove_entity_load_avg(tg->se[i]);
+ if (tg->se)
kfree(tg->se[i]);
- }
}
kfree(tg->cfs_rq);
@@ -8175,21 +8368,29 @@ err:
return 0;
}
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
{
- struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ struct rq *rq;
+ int cpu;
- /*
- * Only empty task groups can be destroyed; so we can speculatively
- * check on_list without danger of it being re-added.
- */
- if (!tg->cfs_rq[cpu]->on_list)
- return;
+ for_each_possible_cpu(cpu) {
+ if (tg->se[cpu])
+ remove_entity_load_avg(tg->se[cpu]);
- raw_spin_lock_irqsave(&rq->lock, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ continue;
+
+ rq = cpu_rq(cpu);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
}
void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8271,7 +8472,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 4a2ef5a02..544a7133c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -97,12 +97,6 @@ void default_idle_call(void)
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
int next_state)
{
- /* Fall back to the default arch idle method on errors. */
- if (next_state < 0) {
- default_idle_call();
- return next_state;
- }
-
/*
* The idle task must be scheduled, it is pointless to go to idle, just
* update no idle residency and return.
@@ -168,7 +162,7 @@ static void cpuidle_idle_call(void)
*/
if (idle_should_freeze()) {
entered_state = cpuidle_enter_freeze(drv, dev);
- if (entered_state >= 0) {
+ if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
}
@@ -219,6 +213,7 @@ static void cpu_idle_loop(void)
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c4ae0f1fd..47ce94931 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
- idle_exit_fair(rq);
rq_last_tick_reset(rq);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86abe0..a774b4dbf 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
- hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+ /*
+	 * SCHED_DEADLINE updates the bandwidth, as a runaway
+ * RT task with a DL task could hog a CPU. But DL does
+ * not reset the period. If a deadline task was running
+ * without an RT task running, it can cause RT tasks to
+ * throttle when they start up. Kick the timer right away
+ * to update the period.
+ */
+ hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
}
raw_spin_unlock(&rt_b->rt_runtime_lock);
@@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
- return !list_empty(&rt_se->run_list);
+ return rt_se->on_rq;
}
#ifdef CONFIG_RT_GROUP_SCHED
@@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->my_q;
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
@@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (!rt_se)
enqueue_top_rt_rq(rt_rq);
else if (!on_rt_rq(rt_se))
- enqueue_rt_entity(rt_se, false);
+ enqueue_rt_entity(rt_se, 0);
if (rt_rq->highest_prio.curr < curr->prio)
resched_curr(rq);
@@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
if (!rt_se)
dequeue_top_rt_rq(rt_rq);
else if (on_rt_rq(rt_se))
- dequeue_rt_entity(rt_se);
+ dequeue_rt_entity(rt_se, 0);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1166,7 +1174,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
dec_rt_group(rt_se, rt_rq);
}
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ return false;
+
+ return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+ list_del_init(&rt_se->run_list);
+
+ if (list_empty(array->queue + rt_se_prio(rt_se)))
+ __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+ rt_se->on_list = 0;
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
@@ -1179,26 +1210,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
* get throttled and the current group doesn't have any other
* active members.
*/
- if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+ if (rt_se->on_list)
+ __delist_rt_entity(rt_se, array);
return;
+ }
- if (head)
- list_add(&rt_se->run_list, queue);
- else
- list_add_tail(&rt_se->run_list, queue);
- __set_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(rt_se->on_list);
+ if (flags & ENQUEUE_HEAD)
+ list_add(&rt_se->run_list, queue);
+ else
+ list_add_tail(&rt_se->run_list, queue);
+
+ __set_bit(rt_se_prio(rt_se), array->bitmap);
+ rt_se->on_list = 1;
+ }
+ rt_se->on_rq = 1;
inc_rt_tasks(rt_se, rt_rq);
}
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
- list_del_init(&rt_se->run_list);
- if (list_empty(array->queue + rt_se_prio(rt_se)))
- __clear_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(!rt_se->on_list);
+ __delist_rt_entity(rt_se, array);
+ }
+ rt_se->on_rq = 0;
dec_rt_tasks(rt_se, rt_rq);
}
@@ -1207,7 +1249,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down.
*/
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
@@ -1220,31 +1262,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
- __dequeue_rt_entity(rt_se);
+ __dequeue_rt_entity(rt_se, flags);
}
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
- __enqueue_rt_entity(rt_se, head);
+ __enqueue_rt_entity(rt_se, flags);
enqueue_top_rt_rq(&rq->rt);
}
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running)
- __enqueue_rt_entity(rt_se, false);
+ __enqueue_rt_entity(rt_se, flags);
}
enqueue_top_rt_rq(&rq->rt);
}
@@ -1260,7 +1302,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
- enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+ enqueue_rt_entity(rt_se, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -1271,7 +1313,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- dequeue_rt_entity(rt_se);
+ dequeue_rt_entity(rt_se, flags);
dequeue_pushable_task(rq, p);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b242775bf..ef5875fff 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,6 +3,7 @@
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
+#include <linux/binfmts.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
@@ -248,7 +249,12 @@ struct task_group {
unsigned long shares;
#ifdef CONFIG_SMP
- atomic_long_t load_avg;
+ /*
+ * load_avg can be heavily contended at clock tick time, so put
+ * it in its own cacheline separated from the fields above which
+ * will also be accessed at each tick.
+ */
+ atomic_long_t load_avg ____cacheline_aligned;
#endif
#endif
@@ -308,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data);
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
-extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
-extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
-#endif
+
+#ifdef CONFIG_SMP
+extern void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next);
+#else /* !CONFIG_SMP */
+static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#else /* CONFIG_CGROUP_SCHED */
@@ -896,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
#else
static inline void sched_ttwu_pending(void) { }
@@ -933,6 +958,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
#endif
@@ -1008,6 +1034,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
static inline u64 global_rt_period(void)
{
@@ -1076,7 +1103,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
- * Pairs with the control dependency and rmb in try_to_wake_up().
+ * Pairs with the smp_cond_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
@@ -1113,59 +1140,43 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#define WEIGHT_IDLEPRIO 3
#define WMULT_IDLEPRIO 1431655765
-/*
- * Nice levels are multiplicative, with a gentle 10% change for every
- * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- * nice 1, it will get ~10% less CPU time than another CPU-bound task
- * that remained on nice 0.
- *
- * The "10% effect" is relative and cumulative: from _any_ nice level,
- * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- * If a task goes up by ~10% and another task goes down by ~10% then
- * the relative distance between them is ~25%.)
- */
-static const int prio_to_weight[40] = {
- /* -20 */ 88761, 71755, 56483, 46273, 36291,
- /* -15 */ 29154, 23254, 18705, 14949, 11916,
- /* -10 */ 9548, 7620, 6100, 4904, 3906,
- /* -5 */ 3121, 2501, 1991, 1586, 1277,
- /* 0 */ 1024, 820, 655, 526, 423,
- /* 5 */ 335, 272, 215, 172, 137,
- /* 10 */ 110, 87, 70, 56, 45,
- /* 15 */ 36, 29, 23, 18, 15,
-};
+extern const int sched_prio_to_weight[40];
+extern const u32 sched_prio_to_wmult[40];
/*
- * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_WAKING - sched_class::task_waking was called
*
- * In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
- * into multiplications:
*/
-static const u32 prio_to_wmult[40] = {
- /* -20 */ 48388, 59856, 76040, 92818, 118348,
- /* -15 */ 147320, 184698, 229616, 287308, 360437,
- /* -10 */ 449829, 563644, 704093, 875809, 1099582,
- /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
- /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
- /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
- /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
- /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
-};
+
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
#define ENQUEUE_WAKEUP 0x01
-#define ENQUEUE_HEAD 0x02
+#define ENQUEUE_RESTORE 0x02
+#define ENQUEUE_MOVE 0x04
+
+#define ENQUEUE_HEAD 0x08
+#define ENQUEUE_REPLENISH 0x10
#ifdef CONFIG_SMP
-#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING 0x20
#else
#define ENQUEUE_WAKING 0x00
#endif
-#define ENQUEUE_REPLENISH 0x08
-#define ENQUEUE_RESTORE 0x10
-
-#define DEQUEUE_SLEEP 0x01
-#define DEQUEUE_SAVE 0x02
#define RETRY_TASK ((void *)-1UL)
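A hedged sketch of the caller idiom the SAVE/RESTORE/MOVE pairs serve, modeled on attribute-change paths such as __sched_setscheduler(); the surrounding locking and the helper usage here are illustrative, not the exact kernel code:

	static void change_task_attr(struct rq *rq, struct task_struct *p)
	{
		/* DEQUEUE_SAVE == ENQUEUE_RESTORE, DEQUEUE_MOVE == ENQUEUE_MOVE */
		int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
		bool queued = task_on_rq_queued(p);

		if (queued)
			dequeue_task(rq, p, queue_flags);

		/* ... modify p's scheduling attributes here ... */

		if (queued)
			enqueue_task(rq, p, queue_flags);
	}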
@@ -1252,16 +1263,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
-extern void idle_enter_fair(struct rq *this_rq);
-extern void idle_exit_fair(struct rq *this_rq);
-
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
-#else
-
-static inline void idle_enter_fair(struct rq *rq) { }
-static inline void idle_exit_fair(struct rq *rq) { }
-
#endif
#ifdef CONFIG_CPU_IDLE
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b0fbc7632..70b3b6a20 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
-# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val) do { var = (val); } while (0)
+# define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
+# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
+# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
+# define schedstat_enabled() 0
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0)
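The schedstat_enabled() test compiles to a patched jump rather than a load-and-branch, so the disabled case costs nothing in the hot path. A minimal sketch of the same static-key pattern; the key and counter names are illustrative:

	#include <linux/jump_label.h>

	DEFINE_STATIC_KEY_FALSE(demo_stats);

	static unsigned long demo_counter;

	static inline void demo_inc(void)
	{
		if (static_branch_unlikely(&demo_stats))	/* patched jump, no load */
			demo_counter++;
	}

	static void demo_toggle(int enable)
	{
		if (enable)
			static_branch_enable(&demo_stats);
		else
			static_branch_disable(&demo_stats);
	}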
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
new file mode 100644
index 000000000..82f0dff90
--- /dev/null
+++ b/kernel/sched/swait.c
@@ -0,0 +1,123 @@
+#include <linux/sched.h>
+#include <linux/swait.h>
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+ struct lock_class_key *key)
+{
+ raw_spin_lock_init(&q->lock);
+ lockdep_set_class_and_name(&q->lock, key, name);
+ INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The thing about the wake_up_state() return value: I think we can ignore it.
+ *
+ * If for some reason it would return 0, that means the previously waiting
+ * task is already running, so it will observe condition true (or has already).
+ */
+void swake_up_locked(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+
+ if (list_empty(&q->task_list))
+ return;
+
+ curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+ wake_up_process(curr->task);
+ list_del_init(&curr->task_list);
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+void swake_up(struct swait_queue_head *q)
+{
+ unsigned long flags;
+
+ if (!swait_active(q))
+ return;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ swake_up_locked(q);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up);
+
+/*
+ * Does not allow usage from IRQ-disabled context, since we must be able to
+ * release IRQs to guarantee bounded hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+ LIST_HEAD(tmp);
+
+ if (!swait_active(q))
+ return;
+
+ raw_spin_lock_irq(&q->lock);
+ list_splice_init(&q->task_list, &tmp);
+ while (!list_empty(&tmp)) {
+ curr = list_first_entry(&tmp, typeof(*curr), task_list);
+
+ wake_up_state(curr->task, TASK_NORMAL);
+ list_del_init(&curr->task_list);
+
+ if (list_empty(&tmp))
+ break;
+
+ raw_spin_unlock_irq(&q->lock);
+ raw_spin_lock_irq(&q->lock);
+ }
+ raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ wait->task = current;
+ if (list_empty(&wait->task_list))
+ list_add(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ __prepare_to_swait(q, wait);
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ if (signal_pending_state(state, current))
+ return -ERESTARTSYS;
+
+ prepare_to_swait(q, wait, state);
+
+ return 0;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ __set_current_state(TASK_RUNNING);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+
+ if (!list_empty_careful(&wait->task_list)) {
+ raw_spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_swait);
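A hedged usage sketch of the new API (caller code, not part of this file): the prepare/finish pair brackets a condition-recheck loop, exactly like classic waitqueues but backed by a raw spinlock:

static DECLARE_SWAIT_QUEUE_HEAD(my_swait);

static void wait_for_flag(bool *flag)
{
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		prepare_to_swait(&my_swait, &wait, TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(*flag))	/* recheck after queueing */
			break;
		schedule();
	}
	finish_swait(&my_swait, &wait);
}

/* A waker would then do: WRITE_ONCE(*flag, true); swake_up(&my_swait); */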
diff --git a/kernel/signal.c b/kernel/signal.c
index f3f1f7a97..0508544c8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set)
current->saved_sigmask = current->blocked;
set_current_blocked(set);
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ while (!signal_pending(current)) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
set_restore_sigmask();
return -ERESTARTNOHAND;
}
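The loop matters because schedule() can return without a signal pending (a spurious wakeup); without the recheck, sigsuspend() could return to userspace having delivered nothing. The userspace contract being preserved, as a hedged illustration:

#include <signal.h>

/* Block everything except SIGUSR1 and sleep until it is delivered;
 * sigsuspend() must not return before the handler has run (assumes a
 * SIGUSR1 handler is installed).
 */
static void wait_for_sigusr1(void)
{
	sigset_t mask;

	sigfillset(&mask);
	sigdelset(&mask, SIGUSR1);
	sigsuspend(&mask);
}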
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 28c8e736d..d264f59bf 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -174,7 +174,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
if (tsk)
return 0;
- td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu));
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
if (!td)
return -ENOMEM;
td->cpu = cpu;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 479e4436f..8aae49dd7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
if (preempt_count() == cnt) {
#ifdef CONFIG_DEBUG_PREEMPT
- current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+ current->preempt_disable_ip = get_lock_parent_ip();
#endif
- trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
}
}
EXPORT_SYMBOL(__local_bh_disable_ip);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a3bbaee77..a467e6c28 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -28,7 +28,6 @@
*/
struct cpu_stop_done {
atomic_t nr_todo; /* nr left to execute */
- bool executed; /* actually executed? */
int ret; /* collected return value */
struct completion completion; /* fired if nr_todo reaches 0 */
};
@@ -63,14 +62,10 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
}
/* signal completion unless @done is NULL */
-static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
+static void cpu_stop_signal_done(struct cpu_stop_done *done)
{
- if (done) {
- if (executed)
- done->executed = true;
- if (atomic_dec_and_test(&done->nr_todo))
- complete(&done->completion);
- }
+ if (atomic_dec_and_test(&done->nr_todo))
+ complete(&done->completion);
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
@@ -81,17 +76,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
}
/* queue @work to @stopper. if offline, @work is completed immediately */
-static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
unsigned long flags;
+ bool enabled;
spin_lock_irqsave(&stopper->lock, flags);
- if (stopper->enabled)
+ enabled = stopper->enabled;
+ if (enabled)
__cpu_stop_queue_work(stopper, work);
- else
- cpu_stop_signal_done(work->done, false);
+ else if (work->done)
+ cpu_stop_signal_done(work->done);
spin_unlock_irqrestore(&stopper->lock, flags);
+
+ return enabled;
}
/**
@@ -124,9 +123,10 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
cpu_stop_init_done(&done, 1);
- cpu_stop_queue_work(cpu, &work);
+ if (!cpu_stop_queue_work(cpu, &work))
+ return -ENOENT;
wait_for_completion(&done.completion);
- return done.executed ? done.ret : -ENOENT;
+ return done.ret;
}
/* This controls the threads on each CPU. */
@@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
struct cpu_stop_work work1, work2;
struct multi_stop_data msdata;
- preempt_disable();
msdata = (struct multi_stop_data){
.fn = fn,
.data = arg,
@@ -277,16 +276,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
if (cpu1 > cpu2)
swap(cpu1, cpu2);
- if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
- preempt_enable();
+ if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2))
return -ENOENT;
- }
-
- preempt_enable();
wait_for_completion(&done.completion);
-
- return done.executed ? done.ret : -ENOENT;
+ return done.ret;
}
/**
@@ -302,23 +296,28 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
*
* CONTEXT:
* Don't care.
+ *
+ * RETURNS:
+ * true if cpu_stop_work was queued successfully and @fn will be called,
+ * false otherwise.
*/
-void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
- cpu_stop_queue_work(cpu, work_buf);
+ return cpu_stop_queue_work(cpu, work_buf);
}
/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
-static void queue_stop_cpus_work(const struct cpumask *cpumask,
+static bool queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
struct cpu_stop_done *done)
{
struct cpu_stop_work *work;
unsigned int cpu;
+ bool queued = false;
/*
* Disable preemption while queueing to avoid getting
@@ -331,9 +330,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
work->fn = fn;
work->arg = arg;
work->done = done;
- cpu_stop_queue_work(cpu, work);
+ if (cpu_stop_queue_work(cpu, work))
+ queued = true;
}
lg_global_unlock(&stop_cpus_lock);
+
+ return queued;
}
static int __stop_cpus(const struct cpumask *cpumask,
@@ -342,9 +344,10 @@ static int __stop_cpus(const struct cpumask *cpumask,
struct cpu_stop_done done;
cpu_stop_init_done(&done, cpumask_weight(cpumask));
- queue_stop_cpus_work(cpumask, fn, arg, &done);
+ if (!queue_stop_cpus_work(cpumask, fn, arg, &done))
+ return -ENOENT;
wait_for_completion(&done.completion);
- return done.executed ? done.ret : -ENOENT;
+ return done.ret;
}
/**
@@ -432,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct cpu_stop_work *work;
- int ret;
repeat:
work = NULL;
@@ -448,23 +450,19 @@ repeat:
cpu_stop_fn_t fn = work->fn;
void *arg = work->arg;
struct cpu_stop_done *done = work->done;
- char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
-
- /* cpu stop callbacks are not allowed to sleep */
- preempt_disable();
+ int ret;
+ /* cpu stop callbacks must not sleep, make in_atomic() == T */
+ preempt_count_inc();
ret = fn(arg);
- if (ret)
- done->ret = ret;
-
- /* restore preemption and check it's still balanced */
- preempt_enable();
+ if (done) {
+ if (ret)
+ done->ret = ret;
+ cpu_stop_signal_done(done);
+ }
+ preempt_count_dec();
WARN_ONCE(preempt_count(),
- "cpu_stop: %s(%p) leaked preempt count\n",
- kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
- ksym_buf), arg);
-
- cpu_stop_signal_done(done, true);
+ "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg);
goto repeat;
}
}
@@ -531,8 +529,6 @@ static int __init cpu_stop_init(void)
}
early_initcall(cpu_stop_init);
-#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
-
static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
struct multi_stop_data msdata = {
@@ -630,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
mutex_unlock(&stop_cpus_mutex);
return ret ?: done.ret;
}
-
-#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
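With done->executed gone, queueing failure is reported synchronously: cpu_stop_queue_work() returns false when the stopper is offline, and the blocking callers turn that into -ENOENT directly. An illustrative caller of the nowait variant (names hypothetical):

static struct cpu_stop_work my_stop_work;

static int kick_stopper(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
	/* false means the stopper thread is offline and fn will never run */
	if (!stop_one_cpu_nowait(cpu, fn, arg, &my_stop_work))
		return -ENOENT;
	return 0;
}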
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0623787ec..2c5e3a8e0 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid);
cond_syscall(sys_setfsgid);
cond_syscall(sys_capget);
cond_syscall(sys_capset);
+cond_syscall(sys_copy_file_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dc6858d66..f5102fabe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -173,7 +173,7 @@ extern int no_unaligned_warning;
#define SYSCTL_WRITES_WARN 0
#define SYSCTL_WRITES_STRICT 1
-static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
+static int sysctl_writes_strict = SYSCTL_WRITES_STRICT;
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
@@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = {
.data = &latencytop_enabled,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = sysctl_latencytop,
},
#endif
#ifdef CONFIG_BLK_DEV_INITRD
@@ -1568,6 +1579,28 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
{ }
};
@@ -1735,6 +1768,20 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
+ {
+ .procname = "pipe-user-pages-hard",
+ .data = &pipe_user_pages_hard,
+ .maxlen = sizeof(pipe_user_pages_hard),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "pipe-user-pages-soft",
+ .data = &pipe_user_pages_soft,
+ .maxlen = sizeof(pipe_user_pages_soft),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
{ }
};
@@ -2047,9 +2094,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
void *data)
{
int *i, vleft, first = 1, err = 0;
- unsigned long page = 0;
size_t left;
- char *kbuf;
+ char *kbuf = NULL, *p;
if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
*lenp = 0;
@@ -2078,15 +2124,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- err = -EFAULT;
- goto free;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
}
for (; left && vleft--; i++, first=0) {
@@ -2094,11 +2134,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
bool neg;
if (write) {
- left -= proc_skip_spaces(&kbuf);
+ left -= proc_skip_spaces(&p);
if (!left)
break;
- err = proc_get_long(&kbuf, &left, &lval, &neg,
+ err = proc_get_long(&p, &left, &lval, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
if (err)
@@ -2125,10 +2165,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (!write && !first && left && !err)
err = proc_put_char(&buffer, &left, '\n');
if (write && !err && left)
- left -= proc_skip_spaces(&kbuf);
-free:
+ left -= proc_skip_spaces(&p);
if (write) {
- free_page(page);
+ kfree(kbuf);
if (first)
return err ? : -EINVAL;
}
@@ -2310,9 +2349,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
{
unsigned long *i, *min, *max;
int vleft, first = 1, err = 0;
- unsigned long page = 0;
size_t left;
- char *kbuf;
+ char *kbuf = NULL, *p;
if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
*lenp = 0;
@@ -2340,15 +2378,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- err = -EFAULT;
- goto free;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
}
for (; left && vleft--; i++, first = 0) {
@@ -2357,9 +2389,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (write) {
bool neg;
- left -= proc_skip_spaces(&kbuf);
+ left -= proc_skip_spaces(&p);
- err = proc_get_long(&kbuf, &left, &val, &neg,
+ err = proc_get_long(&p, &left, &val, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
if (err)
@@ -2385,10 +2417,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (!write && !first && left && !err)
err = proc_put_char(&buffer, &left, '\n');
if (write && !err)
- left -= proc_skip_spaces(&kbuf);
-free:
+ left -= proc_skip_spaces(&p);
if (write) {
- free_page(page);
+ kfree(kbuf);
if (first)
return err ? : -EINVAL;
}
@@ -2650,34 +2681,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
if (write) {
- unsigned long page = 0;
- char *kbuf;
+ char *kbuf, *p;
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- free_page(page);
- return -EFAULT;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
GFP_KERNEL);
if (!tmp_bitmap) {
- free_page(page);
+ kfree(kbuf);
return -ENOMEM;
}
- proc_skip_char(&kbuf, &left, '\n');
+ proc_skip_char(&p, &left, '\n');
while (!err && left) {
unsigned long val_a, val_b;
bool neg;
- err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
+ err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
sizeof(tr_a), &c);
if (err)
break;
@@ -2688,12 +2712,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
val_b = val_a;
if (left) {
- kbuf++;
+ p++;
left--;
}
if (c == '-') {
- err = proc_get_long(&kbuf, &left, &val_b,
+ err = proc_get_long(&p, &left, &val_b,
&neg, tr_b, sizeof(tr_b),
&c);
if (err)
@@ -2704,16 +2728,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
break;
}
if (left) {
- kbuf++;
+ p++;
left--;
}
}
bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
first = 0;
- proc_skip_char(&kbuf, &left, '\n');
+ proc_skip_char(&p, &left, '\n');
}
- free_page(page);
+ kfree(kbuf);
} else {
unsigned long bit_a, bit_b = 0;
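All three conversions above share one shape: memdup_user_nul() replaces the open-coded allocate/copy/NUL-terminate sequence and returns an ERR_PTR on failure, while a separate cursor p is walked by the proc_* parsers so that kfree() still sees the original allocation. The pattern in miniature (illustrative handler name):

static int my_write_handler(const char __user *buffer, size_t left)
{
	char *kbuf, *p;

	p = kbuf = memdup_user_nul(buffer, left);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	/* ... parse with proc_get_long(&p, &left, ...), which advances p ... */

	kfree(kbuf);		/* must free the original pointer, not p */
	return 0;
}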
diff --git a/kernel/task_work.c b/kernel/task_work.c
index bce3211e7..53fa971d0 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -118,4 +118,3 @@ void task_work_run(void)
} while (work);
}
}
-EXPORT_SYMBOL_GPL(task_work_run);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 7fbba635a..e840ed867 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -271,11 +271,27 @@ static int alarmtimer_suspend(struct device *dev)
__pm_wakeup_event(ws, MSEC_PER_SEC);
return ret;
}
+
+static int alarmtimer_resume(struct device *dev)
+{
+ struct rtc_device *rtc;
+
+ rtc = alarmtimer_get_rtcdev();
+ if (rtc)
+ rtc_timer_cancel(rtc, &rtctimer);
+ return 0;
+}
+
#else
static int alarmtimer_suspend(struct device *dev)
{
return 0;
}
+
+static int alarmtimer_resume(struct device *dev)
+{
+ return 0;
+}
#endif
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
@@ -800,6 +816,7 @@ out:
/* Suspend hook structures */
static const struct dev_pm_ops alarmtimer_pm_ops = {
.suspend = alarmtimer_suspend,
+ .resume = alarmtimer_resume,
};
static struct platform_driver alarmtimer_driver = {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1347882d1..664de5392 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -218,8 +218,8 @@ static void clocksource_watchdog(unsigned long data)
/* Check the deviation from the watchdog clocksource. */
if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
- pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
- cs->name);
+ pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
+ smp_processor_id(), cs->name);
pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 149cc8086..6df8927c5 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,8 +16,11 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/rtc.h>
+#include <linux/math64.h>
#include "ntp_internal.h"
+#include "timekeeping_internal.h"
+
/*
* NTP timekeeping variables:
@@ -70,7 +73,7 @@ static long time_esterror = NTP_PHASE_LIMIT;
static s64 time_freq;
/* time at last adjustment (secs): */
-static long time_reftime;
+static time64_t time_reftime;
static long time_adjust;
@@ -297,25 +300,27 @@ static void ntp_update_offset(long offset)
if (!(time_status & STA_PLL))
return;
- if (!(time_status & STA_NANO))
+ if (!(time_status & STA_NANO)) {
+ /* Make sure the multiplication below won't overflow */
+ offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC);
offset *= NSEC_PER_USEC;
+ }
/*
* Scale the phase adjustment and
* clamp to the operating range.
*/
- offset = min(offset, MAXPHASE);
- offset = max(offset, -MAXPHASE);
+ offset = clamp(offset, -MAXPHASE, MAXPHASE);
/*
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
- secs = get_seconds() - time_reftime;
+ secs = (long)(__ktime_get_real_seconds() - time_reftime);
if (unlikely(time_status & STA_FREQHOLD))
secs = 0;
- time_reftime = get_seconds();
+ time_reftime = __ktime_get_real_seconds();
offset64 = offset;
freq_adj = ntp_update_offset_fll(offset64, secs);
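A worked number for the new clamp (assuming a 32-bit long): an offset of 4,000,000 us multiplied by NSEC_PER_USEC (1000) is 4e9 ns, which wraps past LONG_MAX (about 2.147e9). After clamp(offset, -USEC_PER_SEC, USEC_PER_SEC) the worst case is 1,000,000 * 1000 = 1e9 ns, which fits, and the existing clamp to +/-MAXPHASE (500,000,000 ns) then applies as before.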
@@ -390,10 +395,11 @@ ktime_t ntp_get_next_leap(void)
*
* Also handles leap second processing, and returns leap offset
*/
-int second_overflow(unsigned long secs)
+int second_overflow(time64_t secs)
{
s64 delta;
int leap = 0;
+ s32 rem;
/*
* Leap second processing. If in leap-insert state at the end of the
@@ -404,19 +410,19 @@ int second_overflow(unsigned long secs)
case TIME_OK:
if (time_status & STA_INS) {
time_state = TIME_INS;
- ntp_next_leap_sec = secs + SECS_PER_DAY -
- (secs % SECS_PER_DAY);
+ div_s64_rem(secs, SECS_PER_DAY, &rem);
+ ntp_next_leap_sec = secs + SECS_PER_DAY - rem;
} else if (time_status & STA_DEL) {
time_state = TIME_DEL;
- ntp_next_leap_sec = secs + SECS_PER_DAY -
- ((secs+1) % SECS_PER_DAY);
+ div_s64_rem(secs + 1, SECS_PER_DAY, &rem);
+ ntp_next_leap_sec = secs + SECS_PER_DAY - rem;
}
break;
case TIME_INS:
if (!(time_status & STA_INS)) {
ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- } else if (secs % SECS_PER_DAY == 0) {
+ } else if (secs == ntp_next_leap_sec) {
leap = -1;
time_state = TIME_OOP;
printk(KERN_NOTICE
@@ -427,7 +433,7 @@ int second_overflow(unsigned long secs)
if (!(time_status & STA_DEL)) {
ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- } else if ((secs + 1) % SECS_PER_DAY == 0) {
+ } else if (secs == ntp_next_leap_sec) {
leap = 1;
ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
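A worked example of the rewritten scheduling (values chosen for illustration): for secs = 1435708799 (2015-06-30 23:59:59 UTC), div_s64_rem(secs, SECS_PER_DAY, &rem) yields rem = 86399, so ntp_next_leap_sec = 1435708799 + 86400 - 86399 = 1435708800, i.e. midnight UTC. The TIME_INS branch then fires on an exact match against ntp_next_leap_sec instead of re-deriving secs % SECS_PER_DAY every second, which also avoids a 64-bit modulo that 32-bit architectures cannot do directly.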
@@ -590,7 +596,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
* reference time to current time.
*/
if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
- time_reftime = get_seconds();
+ time_reftime = __ktime_get_real_seconds();
/* only set allowed bits */
time_status &= STA_RONLY;
@@ -674,8 +680,24 @@ int ntp_validate_timex(struct timex *txc)
return -EINVAL;
}
- if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
- return -EPERM;
+ if (txc->modes & ADJ_SETOFFSET) {
+ /* In order to inject time, you gotta be super-user! */
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (txc->modes & ADJ_NANO) {
+ struct timespec ts;
+
+ ts.tv_sec = txc->time.tv_sec;
+ ts.tv_nsec = txc->time.tv_usec;
+ if (!timespec_inject_offset_valid(&ts))
+ return -EINVAL;
+
+ } else {
+ if (!timeval_inject_offset_valid(&txc->time))
+ return -EINVAL;
+ }
+ }
/*
* Check for potential multiplication overflows that can
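An illustrative userspace view of the path now being validated (hypothetical snippet; ADJ_SETOFFSET is exposed via <sys/timex.h> on recent glibc): inject a +1.5 s step, which requires CAP_SYS_TIME and, with these checks, a tv_usec in [0, 1000000):

#include <sys/timex.h>

static int inject_offset(void)
{
	struct timex txc = {
		.modes = ADJ_SETOFFSET,
		.time  = { .tv_sec = 1, .tv_usec = 500000 },
	};

	return adjtimex(&txc);	/* out-of-range tv_usec now fails with EINVAL */
}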
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index af924470e..d8a7c11fa 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -6,7 +6,7 @@ extern void ntp_clear(void);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
extern u64 ntp_tick_length(void);
extern ktime_t ntp_get_next_leap(void);
-extern int second_overflow(unsigned long secs);
+extern int second_overflow(time64_t secs);
extern int ntp_validate_timex(struct timex *);
extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 22c57e191..0b1742434 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -36,16 +36,17 @@
*/
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
-/*
- * The time, when the last jiffy update happened. Protected by jiffies_lock.
- */
-static ktime_t last_jiffies_update;
-
struct tick_sched *tick_get_tick_sched(int cpu)
{
return &per_cpu(tick_cpu_sched, cpu);
}
+#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+/*
+ * The time, when the last jiffy update happened. Protected by jiffies_lock.
+ */
+static ktime_t last_jiffies_update;
+
/*
* Must be called with interrupts disabled !
*/
@@ -143,7 +144,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
* when we go busy again does not account too much ticks.
*/
if (ts->tick_stopped) {
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
}
@@ -151,6 +152,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
}
+#endif
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
@@ -387,7 +389,7 @@ void __init tick_nohz_init(void)
/*
* NO HZ enabled ?
*/
-static int tick_nohz_enabled __read_mostly = 1;
+int tick_nohz_enabled __read_mostly = 1;
unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
@@ -430,7 +432,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
tick_do_update_jiffies64(now);
local_irq_restore(flags);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
/*
@@ -603,15 +605,31 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
/*
* If the tick is due in the next period, keep it ticking or
- * restart it proper.
+ * force prod the timer.
*/
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
tick.tv64 = 0;
+ /*
+ * We've not stopped the tick yet, and there's a timer in the
+ * next period, so no point in stopping it either, bail.
+ */
if (!ts->tick_stopped)
goto out;
+
+ /*
+ * If, OTOH, we did stop it, but there's a pending (expired)
+ * timer, reprogram the timer hardware to fire now.
+ *
+ * We will not restart the tick proper, just prod the timer
+ * hardware into firing an interrupt to process the pending
+ * timers. Just like tick_irq_exit() will not restart the tick
+ * for 'normal' interrupts.
+ *
+ * Only once we exit the idle loop will we re-enable the tick,
+ * see tick_nohz_idle_exit().
+ */
if (delta == 0) {
- /* Tick is stopped, but required now. Enforce it */
tick_nohz_restart(ts, now);
goto out;
}
@@ -694,14 +712,14 @@ out:
return tick;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
- update_cpu_load_nohz();
+ update_cpu_load_nohz(active);
calc_load_exit_idle();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
/*
* Cancel the scheduled timer and restore the tick
*/
@@ -725,7 +743,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
if (can_stop_full_tick())
tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
else if (ts->tick_stopped)
- tick_nohz_restart_sched_tick(ts, ktime_get());
+ tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
#endif
}
@@ -875,7 +893,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
- if (vtime_accounting_enabled())
+ if (vtime_accounting_cpu_enabled())
return;
/*
* We stopped the tick in idle. Update process times would miss the
@@ -916,7 +934,7 @@ void tick_nohz_idle_exit(void)
tick_nohz_stop_idle(ts, now);
if (ts->tick_stopped) {
- tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_restart_sched_tick(ts, now, 0);
tick_nohz_account_idle_ticks(ts);
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 99188ee5d..34b4cedfa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -845,6 +845,19 @@ time64_t ktime_get_real_seconds(void)
}
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
+/**
+ * __ktime_get_real_seconds - The same as ktime_get_real_seconds
+ * but without the sequence counter protection. This internal function
+ * is called only when the timekeeping lock is already held.
+ */
+time64_t __ktime_get_real_seconds(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return tk->xtime_sec;
+}
+
+
#ifdef CONFIG_NTP_PPS
/**
@@ -958,7 +971,7 @@ int timekeeping_inject_offset(struct timespec *ts)
struct timespec64 ts64, tmp;
int ret = 0;
- if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+ if (!timespec_inject_offset_valid(ts))
return -EINVAL;
ts64 = timespec_to_timespec64(*ts);
@@ -1591,9 +1604,12 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
{
s64 interval = tk->cycle_interval;
s64 xinterval = tk->xtime_interval;
+ u32 base = tk->tkr_mono.clock->mult;
+ u32 max = tk->tkr_mono.clock->maxadj;
+ u32 cur_adj = tk->tkr_mono.mult;
s64 tick_error;
bool negative;
- u32 adj;
+ u32 adj_scale;
/* Remove any current error adj from freq calculation */
if (tk->ntp_err_mult)
@@ -1612,13 +1628,33 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
/* preserve the direction of correction */
negative = (tick_error < 0);
- /* Sort out the magnitude of the correction */
+ /* If any adjustment would pass the max, just return */
+ if (negative && (cur_adj - 1) <= (base - max))
+ return;
+ if (!negative && (cur_adj + 1) >= (base + max))
+ return;
+ /*
+ * Sort out the magnitude of the correction, but
+ * avoid making so large a correction that we go
+ * over the max adjustment.
+ */
+ adj_scale = 0;
tick_error = abs(tick_error);
- for (adj = 0; tick_error > interval; adj++)
+ while (tick_error > interval) {
+ u32 adj = 1 << (adj_scale + 1);
+
+ /* Check if adjustment gets us within 1 unit from the max */
+ if (negative && (cur_adj - adj) <= (base - max))
+ break;
+ if (!negative && (cur_adj + adj) >= (base + max))
+ break;
+
+ adj_scale++;
tick_error >>= 1;
+ }
/* scale the corrections */
- timekeeping_apply_adjustment(tk, offset, negative, adj);
+ timekeeping_apply_adjustment(tk, offset, negative, adj_scale);
}
/*
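To see the effect of the bounded search: with tick_error = 10 * interval, both the old for-loop and the new while-loop would normally settle on a scale of 4 (a 16x correction), since tick_error halves on each pass. The difference is the per-step guard: if, say, cur_adj + 8 would already reach base + max, the new loop stops early with adj_scale = 2, so timekeeping_apply_adjustment() can no longer push tkr_mono.mult outside clock->maxadj.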
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ea005a7f..5be76270e 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -17,7 +17,11 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
{
cycle_t ret = (now - last) & mask;
- return (s64) ret > 0 ? ret : 0;
+ /*
+ * Prevent time going backwards by checking the MSB of mask in
+ * the result. If set, return 0.
+ */
+ return ret & ~(mask >> 1) ? 0 : ret;
}
#else
static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
@@ -26,4 +30,6 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
}
#endif
+extern time64_t __ktime_get_real_seconds(void);
+
#endif /* _TIMEKEEPING_INTERNAL_H */
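A worked case for the MSB test (assuming a 32-bit mask of 0xffffffff): if now = 0x10 and last = 0x20, the masked delta is 0xfffffff0. Its top in-mask bit is set, so ret & ~(mask >> 1) is nonzero and the function returns 0 instead of treating the wrap as a roughly 4.29e9-cycle forward jump. The previous (s64)ret > 0 test caught this only for a 64-bit mask, where the wrapped value actually goes negative as an s64.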
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a990824c8..2aeb6ffc0 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -349,16 +349,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
if (count >= BLK_TN_MAX_MSG)
return -EINVAL;
- msg = kmalloc(count + 1, GFP_KERNEL);
- if (msg == NULL)
- return -ENOMEM;
-
- if (copy_from_user(msg, buffer, count)) {
- kfree(msg);
- return -EFAULT;
- }
+ msg = memdup_user_nul(buffer, count);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
- msg[count] = '\0';
bt = filp->private_data;
__trace_note_message(bt, "%s", msg);
kfree(msg);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4228fd368..326a75e88 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct perf_event *event;
+ struct file *file;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (!event)
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
/* make sure event is local and doesn't have pmu::count */
if (event->oncpu != smp_processor_id() ||
event->pmu->count)
@@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
struct perf_event *event;
+ struct file *file;
struct perf_raw_record raw = {
.size = size,
.data = data,
@@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (unlikely(!event))
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
@@ -316,7 +322,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-static struct bpf_verifier_ops kprobe_prog_ops = {
+static const struct bpf_verifier_ops kprobe_prog_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3f743b147..57a6eea84 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,8 +62,6 @@
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
-#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
-
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
@@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
-static struct ftrace_ops control_ops;
-
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs);
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -203,7 +196,7 @@ void clear_ftrace_function(void)
ftrace_trace_function = ftrace_stub;
}
-static void control_ops_disable_all(struct ftrace_ops *ops)
+static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
{
int cpu;
@@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops)
*per_cpu_ptr(ops->disabled, cpu) = 1;
}
-static int control_ops_alloc(struct ftrace_ops *ops)
+static int per_cpu_ops_alloc(struct ftrace_ops *ops)
{
int __percpu *disabled;
+ if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
+ return -EINVAL;
+
disabled = alloc_percpu(int);
if (!disabled)
return -ENOMEM;
ops->disabled = disabled;
- control_ops_disable_all(ops);
+ per_cpu_ops_disable_all(ops);
return 0;
}
@@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { }
static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic ops or we force list func,
+ * If this is a dynamic, RCU, or per CPU ops, or we force list func,
* then it needs to call the list anyway.
*/
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
+ FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
return ftrace_ops_list_func;
return ftrace_ops_get_func(ops);
@@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
return 0;
}
-static void add_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int first = *list == &ftrace_list_end;
- add_ftrace_ops(list, ops);
- if (first)
- add_ftrace_ops(&ftrace_ops_list, main_ops);
-}
-
-static int remove_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int ret = remove_ftrace_ops(list, ops);
- if (!ret && *list == &ftrace_list_end)
- ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
- return ret;
-}
-
static void ftrace_update_trampoline(struct ftrace_ops *ops);
static int __register_ftrace_function(struct ftrace_ops *ops)
@@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- if (control_ops_alloc(ops))
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
+ if (per_cpu_ops_alloc(ops))
return -ENOMEM;
- add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
- /* The control_ops needs the trampoline update */
- ops = &control_ops;
- } else
- add_ftrace_ops(&ftrace_ops_list, ops);
+ }
+
+ add_ftrace_ops(&ftrace_ops_list, ops);
/* Always save the function, and reset at unregistering */
ops->saved_func = ops->func;
@@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- ret = remove_ftrace_list_ops(&ftrace_control_list,
- &control_ops, ops);
- } else
- ret = remove_ftrace_ops(&ftrace_ops_list, ops);
+ ret = remove_ftrace_ops(&ftrace_ops_list, ops);
if (ret < 0)
return ret;
@@ -1687,6 +1658,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int in_hash = 0;
int match = 0;
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (all) {
/*
* Only the filter_hash affects all records.
@@ -1940,7 +1914,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
}
-static void print_ip_ins(const char *fmt, unsigned char *p)
+static void print_ip_ins(const char *fmt, const unsigned char *p)
{
int i;
@@ -1952,6 +1926,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
static struct ftrace_ops *
ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
+
+enum ftrace_bug_type ftrace_bug_type;
+const void *ftrace_expected;
+
+static void print_bug_type(void)
+{
+ switch (ftrace_bug_type) {
+ case FTRACE_BUG_UNKNOWN:
+ break;
+ case FTRACE_BUG_INIT:
+ pr_info("Initializing ftrace call sites\n");
+ break;
+ case FTRACE_BUG_NOP:
+ pr_info("Setting ftrace call site to NOP\n");
+ break;
+ case FTRACE_BUG_CALL:
+ pr_info("Setting ftrace call site to call ftrace function\n");
+ break;
+ case FTRACE_BUG_UPDATE:
+ pr_info("Updating ftrace call site to call a different ftrace function\n");
+ break;
+ }
+}
/**
* ftrace_bug - report and shutdown function tracer
@@ -1979,8 +1978,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace failed to modify ");
print_ip_sym(ip);
- print_ip_ins(" actual: ", (unsigned char *)ip);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
pr_cont("\n");
+ if (ftrace_expected) {
+ print_ip_ins(" expected: ", ftrace_expected);
+ pr_cont("\n");
+ }
break;
case -EPERM:
FTRACE_WARN_ON_ONCE(1);
@@ -1992,6 +1995,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
pr_info("ftrace faulted on unknown error ");
print_ip_sym(ip);
}
+ print_bug_type();
if (rec) {
struct ftrace_ops *ops = NULL;
@@ -2000,15 +2004,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
rec->flags & FTRACE_FL_REGS ? " R" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- pr_cont("\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ pr_cont("\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
pr_cont("\ttramp: ERROR!");
}
ip = ftrace_get_addr_curr(rec);
- pr_cont(" expected tramp: %lx\n", ip);
+ pr_cont("\n expected tramp: %lx\n", ip);
}
}
@@ -2016,6 +2024,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
{
unsigned long flag = 0UL;
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ return FTRACE_UPDATE_IGNORE;
+
/*
* If we are updating calls:
*
@@ -2077,9 +2090,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* from the save regs, to a non-save regs function or
* vice versa, or from a trampoline call.
*/
- if (flag & FTRACE_FL_ENABLED)
+ if (flag & FTRACE_FL_ENABLED) {
+ ftrace_bug_type = FTRACE_BUG_CALL;
return FTRACE_UPDATE_MAKE_CALL;
+ }
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return FTRACE_UPDATE_MODIFY_CALL;
}
@@ -2096,6 +2112,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
FTRACE_FL_REGS_EN);
}
+ ftrace_bug_type = FTRACE_BUG_NOP;
return FTRACE_UPDATE_MAKE_NOP;
}
@@ -2145,6 +2162,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
+ struct ftrace_ops *op)
+{
+ unsigned long ip = rec->ip;
+
+ while_for_each_ftrace_op(op) {
+
+ if (!op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ }
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
{
struct ftrace_ops *op;
@@ -2307,17 +2342,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
+ ftrace_bug_type = FTRACE_BUG_CALL;
return ftrace_make_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
+ ftrace_bug_type = FTRACE_BUG_NOP;
return ftrace_make_nop(NULL, rec, ftrace_old_addr);
case FTRACE_UPDATE_MODIFY_CALL:
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
@@ -2425,6 +2465,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
+ ftrace_bug_type = FTRACE_BUG_INIT;
ftrace_bug(ret, rec);
return 0;
}
@@ -2566,7 +2607,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
-static void control_ops_free(struct ftrace_ops *ops)
+static void per_cpu_ops_free(struct ftrace_ops *ops)
{
free_percpu(ops->disabled);
}
@@ -2667,13 +2708,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are control ops, they still need their
+ * If these are per_cpu ops, they still need their
* per_cpu field freed. Since, function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
return 0;
}
@@ -2714,7 +2755,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
- * The same goes for freeing the per_cpu data of the control
+ * The same goes for freeing the per_cpu data of the per_cpu
* ops.
*
* Again, normal synchronize_sched() is not good enough.
@@ -2725,13 +2766,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
schedule_on_each_cpu(ftrace_sync);
arch_ftrace_trampoline_free(ops);
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
}
return 0;
@@ -2798,9 +2839,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return 0;
- /* If ops traces all mods, we already accounted for it */
+ /* If ops traces all then it includes this function */
if (ops_traces_mod(ops))
- return 0;
+ return 1;
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
@@ -2814,64 +2855,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
return 1;
}
-static int referenced_filters(struct dyn_ftrace *rec)
-{
- struct ftrace_ops *ops;
- int cnt = 0;
-
- for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
- }
-
- return cnt;
-}
-
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long update_cnt = 0;
- unsigned long ref = 0;
- bool test = false;
+ unsigned long rec_flags = 0;
int i;
+ start = ftrace_now(raw_smp_processor_id());
+
/*
- * When adding a module, we need to check if tracers are
- * currently enabled and if they are set to trace all functions.
- * If they are, we need to enable the module functions as well
- * as update the reference counts for those function records.
+ * When a module is loaded, this function is called to convert
+ * the calls to mcount in its text to nops, and also to create
+ * an entry in the ftrace data. Now, if ftrace is activated
+ * after this call, but before the module sets its text to
+ * read-only, the modification of enabling ftrace can fail if
+ * the read-only is done while ftrace is converting the calls.
+ * To prevent this, the module's records are set as disabled
+ * and will be enabled after the call to set the module's text
+ * to read-only.
*/
- if (mod) {
- struct ftrace_ops *ops;
-
- for (ops = ftrace_ops_list;
- ops != &ftrace_list_end; ops = ops->next) {
- if (ops->flags & FTRACE_OPS_FL_ENABLED) {
- if (ops_traces_mod(ops))
- ref++;
- else
- test = true;
- }
- }
- }
-
- start = ftrace_now(raw_smp_processor_id());
+ if (mod)
+ rec_flags |= FTRACE_FL_DISABLED;
for (pg = new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
- int cnt = ref;
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
- if (test)
- cnt += referenced_filters(p);
- p->flags = cnt;
+ p->flags = rec_flags;
/*
* Do the initial record conversion from mcount jump
@@ -2881,21 +2899,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
break;
update_cnt++;
-
- /*
- * If the tracing is enabled, go ahead and enable the record.
- *
- * The reason not to enable the record immediatelly is the
- * inherent check of ftrace_make_nop/ftrace_make_call for
- * correct previous instructions. Making first the NOP
- * conversion puts the module to the correct state, thus
- * passing the ftrace_make_call check.
- */
- if (ftrace_start_up && cnt) {
- int failed = __ftrace_replace_code(p, 1);
- if (failed)
- ftrace_bug(failed, p);
- }
}
}
@@ -3258,7 +3261,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED) {
- struct ftrace_ops *ops = NULL;
+ struct ftrace_ops *ops;
seq_printf(m, " (%ld)%s%s",
ftrace_rec_count(rec),
@@ -3266,14 +3269,19 @@ static int t_show(struct seq_file *m, void *v)
rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- seq_printf(m, "\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ seq_printf(m, "\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ add_trampoline_func(m, ops, rec);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
seq_puts(m, "\ttramp: ERROR!");
-
+ } else {
+ add_trampoline_func(m, NULL, rec);
}
- add_trampoline_func(m, ops, rec);
}
seq_putc(m, '\n');
@@ -4898,6 +4906,19 @@ static int ftrace_process_locs(struct module *mod,
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+static int referenced_filters(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
+ if (ops_references_rec(ops, rec))
+ cnt++;
+ }
+
+ return cnt;
+}
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
@@ -4940,44 +4961,85 @@ void ftrace_release_mod(struct module *mod)
mutex_unlock(&ftrace_lock);
}
-static void ftrace_init_module(struct module *mod,
- unsigned long *start, unsigned long *end)
+void ftrace_module_enable(struct module *mod)
{
- if (ftrace_disabled || start == end)
- return;
- ftrace_process_locs(mod, start, end);
-}
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
-void ftrace_module_init(struct module *mod)
-{
- ftrace_init_module(mod, mod->ftrace_callsites,
- mod->ftrace_callsites +
- mod->num_ftrace_callsites);
-}
+ mutex_lock(&ftrace_lock);
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
-{
- struct module *mod = data;
+ if (ftrace_disabled)
+ goto out_unlock;
- if (val == MODULE_STATE_GOING)
- ftrace_release_mod(mod);
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediately is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ *
+ * We also delay this to after the module code already set the
+ * text to read-only, as we now need to set it back to read-write
+ * so that we can modify the text.
+ */
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_prepare();
- return 0;
+ do_for_each_ftrace_rec(pg, rec) {
+ int cnt;
+ /*
+ * do_for_each_ftrace_rec() is a double loop.
+ * module text shares the pg. If a record is
+ * not part of this module, then skip this pg,
+ * which the "break" will do.
+ */
+ if (!within_module_core(rec->ip, mod))
+ break;
+
+ cnt = 0;
+
+ /*
+ * When adding a module, we need to check if tracers are
+ * currently enabled and if they are, and can trace this record,
+ * we need to enable the module functions as well as update the
+ * reference counts for those function records.
+ */
+ if (ftrace_start_up)
+ cnt += referenced_filters(rec);
+
+ /* This clears FTRACE_FL_DISABLED */
+ rec->flags = cnt;
+
+ if (ftrace_start_up && cnt) {
+ int failed = __ftrace_replace_code(rec, 1);
+ if (failed) {
+ ftrace_bug(failed, rec);
+ goto out_loop;
+ }
+ }
+
+ } while_for_each_ftrace_rec();
+
+ out_loop:
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_post_process();
+
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
}
-#else
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
+
+void ftrace_module_init(struct module *mod)
{
- return 0;
+ if (ftrace_disabled || !mod->num_ftrace_callsites)
+ return;
+
+ ftrace_process_locs(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites + mod->num_ftrace_callsites);
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_exit_nb = {
- .notifier_call = ftrace_module_notify_exit,
- .priority = INT_MIN, /* Run after anything that can remove kprobes */
-};
-
void __init ftrace_init(void)
{
extern unsigned long __start_mcount_loc[];
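The exit notifier disappears because module.c now drives ftrace at fixed points in the module lifecycle. A sketch of the ordering this relies on (the call sites live in kernel/module.c; paraphrased here for orientation, not verbatim):

/*
 * In kernel/module.c (paraphrased):
 *
 *	ftrace_module_init(mod);     records created with FTRACE_FL_DISABLED
 *	... module text mapped and set read-only ...
 *	ftrace_module_enable(mod);   clears DISABLED, patches live call sites
 *	... on MODULE_STATE_GOING ...
 *	ftrace_release_mod(mod);     replaces the old INT_MIN-priority notifier
 */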
@@ -5006,10 +5068,6 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_exit_nb);
- if (ret)
- pr_warning("Failed to register trace ftrace module exit notifier\n");
-
set_ftrace_early_filters();
return;
@@ -5116,44 +5174,6 @@ void ftrace_reset_array_ops(struct trace_array *tr)
tr->ops->func = ftrace_stub;
}
-static void
-ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
-{
- if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
- return;
-
- /*
- * Some of the ops may be dynamically allocated,
- * they must be freed after a synchronize_sched().
- */
- preempt_disable_notrace();
- trace_recursion_set(TRACE_CONTROL_BIT);
-
- /*
- * Control funcs (perf) uses RCU. Only trace if
- * RCU is currently active.
- */
- if (!rcu_is_watching())
- goto out;
-
- do_for_each_ftrace_op(op, ftrace_control_list) {
- if (!(op->flags & FTRACE_OPS_FL_STUB) &&
- !ftrace_function_local_disabled(op) &&
- ftrace_ops_test(op, ip, regs))
- op->func(ip, parent_ip, op, regs);
- } while_for_each_ftrace_op(op);
- out:
- trace_recursion_clear(TRACE_CONTROL_BIT);
- preempt_enable_notrace();
-}
-
-static struct ftrace_ops control_ops = {
- .func = ftrace_ops_control_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(control_ops)
-};
-
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
@@ -5170,8 +5190,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
+
do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (ftrace_ops_test(op, ip, regs)) {
+ /*
+ * Check the following for each ops before calling their func:
+ * if RCU flag is set, then rcu_is_watching() must be true
+ * if PER_CPU is set, then ftrace_function_local_disabled()
+ * must be false
+ * Otherwise test if the ip matches the ops filter
+ *
+ * If any of the above fails then the op->func() is not executed.
+ */
+ if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
+ (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) &&
+ ftrace_ops_test(op, ip, regs)) {
+
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
@@ -5195,7 +5229,7 @@ out:
* being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
- * set the ARCH_SUPPORT_FTARCE_OPS.
+ * set the ARCH_SUPPORTS_FTRACE_OPS.
*/
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -5212,20 +5246,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
/*
* If there's only one function registered but it does not support
- * recursion, this function will be called by the mcount trampoline.
- * This function will handle recursion protection.
+ * recursion, needs RCU protection and/or requires per cpu handling, then
+ * this function will be called by the mcount trampoline.
*/
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
int bit;
+ if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching())
+ return;
+
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
- op->func(ip, parent_ip, op, regs);
+ preempt_disable_notrace();
+ if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) {
+ op->func(ip, parent_ip, op, regs);
+ }
+
+ preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -5243,12 +5286,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If the func handles its own recursion, call it directly.
- * Otherwise call the recursion protected function that
- * will call the ftrace ops function.
+ * If the function does not handle recursion, needs to be RCU safe,
+ * or does per cpu logic, then we need to call the assist handler.
*/
- if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
- return ftrace_ops_recurs_func;
+ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
+ ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+ return ftrace_ops_assist_func;
return ops->func;
}
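With the assist handler in place, a callback that needs RCU protection just sets the flag and gets routed automatically. A hedged registration sketch (names illustrative, not from the patch):

/* An ops flagged RCU is wrapped by ftrace_ops_assist_func(), so
 * my_callback only runs while rcu_is_watching().
 */
static void my_callback(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct pt_regs *regs)
{
	/* tracing work here; must be notrace-safe */
}

static struct ftrace_ops my_ops = {
	.func	= my_callback,
	.flags	= FTRACE_OPS_FL_RCU,
};

/* register_ftrace_function(&my_ops); */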
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c6045a27..95181e368 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old,
/*
* rb_tail_page_update - move the tail page forward
- *
- * Returns 1 if moved tail page, 0 if someone else did.
*/
-static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *tail_page,
struct buffer_page *next_page)
{
- struct buffer_page *old_tail;
unsigned long old_entries;
unsigned long old_write;
- int ret = 0;
/*
* The tail page now needs to be moved forward.
@@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
* it is, then it is up to us to update the tail
* pointer.
*/
- if (tail_page == cpu_buffer->tail_page) {
+ if (tail_page == READ_ONCE(cpu_buffer->tail_page)) {
/* Zero the write counter */
unsigned long val = old_write & ~RB_WRITE_MASK;
unsigned long eval = old_entries & ~RB_WRITE_MASK;
@@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_set(&next_page->page->commit, 0);
- old_tail = cmpxchg(&cpu_buffer->tail_page,
- tail_page, next_page);
-
- if (old_tail == tail_page)
- ret = 1;
+ /* Again, either we update tail_page or an interrupt does */
+ (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
}
-
- return ret;
}
static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2036,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the tail page would have moved.
*/
if (ret == RB_PAGE_NORMAL) {
+ struct buffer_page *buffer_tail_page;
+
+ buffer_tail_page = READ_ONCE(cpu_buffer->tail_page);
/*
* If the tail had moved past next, then we need
* to reset the pointer.
*/
- if (cpu_buffer->tail_page != tail_page &&
- cpu_buffer->tail_page != next_page)
+ if (buffer_tail_page != tail_page &&
+ buffer_tail_page != next_page)
rb_head_page_set_normal(cpu_buffer, new_head,
next_page,
RB_PAGE_HEAD);
@@ -2135,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(length, &tail_page->write);
}
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
+
/*
* This is the slow path, force gcc not to inline it.
*/
@@ -2147,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
- u64 ts;
next_page = tail_page;
@@ -2221,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
}
- ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
- if (ret) {
- /*
- * Nested commits always have zero deltas, so
- * just reread the time stamp
- */
- ts = rb_time_stamp(buffer);
- next_page->page->time_stamp = ts;
- }
+ rb_tail_page_update(cpu_buffer, tail_page, next_page);
out_again:
rb_reset_tail(cpu_buffer, tail, info);
+ /* Commit what we have for now. */
+ rb_end_commit(cpu_buffer);
+ /* rb_end_commit() decrements the committing counter */
+ local_inc(&cpu_buffer->committing);
+
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
@@ -2362,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
addr = (unsigned long)event;
addr &= PAGE_MASK;
- bpage = cpu_buffer->tail_page;
+ bpage = READ_ONCE(cpu_buffer->tail_page);
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
@@ -2410,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
again:
max_count = cpu_buffer->nr_pages * 100;
- while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
return;
if (RB_WARN_ON(cpu_buffer,
@@ -2419,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
+ /* Only update the write stamp if the page has an event */
+ if (rb_page_write(cpu_buffer->commit_page))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -2443,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* and pushed the tail page forward, we will be left with
* a dangling commit that will never go forward.
*/
- if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)))
goto again;
}
@@ -2699,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(info->add_timestamp))
info->length += RB_LEN_TIME_EXTEND;
- tail_page = info->tail_page = cpu_buffer->tail_page;
+ /* Don't let the compiler play games with cpu_buffer->tail_page */
+ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
write = local_add_return(info->length, &tail_page->write);
/* set write to only the index of the write */
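The tail-page change above replaces a checked cmpxchg with a deliberately lossy one: read the shared tail once, then try to publish the new tail and ignore the result, because a failed CAS means an interrupting writer already advanced it. The same shape in portable C11 atomics (a sketch only; the kernel uses READ_ONCE()/cmpxchg(), not <stdatomic.h>):

#include <stdatomic.h>

struct page;

struct cpu_buffer {
	_Atomic(struct page *) tail_page;
};

/* Either this context or an interrupting one advances the tail; if
 * the CAS fails, someone else already moved it past tail_page, which
 * is exactly the outcome we wanted anyway. */
static void tail_page_update(struct cpu_buffer *cb,
			     struct page *tail, struct page *next)
{
	struct page *expected = tail;

	if (atomic_load_explicit(&cb->tail_page,
				 memory_order_relaxed) != tail)
		return;	/* an interrupt beat us to it */

	(void)atomic_compare_exchange_strong(&cb->tail_page,
					     &expected, next);
}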
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 919d9d076..8414fa40b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -363,8 +363,8 @@ struct trace_option_dentry {
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
- * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
- * @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @start: called when tracing is unpaused (echo 1 > tracing_on)
+ * @stop: called when tracing is paused (echo 0 > tracing_on)
* @update_thresh: called when tracing_thresh is updated
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
@@ -467,8 +467,6 @@ enum {
TRACE_INTERNAL_IRQ_BIT,
TRACE_INTERNAL_SIRQ_BIT,
- TRACE_CONTROL_BIT,
-
TRACE_BRANCH_BIT,
/*
* Abuse of the trace_recursion.
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cc9f7a931..00df25fd8 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags |= FTRACE_OPS_FL_CONTROL;
+ ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
ops->func = perf_ftrace_function_call;
return register_ftrace_function(ops);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d202d991e..05ddc0820 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1343,15 +1343,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long) buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
mutex_lock(&event_mutex);
file = event_file_data(filp);
@@ -1359,7 +1353,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
err = apply_event_filter(file, buf);
mutex_unlock(&event_mutex);
- free_page((unsigned long) buf);
+ kfree(buf);
if (err < 0)
return err;
@@ -1510,18 +1504,12 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long) buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
err = apply_subsystem_event_filter(dir, buf);
- free_page((unsigned long) buf);
+ kfree(buf);
if (err < 0)
return err;
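memdup_user_nul() folds the __get_free_page()/copy_from_user()/manual NUL-termination sequence into a single call that returns an ERR_PTR() on failure and a kfree()-able, NUL-terminated buffer on success. The resulting handler shape, as a kernel-style sketch (consume_string() is a hypothetical stand-in for apply_event_filter() and friends):

static ssize_t example_write(struct file *filp, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	char *buf;
	int err;

	if (cnt >= PAGE_SIZE)
		return -EINVAL;

	/* kmalloc() + copy_from_user() + buf[cnt] = '\0' in one step */
	buf = memdup_user_nul(ubuf, cnt);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	err = consume_string(buf);	/* hypothetical consumer */
	kfree(buf);			/* pairs with kfree, not free_page */

	return err < 0 ? err : cnt;
}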
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 42a4009fd..b38f617b6 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -237,28 +237,23 @@ static ssize_t event_trigger_regex_write(struct file *file,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long)buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
strim(buf);
mutex_lock(&event_mutex);
event_file = event_file_data(file);
if (unlikely(!event_file)) {
mutex_unlock(&event_mutex);
- free_page((unsigned long)buf);
+ kfree(buf);
return -ENODEV;
}
ret = trigger_process_regex(event_file, buf);
mutex_unlock(&event_mutex);
- free_page((unsigned long)buf);
+ kfree(buf);
if (ret < 0)
goto out;
@@ -543,11 +538,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -575,8 +571,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
@@ -1319,11 +1315,12 @@ static int event_enable_register_trigger(char *glob,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -1344,8 +1341,8 @@ static void event_enable_unregister_trigger(char *glob,
(enable_data->file == test_enable_data->file)) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
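The reordering of update_cond_flag() relative to trace_event_trigger_enable_disable() matters because enabling is what makes the trigger observable on the tracing fast path: the conditional flag must already be correct when the first event fires, and must be rolled back together with the list removal on failure. The ordering, as a sketch with hypothetical stand-in types (struct item, struct file_state, enable()):

/* Publish ordering sketch only; the types and enable() helper are
 * hypothetical stand-ins for the trigger structures above. */
static int publish_trigger(struct item *data, struct file_state *file)
{
	list_add_rcu(&data->list, &file->triggers);
	update_cond_flag(file);		/* flag valid before readers run */
	if (enable(file) < 0) {		/* this exposes the fast path */
		list_del_rcu(&data->list);
		update_cond_flag(file);	/* undo the flag with the list */
		return -1;
	}
	return 0;
}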
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 202df6cff..2a1abbaca 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -156,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack)
for (; p < top && i < stack_trace_max.nr_entries; p++) {
if (stack_dump_trace[i] == ULONG_MAX)
break;
- if (*p == stack_dump_trace[i]) {
+ /*
+ * The READ_ONCE_NOCHECK is used to let KASAN know that
+ * this is not a stack-out-of-bounds error.
+ */
+ if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) {
stack_dump_trace[x] = stack_dump_trace[i++];
this_size = stack_trace_index[x++] =
(top - p) * sizeof(unsigned long);
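READ_ONCE_NOCHECK() performs the same single read as READ_ONCE() but without sanitizer instrumentation, for code that legitimately reads stack words outside any one object's bounds. The usage shape, as a kernel-style sketch of a stack scan:

#include <linux/compiler.h>	/* READ_ONCE_NOCHECK() */

/* Scan raw stack words for a saved return address; each load is
 * deliberately unchecked so KASAN does not report a
 * stack-out-of-bounds when the scan crosses frame boundaries. */
static unsigned long *find_on_stack(unsigned long *p, unsigned long *top,
				    unsigned long needle)
{
	for (; p < top; p++)
		if (READ_ONCE_NOCHECK(*p) == needle)
			return p;
	return NULL;
}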
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 975cb49e3..f8e26ab96 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{
struct mm_struct *mm;
- /* convert pages-usec to Mbyte-usec */
- stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
- stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+ /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
+ stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
+ do_div(stats->coremem, 1000 * KB);
+ stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
+ do_div(stats->virtmem, 1000 * KB);
mm = get_task_mm(p);
if (mm) {
/* adjust to KB unit */
@@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
static void __acct_update_integrals(struct task_struct *tsk,
cputime_t utime, cputime_t stime)
{
- if (likely(tsk->mm)) {
- cputime_t time, dtime;
- struct timeval value;
- unsigned long flags;
- u64 delta;
-
- local_irq_save(flags);
- time = stime + utime;
- dtime = time - tsk->acct_timexpd;
- jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
- delta = value.tv_sec;
- delta = delta * USEC_PER_SEC + value.tv_usec;
-
- if (delta == 0)
- goto out;
- tsk->acct_timexpd = time;
- tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
- tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
- out:
- local_irq_restore(flags);
- }
+ cputime_t time, dtime;
+ u64 delta;
+
+ if (unlikely(!tsk->mm))
+ return;
+
+ time = stime + utime;
+ dtime = time - tsk->acct_timexpd;
+ /* Avoid division: cputime_t is often in nanoseconds already. */
+ delta = cputime_to_nsecs(dtime);
+
+ if (delta < TICK_NSEC)
+ return;
+
+ tsk->acct_timexpd = time;
+ /*
+ * Shift right by 10 (a cheap divide by 1024) to keep the running
+ * totals from overflowing; the factor of 1024 is folded back in by
+ * xacct_add_tsk() when it computes the final Mbyte-usec unit
+ * reported to userspace.
+ */
+ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
+ tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
}
/**
@@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk,
void acct_update_integrals(struct task_struct *tsk)
{
cputime_t utime, stime;
+ unsigned long flags;
+ local_irq_save(flags);
task_cputime(tsk, &utime, &stime);
__acct_update_integrals(tsk, utime, stime);
+ local_irq_restore(flags);
}
/**
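Putting the two tsacct.c halves together: __acct_update_integrals() now accumulates pages * nsec >> 10, and xacct_add_tsk() multiplies by PAGE_SIZE and do_div()s by 1000 * KB, so the divisors compose to exactly bytes/2^20 * nsec/1000, i.e. Mbyte-usec. A quick userspace check of the units (illustrative constants):

#include <stdio.h>

int main(void)
{
	const unsigned long long PAGE_SIZE = 4096, KB = 1024;
	unsigned long long rss_pages = 25600;		/* 100 MB of 4K pages */
	unsigned long long delta_ns = 1000000000ULL;	/* held for 1 second */

	/* __acct_update_integrals(): pages * nsec, pre-shifted by 10 */
	unsigned long long acct = rss_pages * delta_ns >> 10;

	/* xacct_add_tsk(): * PAGE_SIZE, / (1000 * KB) => Mbyte-usec */
	unsigned long long mb_usec = acct * PAGE_SIZE / (1000 * KB);

	/* 100 MB held for 1 s ~= 100,000,000 Mbyte-usec */
	printf("%llu\n", mb_usec);
	return 0;
}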
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 88fefa68c..9bafc2119 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -602,8 +602,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent *extent = NULL;
- unsigned long page = 0;
- char *kbuf, *pos, *next_line;
+ char *kbuf = NULL, *pos, *next_line;
ssize_t ret = -EINVAL;
/*
@@ -638,23 +637,18 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
goto out;
- /* Get a buffer */
- ret = -ENOMEM;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!page)
- goto out;
-
/* Only allow < page size writes at the beginning of the file */
ret = -EINVAL;
if ((*ppos != 0) || (count >= PAGE_SIZE))
goto out;
/* Slurp in the user data */
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, count))
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf)) {
+ ret = PTR_ERR(kbuf);
+ kbuf = NULL;
goto out;
- kbuf[count] = '\0';
+ }
/* Parse the user data */
ret = -EINVAL;
@@ -756,8 +750,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = count;
out:
mutex_unlock(&userns_state_mutex);
- if (page)
- free_page(page);
+ kfree(kbuf);
return ret;
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18f34cf75..b3ace6ebb 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -20,6 +20,7 @@
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
#include <linux/tick.h>
+#include <linux/workqueue.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -225,7 +226,15 @@ static void __touch_watchdog(void)
__this_cpu_write(watchdog_touch_ts, get_timestamp());
}
-void touch_softlockup_watchdog(void)
+/**
+ * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
+ *
+ * Call when the scheduler may have stalled for legitimate reasons
+ * preventing the watchdog task from executing - e.g. the scheduler
+ * entering idle state. This should only be used for scheduler events.
+ * Use touch_softlockup_watchdog() for everything else.
+ */
+void touch_softlockup_watchdog_sched(void)
{
/*
* Preemption can be enabled. It doesn't matter which CPU's timestamp
@@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void)
*/
raw_cpu_write(watchdog_touch_ts, 0);
}
+
+void touch_softlockup_watchdog(void)
+{
+ touch_softlockup_watchdog_sched();
+ wq_watchdog_touch(raw_smp_processor_id());
+}
EXPORT_SYMBOL(touch_softlockup_watchdog);
void touch_all_softlockup_watchdogs(void)
@@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void)
*/
for_each_watchdog_cpu(cpu)
per_cpu(watchdog_touch_ts, cpu) = 0;
+ wq_watchdog_touch(-1);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -351,7 +367,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
trigger_allbutself_cpu_backtrace();
if (hardlockup_panic)
- panic("Hard LOCKUP");
+ nmi_panic(regs, "Hard LOCKUP");
__this_cpu_write(hard_watchdog_warn, true);
return;
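nmi_panic() exists because calling panic() from NMI context can deadlock when another CPU is already panicking: the NMI would spin inside a panic() that can never complete. A hedged sketch of the idea, not the kernel's actual implementation: a compare-and-swap elects a single panicking CPU and every other NMI backs off.

#include <stdatomic.h>

#define PANIC_CPU_INVALID	-1

static atomic_int panic_cpu = PANIC_CPU_INVALID;

/* Only the CPU that wins the CAS runs the real panic path; an NMI
 * arriving on any other CPU returns instead of re-entering panic(). */
static void nmi_panic_sketch(int this_cpu, void (*do_panic)(const char *),
			     const char *msg)
{
	int old = PANIC_CPU_INVALID;

	if (atomic_compare_exchange_strong(&panic_cpu, &old, this_cpu))
		do_panic(msg);
	/* else: another CPU owns the panic, just return from the NMI */
}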
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 450c21fd0..7ff5dc7d2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -148,6 +148,8 @@ struct worker_pool {
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
+ unsigned long watchdog_ts; /* L: watchdog timestamp */
+
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
@@ -299,7 +301,23 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
-static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+/* PL: allowable cpus for unbound wqs and work items */
+static cpumask_var_t wq_unbound_cpumask;
+
+/* CPU where unbound work was last round robin scheduled from this CPU */
+static DEFINE_PER_CPU(int, wq_rr_cpu_last);
+
+/*
+ * Local execution of unbound work items is no longer guaranteed. The
+ * following always forces round-robin CPU selection on unbound work items
+ * to uncover usages which depend on it.
+ */
+#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
+static bool wq_debug_force_rr_cpu = true;
+#else
+static bool wq_debug_force_rr_cpu = false;
+#endif
+module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -1093,6 +1111,8 @@ static void pwq_activate_delayed_work(struct work_struct *work)
struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work);
+ if (list_empty(&pwq->pool->worklist))
+ pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
pwq->nr_active++;
@@ -1304,6 +1324,39 @@ static bool is_chained_work(struct workqueue_struct *wq)
return worker && worker->current_pwq->wq == wq;
}
+/*
+ * When queueing an unbound work item to a wq, prefer local CPU if allowed
+ * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to
+ * avoid perturbing sensitive tasks.
+ */
+static int wq_select_unbound_cpu(int cpu)
+{
+ static bool printed_dbg_warning;
+ int new_cpu;
+
+ if (likely(!wq_debug_force_rr_cpu)) {
+ if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
+ return cpu;
+ } else if (!printed_dbg_warning) {
+ pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
+ printed_dbg_warning = true;
+ }
+
+ if (cpumask_empty(wq_unbound_cpumask))
+ return cpu;
+
+ new_cpu = __this_cpu_read(wq_rr_cpu_last);
+ new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids)) {
+ new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids))
+ return cpu;
+ }
+ __this_cpu_write(wq_rr_cpu_last, new_cpu);
+
+ return new_cpu;
+}
+
static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
@@ -1329,7 +1382,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
return;
retry:
if (req_cpu == WORK_CPU_UNBOUND)
- cpu = raw_smp_processor_id();
+ cpu = wq_select_unbound_cpu(raw_smp_processor_id());
/* pwq which will be used unless @work is executing elsewhere */
if (!(wq->flags & WQ_UNBOUND))
@@ -1395,6 +1448,8 @@ retry:
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
+ if (list_empty(worklist))
+ pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works;
@@ -2167,6 +2222,8 @@ recheck:
list_first_entry(&pool->worklist,
struct work_struct, entry);
+ pool->watchdog_ts = jiffies;
+
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
@@ -2250,6 +2307,7 @@ repeat:
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
+ bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -2266,9 +2324,14 @@ repeat:
* process'em.
*/
WARN_ON_ONCE(!list_empty(scheduled));
- list_for_each_entry_safe(work, n, &pool->worklist, entry)
- if (get_work_pwq(work) == pwq)
+ list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ if (first)
+ pool->watchdog_ts = jiffies;
move_linked_works(work, scheduled, &n);
+ }
+ first = false;
+ }
if (!list_empty(scheduled)) {
process_scheduled_works(rescuer);
@@ -2326,6 +2389,38 @@ repeat:
goto repeat;
}
+/**
+ * check_flush_dependency - check for flush dependency sanity
+ * @target_wq: workqueue being flushed
+ * @target_work: work item being flushed (NULL for workqueue flushes)
+ *
+ * %current is trying to flush the whole @target_wq or @target_work on it.
+ * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
+ * reclaiming memory or running on a workqueue which doesn't have
+ * %WQ_MEM_RECLAIM, as that can break the forward-progress guarantee, leading to
+ * a deadlock.
+ */
+static void check_flush_dependency(struct workqueue_struct *target_wq,
+ struct work_struct *target_work)
+{
+ work_func_t target_func = target_work ? target_work->func : NULL;
+ struct worker *worker;
+
+ if (target_wq->flags & WQ_MEM_RECLAIM)
+ return;
+
+ worker = current_wq_worker();
+
+ WARN_ONCE(current->flags & PF_MEMALLOC,
+ "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
+ current->pid, current->comm, target_wq->name, target_func);
+ WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
+ (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
+ "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
+ worker->current_pwq->wq->name, worker->current_func,
+ target_wq->name, target_func);
+}
+
struct wq_barrier {
struct work_struct work;
struct completion done;
@@ -2535,6 +2630,8 @@ void flush_workqueue(struct workqueue_struct *wq)
list_add_tail(&this_flusher.list, &wq->flusher_overflow);
}
+ check_flush_dependency(wq, NULL);
+
mutex_unlock(&wq->mutex);
wait_for_completion(&this_flusher.done);
@@ -2707,6 +2804,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
pwq = worker->current_pwq;
}
+ check_flush_dependency(pwq->wq, work);
+
insert_wq_barrier(pwq, barr, work, worker);
spin_unlock_irq(&pool->lock);
@@ -3079,6 +3178,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
+ pool->watchdog_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
@@ -3611,7 +3711,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
struct apply_wqattrs_ctx *ctx;
- int ret = -ENOMEM;
/* only unbound workqueues can change attributes */
if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
@@ -3622,16 +3721,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
ctx = apply_wqattrs_prepare(wq, attrs);
+ if (!ctx)
+ return -ENOMEM;
/* the ctx has been prepared successfully, let's commit it */
- if (ctx) {
- apply_wqattrs_commit(ctx);
- ret = 0;
- }
-
+ apply_wqattrs_commit(ctx);
apply_wqattrs_cleanup(ctx);
- return ret;
+ return 0;
}
/**
@@ -4318,7 +4415,9 @@ void show_workqueue_state(void)
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" workers=%d", pool->nr_workers);
+ pr_cont(" hung=%us workers=%d",
+ jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+ pool->nr_workers);
if (pool->manager)
pr_cont(" manager: %d",
task_pid_nr(pool->manager->task));
@@ -5177,6 +5276,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
#endif /* CONFIG_SYSFS */
+/*
+ * Workqueue watchdog.
+ *
+ * Stalls may be caused by various bugs: a missing WQ_MEM_RECLAIM, an
+ * illegal flush dependency, or a concurrency-managed work item which
+ * stays RUNNING indefinitely. Workqueue stalls can be very difficult to
+ * debug as the
+ * usual warning mechanisms don't trigger and internal workqueue state is
+ * largely opaque.
+ *
+ * The workqueue watchdog monitors all worker pools periodically and
+ * dumps their state if some pools fail to make forward progress for a
+ * while, where
+ * forward progress is defined as the first item on ->worklist changing.
+ *
+ * This mechanism is controlled through the kernel parameter
+ * "workqueue.watchdog_thresh" which can be updated at runtime through the
+ * corresponding sysfs parameter file.
+ */
+#ifdef CONFIG_WQ_WATCHDOG
+
+static void wq_watchdog_timer_fn(unsigned long data);
+
+static unsigned long wq_watchdog_thresh = 30;
+static struct timer_list wq_watchdog_timer =
+ TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+
+static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+
+static void wq_watchdog_reset_touched(void)
+{
+ int cpu;
+
+ wq_watchdog_touched = jiffies;
+ for_each_possible_cpu(cpu)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+}
+
+static void wq_watchdog_timer_fn(unsigned long data)
+{
+ unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ bool lockup_detected = false;
+ struct worker_pool *pool;
+ int pi;
+
+ if (!thresh)
+ return;
+
+ rcu_read_lock();
+
+ for_each_pool(pool, pi) {
+ unsigned long pool_ts, touched, ts;
+
+ if (list_empty(&pool->worklist))
+ continue;
+
+ /* get the latest of pool and touched timestamps */
+ pool_ts = READ_ONCE(pool->watchdog_ts);
+ touched = READ_ONCE(wq_watchdog_touched);
+
+ if (time_after(pool_ts, touched))
+ ts = pool_ts;
+ else
+ ts = touched;
+
+ if (pool->cpu >= 0) {
+ unsigned long cpu_touched =
+ READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
+ pool->cpu));
+ if (time_after(cpu_touched, ts))
+ ts = cpu_touched;
+ }
+
+ /* did we stall? */
+ if (time_after(jiffies, ts + thresh)) {
+ lockup_detected = true;
+ pr_emerg("BUG: workqueue lockup - pool");
+ pr_cont_pool_info(pool);
+ pr_cont(" stuck for %us!\n",
+ jiffies_to_msecs(jiffies - pool_ts) / 1000);
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (lockup_detected)
+ show_workqueue_state();
+
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh);
+}
+
+void wq_watchdog_touch(int cpu)
+{
+ if (cpu >= 0)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+ else
+ wq_watchdog_touched = jiffies;
+}
+
+static void wq_watchdog_set_thresh(unsigned long thresh)
+{
+ wq_watchdog_thresh = 0;
+ del_timer_sync(&wq_watchdog_timer);
+
+ if (thresh) {
+ wq_watchdog_thresh = thresh;
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
+ }
+}
+
+static int wq_watchdog_param_set_thresh(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long thresh;
+ int ret;
+
+ ret = kstrtoul(val, 0, &thresh);
+ if (ret)
+ return ret;
+
+ if (system_wq)
+ wq_watchdog_set_thresh(thresh);
+ else
+ wq_watchdog_thresh = thresh;
+
+ return 0;
+}
+
+static const struct kernel_param_ops wq_watchdog_thresh_ops = {
+ .set = wq_watchdog_param_set_thresh,
+ .get = param_get_ulong,
+};
+
+module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
+ 0644);
+
+static void wq_watchdog_init(void)
+{
+ wq_watchdog_set_thresh(wq_watchdog_thresh);
+}
+
+#else /* CONFIG_WQ_WATCHDOG */
+
+static inline void wq_watchdog_init(void) { }
+
+#endif /* CONFIG_WQ_WATCHDOG */
+
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;
@@ -5300,6 +5547,9 @@ static int __init init_workqueues(void)
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
+
+ wq_watchdog_init();
+
return 0;
}
early_initcall(init_workqueues);
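The round-robin selection in wq_select_unbound_cpu() above leans on cpumask_next_and() running off the end: start strictly after the last pick, intersect the unbound mask with the online mask, and wrap to the first match when the scan exhausts the range. The same shape in plain C over a bitmask (a sketch; the kernel's cpumask helpers do the masking and bounds handling):

#include <stdio.h>

#define NR_CPUS 8

/* Find the next set bit strictly after 'prev' in (mask & online). */
static int next_and(int prev, unsigned mask, unsigned online)
{
	for (int cpu = prev + 1; cpu < NR_CPUS; cpu++)
		if ((mask & online) & (1u << cpu))
			return cpu;
	return NR_CPUS;	/* ran off the end, caller wraps */
}

static int rr_last = -1;	/* per-CPU in the kernel version */

static int select_unbound_cpu(int this_cpu, unsigned mask, unsigned online)
{
	int cpu = next_and(rr_last, mask, online);

	if (cpu >= NR_CPUS)
		cpu = next_and(-1, mask, online);	/* wrap around */
	if (cpu >= NR_CPUS)
		return this_cpu;	/* nothing allowed, stay local */
	rr_last = cpu;
	return cpu;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("%d ", select_unbound_cpu(0, 0x36, 0xff));
	putchar('\n');	/* cycles 1 2 4 5 1 2 with mask 0b00110110 */
	return 0;
}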