author | André Fabian Silva Delgado &lt;emulatorman@parabola.nu&gt; | 2016-03-25 03:53:42 -0300 |
---|---|---|
committer | André Fabian Silva Delgado &lt;emulatorman@parabola.nu&gt; | 2016-03-25 03:53:42 -0300 |
commit | 03dd4cb26d967f9588437b0fc9cc0e8353322bb7 (patch) | |
tree | fa581f6dc1c0596391690d1f67eceef3af8246dc /kernel | |
parent | d4e493caf788ef44982e131ff9c786546904d934 (diff) | |
Linux-libre 4.5-gnu
Diffstat (limited to 'kernel')
155 files changed, 5394 insertions, 21496 deletions
diff --git a/kernel/async.c b/kernel/async.c index 4c3773c0b..d2edd6efe 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -326,3 +326,4 @@ bool current_is_async(void) return worker && worker->current_func == async_run_entry_fn; } +EXPORT_SYMBOL_GPL(current_is_async); diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354..3a3e5deed 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -110,7 +110,6 @@ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; -static u32 audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ kuid_t audit_sig_uid = INVALID_UID; @@ -509,8 +508,7 @@ static void flush_hold_queue(void) * if auditd just disappeared but we * dequeued an skb we need to drop ref */ - if (skb) - consume_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) @@ -524,7 +522,8 @@ static int kauditd_thread(void *dummy) skb = skb_dequeue(&audit_skb_queue); if (skb) { - if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit) + if (!audit_backlog_limit || + (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)) wake_up(&audit_backlog_wait); if (audit_pid) kauditd_send_skb(skb); @@ -1232,9 +1231,7 @@ static void audit_buffer_free(struct audit_buffer *ab) if (!ab) return; - if (ab->skb) - kfree_skb(ab->skb); - + kfree_skb(ab->skb); spin_lock_irqsave(&audit_freelist_lock, flags); if (audit_freelist_count > AUDIT_MAXFREE) kfree(ab); @@ -1372,7 +1369,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; if (gfp_mask & __GFP_DIRECT_RECLAIM) { - if (audit_pid && audit_pid == current->pid) + if (audit_pid && audit_pid == current->tgid) gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; @@ -1395,12 +1392,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, skb_queue_len(&audit_skb_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = audit_backlog_wait_overflow; + audit_backlog_wait_time = 0; wake_up(&audit_backlog_wait); return NULL; } - if (!reserve) + if (!reserve && !audit_backlog_wait_time) audit_backlog_wait_time = audit_backlog_wait_time_master; ab = audit_buffer_alloc(ctx, gfp_mask, type); @@ -1722,7 +1719,7 @@ static inline int audit_copy_fcaps(struct audit_names *name, /* Copy inode data into an audit_names. 
*/ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode) + struct inode *inode) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; diff --git a/kernel/audit.h b/kernel/audit.h index de6cbb7cf..cbbe6bb64 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -207,7 +207,7 @@ extern u32 audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode); + struct inode *inode); extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 27c6046c2..f84f8d06e 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa if (IS_ERR(dentry)) return (void *)dentry; /* returning an error */ inode = path.dentry->d_inode; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93a..9f194aad0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex); + inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d_backing_inode(d)->i_sb->s_dev; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b86cc0495..195ffaee5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; @@ -1848,12 +1848,12 @@ void __audit_file(const struct file *file) * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. 
*/ -void __audit_inode_child(const struct inode *parent, +void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b0799bced..89ebbc4d1 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) { struct perf_event *event; const struct perf_event_attr *attr; + struct file *file; - event = perf_event_get(fd); - if (IS_ERR(event)) - return event; + file = perf_event_get(fd); + if (IS_ERR(file)) + return file; + + event = file->private_data; attr = perf_event_attrs(event); if (IS_ERR(attr)) @@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) goto err; if (attr->type == PERF_TYPE_RAW) - return event; + return file; if (attr->type == PERF_TYPE_HARDWARE) - return event; + return file; if (attr->type == PERF_TYPE_SOFTWARE && attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return event; + return file; err: - perf_event_release_kernel(event); + fput(file); return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) { - struct perf_event *event = ptr; - - perf_event_release_kernel(event); + fput((struct file *)ptr); } static const struct bpf_map_ops perf_event_array_ops = { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 334b1bdd5..972d9a8e4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ARG1 = (u64) (unsigned long) ctx; - /* Registers used in classic BPF programs need to be reset first. 
*/ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - select_insn: goto *jumptable[insn->code]; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 34777b374..c5b30fd8a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,11 +14,15 @@ #include <linux/filter.h> #include <linux/vmalloc.h> +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + struct bpf_htab { struct bpf_map map; - struct hlist_head *buckets; - raw_spinlock_t lock; - u32 count; /* number of elements in this hashtable */ + struct bucket *buckets; + atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; @@ -79,34 +83,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct hlist_head) + + if ((u64) htab->n_buckets * sizeof(struct bucket) + (u64) htab->elem_size * htab->map.max_entries >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + htab->elem_size * htab->map.max_entries, PAGE_SIZE) >> PAGE_SHIFT; err = -ENOMEM; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); if (!htab->buckets) goto free_htab; } - for (i = 0; i < htab->n_buckets; i++) - INIT_HLIST_HEAD(&htab->buckets[i]); + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } - raw_spin_lock_init(&htab->lock); - htab->count = 0; + atomic_set(&htab->count, 0); return &htab->map; @@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len) return jhash(key, key_len, 0); } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { @@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old; struct hlist_head *head; + struct bucket *b; unsigned long flags; u32 key_size; int ret; @@ -248,15 +259,15 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); l_new->hash = htab_map_hash(l_new->key, key_size); + b = __select_bucket(htab, l_new->hash); + head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, l_new->hash); + raw_spin_lock_irqsave(&b->lock, flags); l_old = lookup_elem_raw(head, l_new->hash, key, key_size); - if 
(!l_old && unlikely(htab->count >= map->max_entries)) { + if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { /* if elem with this 'key' doesn't exist and we've reached * max_entries limit, fail insertion of new elem */ @@ -284,13 +295,13 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_del_rcu(&l_old->hash_node); kfree_rcu(l_old, rcu); } else { - htab->count++; + atomic_inc(&htab->count); } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return 0; err: - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); kfree(l_new); return ret; } @@ -300,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; + struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; @@ -310,21 +322,21 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, hash); + raw_spin_lock_irqsave(&b->lock, flags); l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree_rcu(l, rcu); ret = 0; } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } @@ -339,7 +351,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree(l); } } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8a797d5..f2ece3c17 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } +static int bpf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_link(old_dentry, dir, new_dentry); +} + +static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations bpf_dir_iops = { .lookup = simple_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, + .rename = bpf_rename, + .link = bpf_link, .unlink = simple_unlink, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3b39550d8..637397059 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -113,8 +113,28 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_map *map = filp->private_data; + + seq_printf(m, + "map_type:\t%u\n" + "key_size:\t%u\n" + "value_size:\t%u\n" + "max_entries:\t%u\n", + map->map_type, + map->key_size, + map->value_size, + map->max_entries); +} +#endif + static const struct file_operations bpf_map_fops = { - .release = bpf_map_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_map_show_fdinfo, +#endif + .release = bpf_map_release, }; int bpf_map_new_fd(struct bpf_map *map) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 
fb1ecfd2d..d27904c19 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,8 +57,9 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> #include <linux/delay.h> -#include <linux/cpuset.h> #include <linux/atomic.h> +#include <linux/cpuset.h> +#include <net/sock.h> /* * pidlists linger the following amount before being destroyed. The goal @@ -211,6 +212,7 @@ static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -440,11 +442,6 @@ static bool cgroup_tryget(struct cgroup *cgrp) return css_tryget(&cgrp->self); } -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -465,25 +462,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor. It also returns %true - * if @cgrp == @ancestor. This function is safe to call as long as @cgrp - * and @ancestor are accessible. - */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ - while (cgrp) { - if (cgrp == ancestor) - return true; - cgrp = cgroup_parent(cgrp); - } - return false; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -1647,10 +1625,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1717,15 +1691,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -1924,6 +1889,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret < 0) goto out; root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); @@ -2004,6 +1970,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root; @@ -2020,6 +1987,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + mutex_lock(&cgroup_mutex); /* First find the desired set of 
subsystems */ @@ -2027,15 +2005,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the @@ -2146,9 +2115,10 @@ out_free: if (ret) return ERR_PTR(ret); - +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2191,6 +2161,12 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -4063,7 +4039,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) goto out_err; /* - * Migrate tasks one-by-one until @form is empty. This fails iff + * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { @@ -4681,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work) if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); @@ -4909,11 +4886,11 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent, *cgrp; + struct cgroup *parent, *cgrp, *tcgrp; struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - int ssid, ret; + int level, ssid, ret; /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. 
*/ @@ -4924,9 +4901,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (!parent) return -ENODEV; root = parent->root; + level = parent->level + 1; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); if (!cgrp) { ret = -ENOMEM; goto out_unlock; @@ -4950,6 +4929,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->self.parent = &parent->self; cgrp->root = root; + cgrp->level = level; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5201,7 +5184,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; - printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + pr_debug("Initializing cgroup subsys %s\n", ss->name); mutex_lock(&cgroup_mutex); @@ -5359,6 +5342,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; @@ -5502,19 +5486,6 @@ static const struct file_operations proc_cgroupstats_operations = { .release = single_release, }; -static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) - return &ss_priv[i - CGROUP_CANFORK_START]; - return NULL; -} - -static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - void **private = subsys_canfork_priv_p(ss_priv, i); - return private ? *private : NULL; -} - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5537,14 +5508,13 @@ void cgroup_fork(struct task_struct *child) * returns an error, the fork aborts with that error code. This allows for * a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +int cgroup_can_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i, j, ret; for_each_subsys_which(ss, i, &have_canfork_callback) { - ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); + ret = ss->can_fork(child); if (ret) goto out_revert; } @@ -5556,7 +5526,7 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); + ss->cancel_fork(child); } return ret; @@ -5569,15 +5539,14 @@ out_revert: * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeded. */ -void cgroup_cancel_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_cancel_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); + ss->cancel_fork(child); } /** @@ -5590,8 +5559,7 @@ void cgroup_cancel_fork(struct task_struct *child, * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. 
*/ -void cgroup_post_fork(struct task_struct *child, - void *old_ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; @@ -5635,7 +5603,7 @@ void cgroup_post_fork(struct task_struct *child, * and addition to css_set. */ for_each_subsys_which(ss, i, &have_fork_callback) - ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); + ss->fork(child); } /** @@ -5835,6 +5803,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +DEFINE_SPINLOCK(cgroup_sk_update_lock); +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) + return; + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2d3df82c5..1b72d56ed 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset) * to do anything as freezer_attach() will put @task into the appropriate * state. */ -static void freezer_fork(struct task_struct *task, void *private) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index b50d5a167..303097b37 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num) * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge - * succeded, otherwise -EAGAIN. + * succeeded, otherwise -EAGAIN. 
*/ static int pids_try_charge(struct pids_cgroup *pids, int num) { @@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on threadgroup_change_begin() held by the copy_process(). */ -static int pids_can_fork(struct task_struct *task, void **priv_p) +static int pids_can_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; @@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p) return pids_try_charge(pids, 1); } -static void pids_cancel_fork(struct task_struct *task, void *priv) +static void pids_cancel_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index d8560ee3b..9ad37b9e4 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -24,7 +24,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/context_tracking.h> -struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(context_tracking_enabled); EXPORT_SYMBOL_GPL(context_tracking_enabled); DEFINE_PER_CPU(struct context_tracking, context_tracking); @@ -191,7 +191,7 @@ void __init context_tracking_cpu_set(int cpu) if (!per_cpu(context_tracking.active, cpu)) { per_cpu(context_tracking.active, cpu) = true; - static_key_slow_inc(&context_tracking_enabled); + static_branch_inc(&context_tracking_enabled); } if (initialized) diff --git a/kernel/cpu.c b/kernel/cpu.c index 85ff5e26e..5b9d39633 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -759,71 +759,33 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly - = CPU_BITS_ALL; +struct cpumask __cpu_possible_mask __read_mostly + = {CPU_BITS_ALL}; #else -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +struct cpumask __cpu_possible_mask __read_mostly; #endif -const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); -EXPORT_SYMBOL(cpu_possible_mask); +EXPORT_SYMBOL(__cpu_possible_mask); -static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); -EXPORT_SYMBOL(cpu_online_mask); +struct cpumask __cpu_online_mask __read_mostly; +EXPORT_SYMBOL(__cpu_online_mask); -static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); -EXPORT_SYMBOL(cpu_present_mask); +struct cpumask __cpu_present_mask __read_mostly; +EXPORT_SYMBOL(__cpu_present_mask); -static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); -EXPORT_SYMBOL(cpu_active_mask); - -void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); -} - -void set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); -} - -void set_cpu_online(unsigned int cpu, bool online) -{ - if (online) { - cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); - } else { - cpumask_clear_cpu(cpu, 
to_cpumask(cpu_online_bits)); - } -} - -void set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); -} +struct cpumask __cpu_active_mask __read_mostly; +EXPORT_SYMBOL(__cpu_active_mask); void init_cpu_present(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_present_bits), src); + cpumask_copy(&__cpu_present_mask, src); } void init_cpu_possible(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_possible_bits), src); + cpumask_copy(&__cpu_possible_mask, src); } void init_cpu_online(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_online_bits), src); + cpumask_copy(&__cpu_online_mask, src); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2ade63219..41989ab4d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -51,6 +51,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/time.h> +#include <linux/time64.h> #include <linux/backing-dev.h> #include <linux/sort.h> @@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; struct fmeter { int cnt; /* unprocessed events count */ int val; /* most recent output value */ - time_t time; /* clock (secs) when val computed */ + time64_t time; /* clock (secs) when val computed */ spinlock_t lock; /* guards read or write of above */ }; @@ -1397,7 +1398,7 @@ out: */ #define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ #define FM_SCALE 1000 /* faux fixed point scale */ @@ -1413,8 +1414,11 @@ static void fmeter_init(struct fmeter *fmp) /* Internal meter update - process cnt events and update value */ static void fmeter_update(struct fmeter *fmp) { - time_t now = get_seconds(); - time_t ticks = now - fmp->time; + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; if (ticks == 0) return; diff --git a/kernel/cred.c b/kernel/cred.c index 71179a09c..0c0cd8a62 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds); void __init cred_init(void) { /* allocate a slab in which we can store credentials */ - cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); } /** diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 412134549..2a20c0dfd 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv) continue; kdb_printf("%-20s%8u 0x%p ", mod->name, - mod->core_size, (void *)mod); + mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); #endif @@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->module_core); + kdb_printf(" 0x%p", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ef90b04d7..435c14a45 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = 
KMEM_CACHE(task_delay_info, SLAB_PANIC); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 1087bbeb1..614614821 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,8 +49,6 @@ #include <asm/irq_regs.h> -static struct workqueue_struct *perf_wq; - typedef int (*remote_function_f)(void *); struct remote_function_call { @@ -66,8 +64,17 @@ static void remote_function(void *data) struct task_struct *p = tfc->p; if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + /* -EAGAIN */ + if (task_cpu(p) != smp_processor_id()) + return; + + /* + * Now that we're on right CPU with IRQs disabled, we can test + * if we hit the right task without races. + */ + + tfc->ret = -ESRCH; /* No such (running) process */ + if (p != current) return; } @@ -94,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) .p = p, .func = func, .info = info, - .ret = -ESRCH, /* No such (running) process */ + .ret = -EAGAIN, }; + int ret; - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); + do { + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + if (!ret) + ret = data.ret; + } while (ret == -EAGAIN); - return data.ret; + return ret; } /** @@ -126,11 +137,168 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } -#define EVENT_OWNER_KERNEL ((void *) -1) +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + +#define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { - return event->owner == EVENT_OWNER_KERNEL; + return READ_ONCE(event->owner) == TASK_TOMBSTONE; +} + +/* + * On task ctx scheduling... + * + * When !ctx->nr_events a task context will not be scheduled. This means + * we can disable the scheduler hooks (for performance) without leaving + * pending task ctx state. + * + * This however results in two special cases: + * + * - removing the last event from a task ctx; this is relatively straight + * forward and is done in __perf_remove_from_context. + * + * - adding the first event to a task ctx; this is tricky because we cannot + * rely on ctx->is_active and therefore cannot use event_function_call(). + * See perf_install_in_context(). + * + * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. 
+ */ + +typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, + struct perf_event_context *, void *); + +struct event_function_struct { + struct perf_event *event; + event_f func; + void *data; +}; + +static int event_function(void *info) +{ + struct event_function_struct *efs = info; + struct perf_event *event = efs->event; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + int ret = 0; + + WARN_ON_ONCE(!irqs_disabled()); + + perf_ctx_lock(cpuctx, task_ctx); + /* + * Since we do the IPI call without holding ctx->lock things can have + * changed, double check we hit the task we set out to hit. + */ + if (ctx->task) { + if (ctx->task != current) { + ret = -ESRCH; + goto unlock; + } + + /* + * We only use event_function_call() on established contexts, + * and event_function() is only ever called when active (or + * rather, we'll have bailed in task_function_call() or the + * above ctx->task != current test), therefore we must have + * ctx->is_active here. + */ + WARN_ON_ONCE(!ctx->is_active); + /* + * And since we have ctx->is_active, cpuctx->task_ctx must + * match. + */ + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + efs->func(event, cpuctx, ctx, efs->data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); + + return ret; +} + +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + int ret = event_function(&efs); + WARN_ON_ONCE(ret); +} + +static void event_function_call(struct perf_event *event, event_f func, void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + if (!event->parent) { + /* + * If this is a !child event, we must hold ctx::mutex to + * stabilize the the event->ctx relation. See + * perf_event_ctx_lock(). + */ + lockdep_assert_held(&ctx->mutex); + } + + if (!task) { + cpu_function_call(event->cpu, event_function, &efs); + return; + } + + if (task == TASK_TOMBSTONE) + return; + +again: + if (!task_function_call(task, event_function, &efs)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). 
+ */ + task = ctx->task; + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + func(event, NULL, ctx, data); + raw_spin_unlock_irq(&ctx->lock); } #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ @@ -148,6 +316,7 @@ static bool is_kernel_event(struct perf_event *event) enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, + EVENT_TIME = 0x4, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -155,7 +324,13 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct static_key_deferred perf_sched_events __read_mostly; + +static void perf_sched_delayed(struct work_struct *work); +DEFINE_STATIC_KEY_FALSE(perf_sched_events); +static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); +static DEFINE_MUTEX(perf_sched_mutex); +static atomic_t perf_sched_count; + static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); @@ -337,28 +512,6 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -static void perf_ctx_lock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - raw_spin_lock(&cpuctx->ctx.lock); - if (ctx) - raw_spin_lock(&ctx->lock); -} - -static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); -} - #ifdef CONFIG_CGROUP_PERF static inline bool @@ -548,13 +701,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* - * next is NULL when called from perf_event_enable_on_exec() - * that will systematically cause a cgroup_switch() - */ - if (next) - cgrp2 = perf_cgroup_from_task(next, NULL); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -580,8 +727,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* prev can never be NULL */ cgrp2 = perf_cgroup_from_task(prev, NULL); /* @@ -886,7 +1031,7 @@ static void put_ctx(struct perf_event_context *ctx) if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - if (ctx->task) + if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } @@ -903,9 +1048,8 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] - * __perf_event_exit_task() - * sync_child_event() - * put_event() [ parent, 1 ] + * perf_event_exit_event() + * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() @@ -948,8 +1092,8 @@ static void put_ctx(struct perf_event_context *ctx) * Lock order: * task_struct::perf_event_mutex * perf_event_context::mutex - * perf_event_context::lock * perf_event::child_mutex; + * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem */ @@ -1047,6 +1191,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. 
+ * * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ @@ -1087,9 +1232,12 @@ retry: goto retry; } - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (ctx->task == TASK_TOMBSTONE || + !atomic_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; + } else { + WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); @@ -1149,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; u64 run_end; + lockdep_assert_held(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; + /* * in cgroup mode, time_enabled represents * the time the event was enabled AND active @@ -1215,6 +1365,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; @@ -1417,11 +1569,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) { ctx->nr_cgroups--; + /* + * Because cgroup events are always per-cpu events, this will + * always be called from the right CPU. + */ cpuctx = __get_cpu_context(ctx); /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() + * If there are no more cgroup events then clear cgrp to avoid + * stale pointer in update_cgrp_time_from_cpuctx(). */ if (!ctx->nr_cgroups) cpuctx->cgrp = NULL; @@ -1499,45 +1654,11 @@ out: perf_event__header_size(tmp); } -/* - * User event without the task. - */ static bool is_orphaned_event(struct perf_event *event) { - return event && !is_kernel_event(event) && !event->owner; -} - -/* - * Event has a parent but parent's task finished and it's - * alive only because of children holding refference. - */ -static bool is_orphaned_child(struct perf_event *event) -{ - return is_orphaned_event(event->parent); -} - -static void orphans_remove_work(struct work_struct *work); - -static void schedule_orphans_remove(struct perf_event_context *ctx) -{ - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) - return; - - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { - get_ctx(ctx); - ctx->orphans_remove_sched = true; - } -} - -static int __init perf_workqueue_init(void) -{ - perf_wq = create_singlethread_workqueue("perf"); - WARN(!perf_wq, "failed to create perf workqueue\n"); - return perf_wq ? 
0 : -1; + return event->state == PERF_EVENT_STATE_DEAD; } -core_initcall(perf_workqueue_init); - static inline int pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; @@ -1580,14 +1701,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1598,9 +1719,6 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - perf_pmu_enable(event->pmu); } @@ -1624,10 +1742,7 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -struct remove_event { - struct perf_event *event; - bool detach_group; -}; +#define DETACH_GROUP 0x01UL /* * Cross CPU call to remove a performance event @@ -1635,34 +1750,31 @@ struct remove_event { * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static int __perf_remove_from_context(void *info) +static void +__perf_remove_from_context(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + unsigned long flags = (unsigned long)info; - raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); - if (re->detach_group) + if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + + if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; - cpuctx->task_ctx = NULL; + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + cpuctx->task_ctx = NULL; + } } - raw_spin_unlock(&ctx->lock); - - return 0; } - /* * Remove the event from a task's (or a CPU's) list of events. * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since @@ -1670,96 +1782,32 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event, bool detach_group) +static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - struct remove_event re = { - .event = event, - .detach_group = detach_group, - }; + lockdep_assert_held(&event->ctx->mutex); - lockdep_assert_held(&ctx->mutex); - - if (!task) { - /* - * Per cpu events are removed via an smp call. The removal can - * fail if the CPU is currently offline, but in that case we - * already called __perf_remove_from_context from - * perf_event_exit_cpu. 
- */ - cpu_function_call(event->cpu, __perf_remove_from_context, &re); - return; - } - -retry: - if (!task_function_call(task, __perf_remove_from_context, &re)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since the task isn't running, its safe to remove the event, us - * holding the ctx->lock ensures the task won't get scheduled in. - */ - if (detach_group) - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ -int __perf_event_disable(void *info) +static void __perf_event_disable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE) + return; - return 0; + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } /* @@ -1770,7 +1818,8 @@ int __perf_event_disable(void *info) * remains valid. This condition is satisifed when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. + * goes to exit will block in perf_event_exit_event(). + * * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. @@ -1778,43 +1827,20 @@ int __perf_event_disable(void *info) static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_disable, event); - return; - } - -retry: - if (!task_function_call(task, __perf_event_disable, event)) - return; raw_spin_lock_irq(&ctx->lock); - /* - * If the event is still active, we need to retry the cross-call. 
- */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; + return; } raw_spin_unlock_irq(&ctx->lock); + + event_function_call(event, __perf_event_disable, NULL); +} + +void perf_event_disable_local(struct perf_event *event) +{ + event_function_local(event, __perf_event_disable, NULL); } /* @@ -1927,9 +1953,6 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - out: perf_pmu_enable(event->pmu); @@ -2048,13 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_event_context *ctx); +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, EVENT_ALL); +} + static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) @@ -2067,10 +2104,22 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } +static void ctx_resched(struct perf_cpu_context *cpuctx, + struct perf_event_context *task_ctx) +{ + perf_pmu_disable(cpuctx->ctx.pmu); + if (task_ctx) + task_ctx_sched_out(cpuctx, task_ctx); + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + perf_event_sched_in(cpuctx, task_ctx, current); + perf_pmu_enable(cpuctx->ctx.pmu); +} + /* * Cross CPU call to install and enable a performance event * - * Must be called with ctx->mutex held + * Very similar to remote_function() + event_function() but cannot assume that + * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { @@ -2078,72 +2127,59 @@ static int __perf_install_in_context(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - struct task_struct *task = current; - - perf_ctx_lock(cpuctx, task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - /* - * If there was an active task_ctx schedule it out. - */ - if (task_ctx) - task_ctx_sched_out(task_ctx); + bool activate = true; + int ret = 0; - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. 
- */ - if (ctx->task && task_ctx != ctx) { - if (task_ctx) - raw_spin_unlock(&task_ctx->lock); + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; - } - - if (task_ctx) { - cpuctx->task_ctx = task_ctx; - task = task_ctx->task; - } - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* If we're on the wrong CPU, try again */ + if (task_cpu(ctx->task) != smp_processor_id()) { + ret = -ESRCH; + goto unlock; + } - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); + /* + * If we're on the right CPU, see if the task we target is + * current, if not we don't have to activate the ctx, a future + * context switch will do that for us. + */ + if (ctx->task != current) + activate = false; + else + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); - add_event_to_ctx(event, ctx); + } else if (task_ctx) { + raw_spin_lock(&task_ctx->lock); + } - /* - * Schedule everything back in - */ - perf_event_sched_in(cpuctx, task_ctx, task); + if (activate) { + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + add_event_to_ctx(event, ctx); + ctx_resched(cpuctx, task_ctx); + } else { + add_event_to_ctx(event, ctx); + } - perf_pmu_enable(cpuctx->ctx.pmu); +unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } /* - * Attach a performance event to a context + * Attach a performance event to a context. * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. - * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. + * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { - struct task_struct *task = ctx->task; + struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); @@ -2152,39 +2188,45 @@ perf_install_in_context(struct perf_event_context *ctx, event->cpu = cpu; if (!task) { - /* - * Per cpu events are installed via an smp call and - * the install is always successful. - */ cpu_function_call(cpu, __perf_install_in_context, event); return; } -retry: - if (!task_function_call(task, __perf_install_in_context, event)) + /* + * Should not happen, we validate the ctx is still alive before calling. + */ + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; - raw_spin_lock_irq(&ctx->lock); /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. + * Installing events is tricky because we cannot rely on ctx->is_active + * to be set in case this is the nr_events 0 -> 1 transition. */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); +again: + /* + * Cannot use task_function_call() because we need to run on the task's + * CPU regardless of whether its current or not. + */ + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). + * Cannot happen because we already checked above (which also + * cannot happen), and we hold ctx->mutex, which serializes us + * against perf_event_exit_task_context(). 
*/ - task = ctx->task; - goto retry; + raw_spin_unlock_irq(&ctx->lock); + return; } - + raw_spin_unlock_irq(&ctx->lock); /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. + * Since !ctx->is_active doesn't mean anything, we must IPI + * unconditionally. */ - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); + goto again; } /* @@ -2211,80 +2253,47 @@ static void __perf_event_mark_enabled(struct perf_event *event) /* * Cross CPU call to enable a performance event */ -static int __perf_event_enable(void *info) +static void __perf_event_enable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; + struct perf_event_context *task_ctx; - /* - * There's a time window between 'ctx->is_active' check - * in perf_event_enable function and this place having: - * - IRQs on - * - ctx->lock unlocked - * - * where the task could be killed and 'ctx' deactivated - * by perf_event_exit_task. - */ - if (!ctx->is_active) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - update_context_time(ctx); - - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state <= PERF_EVENT_STATE_ERROR) + return; - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); + if (ctx->is_active) + ctx_sched_out(ctx, cpuctx, EVENT_TIME); __perf_event_mark_enabled(event); + if (!ctx->is_active) + return; + if (!event_filter_match(event)) { if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - goto unlock; + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); - } - - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. 
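
The slimmed-down enable/disable paths lean on the deliberate ordering of the state constants: DEAD and EXIT sit below ERROR, ERROR below OFF, OFF below INACTIVE and ACTIVE, so "already disabled", "beyond recovery", and "hung up" each become a single comparison. A sketch of those predicates; the values mirror enum perf_event_active_state around this release but should be treated as illustrative:

#include <assert.h>

/* Ordered state values: everything at or below OFF is not schedulable,
 * everything at or below EXIT is beyond recovery. */
enum state {
	STATE_DEAD	= -4,
	STATE_EXIT	= -3,
	STATE_ERROR	= -2,
	STATE_OFF	= -1,
	STATE_INACTIVE	=  0,
	STATE_ACTIVE	=  1,
};

static int is_disabled(enum state s) { return s <= STATE_OFF; }	/* disable is a no-op */
static int can_enable(enum state s)  { return s > STATE_ERROR && s < STATE_INACTIVE; }
static int is_hup(enum state s)      { return s <= STATE_EXIT; }	/* EXIT or DEAD */

int main(void)
{
	assert(is_disabled(STATE_ERROR));
	assert(can_enable(STATE_OFF));
	assert(!can_enable(STATE_ERROR));	/* ERROR must be cleared to OFF first */
	assert(is_hup(STATE_DEAD) && !is_hup(STATE_ERROR));
	return 0;
}
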
- */ - if (leader != event) { - group_sched_out(leader, cpuctx, ctx); - perf_mux_hrtimer_restart(cpuctx); - } - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } -unlock: - raw_spin_unlock(&ctx->lock); + task_ctx = cpuctx->task_ctx; + if (ctx->task) + WARN_ON_ONCE(task_ctx != ctx); - return 0; + ctx_resched(cpuctx, task_ctx); } /* @@ -2299,58 +2308,26 @@ unlock: static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - if (!task) { - /* - * Enable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_enable, event); + raw_spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state < PERF_EVENT_STATE_ERROR) { + raw_spin_unlock_irq(&ctx->lock); return; } - raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto out; - /* * If the event is in error state, clear that first. - * That way, if we see the event in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. + * + * That way, if we see the event in error state below, we know that it + * has gone back into error state, as distinct from the task having + * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) event->state = PERF_EVENT_STATE_OFF; - -retry: - if (!ctx->is_active) { - __perf_event_mark_enabled(event); - goto out; - } - raw_spin_unlock_irq(&ctx->lock); - if (!task_function_call(task, __perf_event_enable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the event is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { - /* - * task could have been flipped by a concurrent - * perf_event_context_sched_out() - */ - task = ctx->task; - goto retry; - } - -out: - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_event_enable, NULL); } /* @@ -2400,25 +2377,49 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - struct perf_event *event; int is_active = ctx->is_active; + struct perf_event *event; - ctx->is_active &= ~event_type; - if (likely(!ctx->nr_events)) + lockdep_assert_held(&ctx->lock); + + if (likely(!ctx->nr_events)) { + /* + * See __perf_remove_from_context(). 
+ */ + WARN_ON_ONCE(ctx->is_active); + if (ctx->task) + WARN_ON_ONCE(cpuctx->task_ctx); return; + } - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { + if (is_active & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { + if (is_active & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2576,17 +2577,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; + WRITE_ONCE(ctx->task, next); + WRITE_ONCE(next_ctx->task, task); swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * RCU_INIT_POINTER here is safe because we've not + * modified the ctx and the above modification of + * ctx->task and ctx->task_ctx_data are immaterial + * since those values are always verified under + * ctx->lock which we're now holding. 
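
ctx_sched_out() and ctx_sched_in() now compute is_active ^= ctx->is_active after adjusting the mask, so is_active holds exactly the bits that changed, and only those event classes (plus the new EVENT_TIME pseudo-class) are acted on. The XOR-for-changed-bits idiom in isolation, with illustrative flag values:

#include <assert.h>

enum { EVENT_PINNED = 1, EVENT_FLEXIBLE = 2, EVENT_TIME = 4,
       EVENT_ALL = EVENT_PINNED | EVENT_FLEXIBLE };

int main(void)
{
	int is_active = EVENT_PINNED | EVENT_FLEXIBLE | EVENT_TIME; /* before */
	int ctx_active = is_active;

	/* Schedule out only the flexible groups. */
	ctx_active &= ~EVENT_FLEXIBLE;
	if (!(ctx_active & EVENT_ALL))	/* no event class left: drop TIME too */
		ctx_active = 0;

	is_active ^= ctx_active;	/* changed bits */
	assert(is_active == EVENT_FLEXIBLE);	/* only flexible was turned off */
	return 0;
}
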
+ */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -2599,8 +2604,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; + task_ctx_sched_out(cpuctx, ctx); raw_spin_unlock(&ctx->lock); } } @@ -2695,20 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_event_context *ctx) -{ - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; -} - /* * Called with IRQs disabled */ @@ -2783,25 +2773,40 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type, struct task_struct *task) { - u64 now; int is_active = ctx->is_active; + u64 now; + + lockdep_assert_held(&ctx->lock); - ctx->is_active |= event_type; if (likely(!ctx->nr_events)) return; - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); + ctx->is_active |= (event_type | EVENT_TIME); + if (ctx->task) { + if (!is_active) + cpuctx->task_ctx = ctx; + else + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* start ctx time */ + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + } + /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) + if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) + if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } @@ -2831,12 +2836,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * cpu flexible, task flexible. */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - if (ctx->nr_events) - cpuctx->task_ctx = ctx; - - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); - + perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); } @@ -2858,6 +2858,16 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; + /* + * If cgroup events exist on this CPU, then we need to check if we have + * to switch in PMU state; cgroup event are system-wide mode only. + * + * Since cgroup events are CPU events, we must schedule these in before + * we schedule in the task events. + */ + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_sched_in(prev, task); + for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -2865,13 +2875,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, perf_event_context_sched_in(ctx, task); } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. 
- * cgroup event are system-wide mode only - */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); @@ -3157,46 +3160,31 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; - int ret; local_irq_save(flags); ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current, NULL); - - raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx); - - list_for_each_entry(event, &ctx->event_list, event_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } + cpuctx = __get_cpu_context(ctx); + perf_ctx_lock(cpuctx, ctx); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + list_for_each_entry(event, &ctx->event_list, event_entry) + enabled |= event_enable_on_exec(event, ctx); /* - * Unclone this context if we enabled any event. + * Unclone and reschedule this context if we enabled any event. */ - if (enabled) + if (enabled) { clone_ctx = unclone_ctx(ctx); + ctx_resched(cpuctx, ctx); + } + perf_ctx_unlock(cpuctx, ctx); - raw_spin_unlock(&ctx->lock); - - /* - * Also calls ctxswin for cgroup events, if any: - */ - perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); @@ -3392,7 +3380,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3579,11 +3566,13 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) static void unaccount_event(struct perf_event *event) { + bool dec = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -3593,17 +3582,30 @@ static void unaccount_event(struct perf_event *event) if (event->attr.freq) atomic_dec(&nr_freq_events); if (event->attr.context_switch) { - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (has_branch_stack(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; + + if (dec) { + if (!atomic_add_unless(&perf_sched_count, -1, 1)) + schedule_delayed_work(&perf_sched_work, HZ); + } unaccount_event_cpu(event, event->cpu); } +static void perf_sched_delayed(struct work_struct *work) +{ + mutex_lock(&perf_sched_mutex); + if (atomic_dec_and_test(&perf_sched_count)) + static_branch_disable(&perf_sched_events); + mutex_unlock(&perf_sched_mutex); +} + /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled @@ -3614,7 +3616,7 @@ static void unaccount_event(struct perf_event *event) * 3) two matching events on the same context. 
* * The former two cases are handled in the allocation path (perf_event_alloc(), - * __free_event()), the latter -- before the first perf_install_in_context(). + * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { @@ -3689,29 +3691,6 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void __free_event(struct perf_event *event) -{ - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } - - call_rcu(&event->rcu_head, free_event_rcu); -} - static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3733,7 +3712,25 @@ static void _free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } + + perf_event_free_bpf_prog(event); + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + if (event->pmu) { + exclusive_event_destroy(event); + module_put(event->pmu->module); + } + + call_rcu(&event->rcu_head, free_event_rcu); } /* @@ -3760,14 +3757,13 @@ static void perf_remove_from_owner(struct perf_event *event) struct task_struct *owner; rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on + * Matches the smp_store_release() in perf_event_exit_task(). If we + * observe !owner it means the list deletion is complete and we can + * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - smp_read_barrier_depends(); + owner = lockless_dereference(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -3795,8 +3791,10 @@ static void perf_remove_from_owner(struct perf_event *event) * ensured they're done, and we can proceed with freeing the * event. */ - if (event->owner) + if (event->owner) { list_del_init(&event->owner_entry); + smp_store_release(&event->owner, NULL); + } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } @@ -3804,37 +3802,111 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { - struct perf_event_context *ctx; - if (!atomic_long_dec_and_test(&event->refcount)) return; + _free_event(event); +} + +/* + * Kill an event dead; while event:refcount will preserve the event + * object, it will not preserve its functionality. Once the last 'user' + * gives up the object, we'll destroy the thing. + */ +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *child, *tmp; + + /* + * If we got here through err_file: fput(event_file); we will not have + * attached to a context yet. 
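
perf_remove_from_owner() now pairs a lockless_dereference() of event->owner with the smp_store_release() that clears it, replacing the old smp_wmb()/smp_read_barrier_depends() combination. In C11 terms that is a release store observed by an acquire load; a minimal sketch of the pattern, not the kernel primitives themselves:

#include <stdatomic.h>
#include <stdio.h>

struct owner { int pid; };

static struct owner boss = { 42 };
static _Atomic(struct owner *) event_owner = &boss;

/* Teardown side: publish "list deletion complete" by storing NULL with
 * release semantics (cf. smp_store_release(&event->owner, NULL)). */
static void owner_exit(void)
{
	atomic_store_explicit(&event_owner, NULL, memory_order_release);
}

/* Reader side: an acquire load (cf. lockless_dereference()); observing
 * NULL guarantees the owner's earlier teardown writes are visible too. */
static void try_remove(void)
{
	struct owner *o = atomic_load_explicit(&event_owner, memory_order_acquire);

	if (o)
		printf("serialize against owner %d\n", o->pid);
	else
		printf("owner gone, free directly\n");
}

int main(void)
{
	try_remove();
	owner_exit();
	try_remove();
	return 0;
}
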
+ */ + if (!ctx) { + WARN_ON_ONCE(event->attach_state & + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); + goto no_ctx; + } + if (!is_kernel_event(event)) perf_remove_from_owner(event); + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(ctx->parent_ctx); + perf_remove_from_context(event, DETACH_GROUP); + + raw_spin_lock_irq(&ctx->lock); /* - * There are two ways this annotation is useful: + * Mark this even as STATE_DEAD, there is no external reference to it + * anymore. * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. + * Anybody acquiring event->child_mutex after the below loop _must_ + * also see this, most importantly inherit_event() which will avoid + * placing more children on the list. * - * 2) there is a lock-inversion with mmap_sem through - * perf_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. + * Thus this guarantees that we will in fact observe and kill _ALL_ + * child events. */ - ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); - WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, true); + event->state = PERF_EVENT_STATE_DEAD; + raw_spin_unlock_irq(&ctx->lock); + perf_event_ctx_unlock(event, ctx); - _free_event(event); -} +again: + mutex_lock(&event->child_mutex); + list_for_each_entry(child, &event->child_list, child_list) { -int perf_event_release_kernel(struct perf_event *event) -{ - put_event(event); + /* + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ + ctx = lockless_dereference(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. + * + * Since the event cannot get freed while we hold the + * child_mutex, the context must also exist and have a !0 + * reference count. + */ + get_ctx(ctx); + + /* + * Now that we have a ctx ref, we can drop child_mutex, and + * acquire ctx::mutex without fear of it going away. Then we + * can re-acquire child_mutex. + */ + mutex_unlock(&event->child_mutex); + mutex_lock(&ctx->mutex); + mutex_lock(&event->child_mutex); + + /* + * Now that we hold ctx::mutex and child_mutex, revalidate our + * state, if child is still the first entry, it didn't get freed + * and we can continue doing so. + */ + tmp = list_first_entry_or_null(&event->child_list, + struct perf_event, child_list); + if (tmp == child) { + perf_remove_from_context(child, DETACH_GROUP); + list_del(&child->child_list); + free_event(child); + /* + * This matches the refcount bump in inherit_event(); + * this can't be the last reference. + */ + put_event(event); + } + + mutex_unlock(&event->child_mutex); + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + mutex_unlock(&event->child_mutex); + +no_ctx: + put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -3844,46 +3916,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { - put_event(file->private_data); + perf_event_release_kernel(file->private_data); return 0; } -/* - * Remove all orphanes events from the context. 
- */ -static void orphans_remove_work(struct work_struct *work) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = container_of(work, struct perf_event_context, - orphans_remove.work); - - mutex_lock(&ctx->mutex); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { - struct perf_event *parent_event = event->parent; - - if (!is_orphaned_child(event)) - continue; - - perf_remove_from_context(event, true); - - mutex_lock(&parent_event->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent_event->child_mutex); - - free_event(event); - put_event(parent_event); - } - - raw_spin_lock_irq(&ctx->lock); - ctx->orphans_remove_sched = false; - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -4027,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event) { bool no_children; - if (event->state != PERF_EVENT_STATE_EXIT) + if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); @@ -4112,7 +4148,7 @@ static void _perf_event_reset(struct perf_event *event) /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the + * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, @@ -4144,20 +4180,14 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -struct period_event { - struct perf_event *event; - u64 value; -}; - -static int __perf_event_period(void *info) +static void __perf_event_period(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct period_event *pe = info; - struct perf_event *event = pe->event; - struct perf_event_context *ctx = event->ctx; - u64 value = pe->value; + u64 value = *((u64 *)info); bool active; - raw_spin_lock(&ctx->lock); if (event->attr.freq) { event->attr.sample_freq = value; } else { @@ -4177,16 +4207,10 @@ static int __perf_event_period(void *info) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } - raw_spin_unlock(&ctx->lock); - - return 0; } static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct period_event pe = { .event = event, }; - struct perf_event_context *ctx = event->ctx; - struct task_struct *task; u64 value; if (!is_sampling_event(event)) @@ -4201,34 +4225,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; - task = ctx->task; - pe.value = value; - - if (!task) { - cpu_function_call(event->cpu, __perf_event_period, &pe); - return 0; - } - -retry: - if (!task_function_call(task, __perf_event_period, &pe)) - return 0; - - raw_spin_lock_irq(&ctx->lock); - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - task = ctx->task; - goto retry; - } - - if (event->attr.freq) { - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } - - local64_set(&event->hw.period_left, 0); - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_event_period, &value); return 0; } @@ -4940,9 +4937,9 @@ static int 
perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval < 0) return retval; @@ -5000,7 +4997,7 @@ static void perf_pending_event(struct irq_work *entry) if (event->pending_disable) { event->pending_disable = 0; - __perf_event_disable(event); + perf_event_disable_local(event); } if (event->pending_wakeup) { @@ -7821,11 +7818,13 @@ static void account_event_cpu(struct perf_event *event, int cpu) static void account_event(struct perf_event *event) { + bool inc = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -7838,12 +7837,35 @@ static void account_event(struct perf_event *event) } if (event->attr.context_switch) { atomic_inc(&nr_switch_events); - static_key_slow_inc(&perf_sched_events.key); + inc = true; } if (has_branch_stack(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (is_cgroup_event(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; + + if (inc) { + if (atomic_inc_not_zero(&perf_sched_count)) + goto enabled; + + mutex_lock(&perf_sched_mutex); + if (!atomic_read(&perf_sched_count)) { + static_branch_enable(&perf_sched_events); + /* + * Guarantee that all CPUs observe they key change and + * call the perf scheduling hooks before proceeding to + * install events that need them. + */ + synchronize_sched(); + } + /* + * Now that we have waited for the sync_sched(), allow further + * increments to by-pass the mutex. + */ + atomic_inc(&perf_sched_count); + mutex_unlock(&perf_sched_mutex); + } +enabled: account_event_cpu(event, event->cpu); } @@ -8462,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; mutex_lock_double(&gctx->mutex, &ctx->mutex); + if (gctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; @@ -8490,11 +8521,11 @@ SYSCALL_DEFINE5(perf_event_open, * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - perf_remove_from_context(group_leader, false); + perf_remove_from_context(group_leader, 0); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling, false); + perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -8547,6 +8578,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__header_size(event); perf_event__id_header_size(event); + event->owner = current; + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8556,8 +8589,6 @@ SYSCALL_DEFINE5(perf_event_open, put_online_cpus(); - event->owner = current; - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -8582,7 +8613,12 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. 
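
account_event() and unaccount_event() replace the rate-limited static key with the perf_sched_count counter: atomic_inc_not_zero() fast-paths every user after the first, the 0 -> 1 transition takes perf_sched_mutex and synchronizes, and the final decrement is deferred to perf_sched_delayed(). A C11 sketch of both sides with the compare-and-swap loops written out; the mutex and synchronize steps are reduced to prints:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int sched_count;

/* atomic_inc_not_zero(): increment unless the counter is 0. */
static bool inc_not_zero(atomic_int *v)
{
	int c = atomic_load(v);

	while (c != 0)
		if (atomic_compare_exchange_weak(v, &c, c + 1))
			return true;
	return false;
}

/* atomic_add_unless(v, -1, 1): decrement unless that would drop the
 * last reference (value 1). */
static bool dec_unless_one(atomic_int *v)
{
	int c = atomic_load(v);

	while (c != 1)
		if (atomic_compare_exchange_weak(v, &c, c - 1))
			return true;
	return false;
}

static void account(void)
{
	if (inc_not_zero(&sched_count))
		return;			/* fast path: hooks already enabled */
	/* slow path: the kernel does this under perf_sched_mutex, then
	 * increments so later callers bypass the mutex */
	if (atomic_load(&sched_count) == 0)
		printf("enable sched hooks + synchronize\n");
	atomic_fetch_add(&sched_count, 1);
}

static void unaccount(void)
{
	if (!dec_unless_one(&sched_count))
		printf("last user: schedule delayed disable\n");
}

int main(void)
{
	account(); account(); unaccount(); unaccount();
	return 0;
}
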
+ */ + if (!event_file) + free_event(event); err_cpus: put_online_cpus(); err_task: @@ -8624,7 +8660,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } /* Mark owner so we could distinguish it from user events. */ - event->owner = EVENT_OWNER_KERNEL; + event->owner = TASK_TOMBSTONE; account_event(event); @@ -8636,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_unlock; + } + if (!exclusive_event_installable(event, ctx)) { - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); err = -EBUSY; - goto err_free; + goto err_unlock; } perf_install_in_context(ctx, event, cpu); @@ -8650,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_unlock: + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_free: free_event(event); err: @@ -8674,7 +8716,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event, false); + perf_remove_from_context(event, 0); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -8741,33 +8783,15 @@ static void sync_child_event(struct perf_event *child_event, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Make sure user/parent get notified, that we just - * lost one event. - */ - perf_event_wakeup(parent_event); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - put_event(parent_event); } static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) { + struct perf_event *parent_event = child_event->parent; + /* * Do not destroy the 'original' grouping; because of the context * switch optimization the original events could've ended up in a @@ -8780,57 +8804,86 @@ __perf_event_exit_task(struct perf_event *child_event, * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - perf_remove_from_context(child_event, !!child_event->parent); + raw_spin_lock_irq(&child_ctx->lock); + WARN_ON_ONCE(child_ctx->is_active); + + if (parent_event) + perf_group_detach(child_event); + list_del_event(child_event, child_ctx); + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + raw_spin_unlock_irq(&child_ctx->lock); /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. + * Parent events are governed by their filedesc, retain them. */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } else { - child_event->state = PERF_EVENT_STATE_EXIT; + if (!parent_event) { perf_event_wakeup(child_event); + return; } + /* + * Child events can be cleaned up. 
+ */ + + sync_child_event(child_event, child); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Kick perf_poll() for is_event_hup(). + */ + perf_event_wakeup(parent_event); + free_event(child_event); + put_event(parent_event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *next; struct perf_event_context *child_ctx, *clone_ctx = NULL; - unsigned long flags; + struct perf_event *child_event, *next; - if (likely(!child->perf_event_ctxp[ctxn])) + WARN_ON_ONCE(child != current); + + child_ctx = perf_pin_task_context(child, ctxn); + if (!child_ctx) return; - local_irq_save(flags); /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. + * In order to reduce the amount of tricky in ctx tear-down, we hold + * ctx::mutex over the entire thing. This serializes against almost + * everything that wants to access the ctx. + * + * The exception is sys_perf_event_open() / + * perf_event_create_kernel_count() which does find_get_context() + * without ctx::mutex (it cannot because of the move_group double mutex + * lock thing). See the comments in perf_install_in_context(). */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + mutex_lock(&child_ctx->mutex); /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. + * In a single ctx::lock section, de-schedule the events and detach the + * context from the task such that we cannot ever get it scheduled back + * in. */ - raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx); - child->perf_event_ctxp[ctxn] = NULL; + raw_spin_lock_irq(&child_ctx->lock); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. + * Now that the context is inactive, destroy the task <-> ctx relation + * and mark the context dead. */ + RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + put_ctx(child_ctx); /* cannot be last */ + WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); + put_task_struct(current); /* cannot be last */ + clone_ctx = unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -8842,20 +8895,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ perf_event_task(child, child_ctx, 0); - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * put_event() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. 
- */ - mutex_lock(&child_ctx->mutex); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - __perf_event_exit_task(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); @@ -8880,8 +8921,7 @@ void perf_event_exit_task(struct task_struct *child) * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ - smp_wmb(); - event->owner = NULL; + smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); @@ -8964,21 +9004,20 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } -struct perf_event *perf_event_get(unsigned int fd) +struct file *perf_event_get(unsigned int fd) { - int err; - struct fd f; - struct perf_event *event; + struct file *file; - err = perf_fget_light(fd, &f); - if (err) - return ERR_PTR(err); + file = fget_raw(fd); + if (!file) + return ERR_PTR(-EBADF); - event = f.file->private_data; - atomic_long_inc(&event->refcount); - fdput(f); + if (file->f_op != &perf_fops) { + fput(file); + return ERR_PTR(-EBADF); + } - return event; + return file; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) @@ -9021,8 +9060,16 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + /* + * is_orphaned_event() and list_add_tail(&parent_event->child_list) + * must be under the same lock in order to serialize against + * perf_event_release_kernel(), such that either we must observe + * is_orphaned_event() or they will observe us on the child_list. + */ + mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { + mutex_unlock(&parent_event->child_mutex); free_event(child_event); return NULL; } @@ -9070,8 +9117,6 @@ inherit_event(struct perf_event *parent_event, /* * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); @@ -9276,7 +9321,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -9289,13 +9334,14 @@ static void perf_event_init_cpu(int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *event; - rcu_read_lock(); - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) - __perf_remove_from_context(&re); - rcu_read_unlock(); + raw_spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->event_list, event_entry) + __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); + raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) @@ -9351,11 +9397,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; - case CPU_UP_CANCELED: case 
CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; @@ -9384,9 +9428,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); - /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4cc..3f8cb1e14 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) - __perf_event_disable(bp); + perf_event_disable_local(bp); else perf_event_disable(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index adfdc0536..1faad2cfd 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) __free_page(page); } +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } +} + int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { @@ -547,30 +566,11 @@ out: if (!ret) rb->aux_pgoff = pgoff; else - rb_free_aux(rb); + __rb_free_aux(rb); return ret; } -static void __rb_free_aux(struct ring_buffer *rb) -{ - int pg; - - if (rb->aux_priv) { - rb->free_aux(rb->aux_priv); - rb->free_aux = NULL; - rb->aux_priv = NULL; - } - - if (rb->aux_nr_pages) { - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); - - kfree(rb->aux_pages); - rb->aux_nr_pages = 0; - } -} - void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7dad84913..016767918 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + false); if (err) return err; @@ -175,12 +176,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, goto unlock; get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr); - mem_cgroup_commit_charge(kpage, memcg, false); + page_add_new_anon_rmap(kpage, vma, addr, false); + mem_cgroup_commit_charge(kpage, memcg, false, false); lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); pte_unmap_unlock(ptep, ptl); @@ -199,7 +200,7 @@ static int 
__replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg); + mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; diff --git a/kernel/exit.c b/kernel/exit.c index 07110c602..10e088237 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,8 +59,6 @@ #include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct *tsk); - static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -1120,8 +1118,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p) && - !(p->jobctl & JOBCTL_LISTENING)) + if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) diff --git a/kernel/fork.c b/kernel/fork.c index 0b59aed29..f91740137 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -138,7 +138,7 @@ static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) { - return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node); + return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } static inline void free_task_struct(struct task_struct *tsk) @@ -300,9 +300,9 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", arch_task_struct_size, - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + task_struct_cachep = kmem_cache_create("task_struct", + arch_task_struct_size, ARCH_MIN_TASKALIGN, + SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); mm->total_vm = oldmm->total_vm; - mm->shared_vm = oldmm->shared_vm; + mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; @@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -vma_pages(mpnt)); + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; @@ -465,7 +464,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; - vma_get_file(tmp); + get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); i_mmap_lock_write(mapping); @@ -1250,7 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1349,9 +1347,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqlock_init(&p->vtime_seqlock); + seqcount_init(&p->vtime_seqcount); p->vtime_snap = 0; - p->vtime_snap_whence = VTIME_SLEEPING; + p->vtime_snap_whence = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) @@ -1527,7 +1525,7 @@ static struct task_struct 
*copy_process(unsigned long clone_flags, * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p, cgrp_ss_priv); + retval = cgroup_can_fork(p); if (retval) goto bad_fork_free_pid; @@ -1609,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p, cgrp_ss_priv); + cgroup_post_fork(p); threadgroup_change_end(current); perf_event_fork(p); @@ -1619,7 +1617,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, cgrp_ss_priv); + cgroup_cancel_fork(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); @@ -1849,16 +1847,19 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change @@ -1868,8 +1869,9 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/futex.c b/kernel/futex.c index 461c72b2d..5d6ce6413 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page, *page_head; + struct page *page; + struct address_space *mapping; int err, ro = 0; /* @@ -519,46 +520,9 @@ again: else err = 0; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - page_head = page; - if (unlikely(PageTail(page))) { - put_page(page); - /* serialize against __split_huge_page_splitting() */ - local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { - page_head = compound_head(page); - /* - * page_head is valid pointer but we must pin - * it before taking the PG_lock and/or - * PG_compound_lock. The moment we re-enable - * irqs __split_huge_page_splitting() can - * return and the head page can be freed from - * under us. We can't take the PG_lock and/or - * PG_compound_lock on a page that could be - * freed from under us. 
- */ - if (page != page_head) { - get_page(page_head); - put_page(page); - } - local_irq_enable(); - } else { - local_irq_enable(); - goto again; - } - } -#else - page_head = compound_head(page); - if (page != page_head) { - get_page(page_head); - put_page(page); - } -#endif - - lock_page(page_head); - + lock_page(page); /* - * If page_head->mapping is NULL, then it cannot be a PageAnon + * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast @@ -570,12 +534,13 @@ again: * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page_head->mapping. + * an unlikely race, but we do need to retry for page->mapping. */ - if (!page_head->mapping) { - int shmem_swizzled = PageSwapCache(page_head); - unlock_page(page_head); - put_page(page_head); + mapping = compound_head(page)->mapping; + if (!mapping) { + int shmem_swizzled = PageSwapCache(page); + unlock_page(page); + put_page(page); if (shmem_swizzled) goto again; return -EFAULT; @@ -588,7 +553,7 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page_head)) { + if (PageAnon(page)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. @@ -603,15 +568,15 @@ again: key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page_head->mapping->host; + key->shared.inode = mapping->host; key->shared.pgoff = basepage_index(page); } get_futex_key_refs(key); /* implies MB (B) */ out: - unlock_page(page_head); - put_page(page_head); + unlock_page(page); + put_page(page); return err; } @@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) down_read(&mm->mmap_sem); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE, NULL); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; @@ -725,9 +690,12 @@ static struct futex_pi_state * alloc_pi_state(void) } /* + * Drops a reference to the pi_state object and frees or caches it + * when the last reference is gone. + * * Must be called with the hb lock held. 
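
The free_pi_state() -> put_pi_state() rename matches what the function actually does: drop one reference and, only when the last one goes, either recycle the object into the task's one-slot pi_state_cache or free it. A reduced userspace sketch of that put-with-cache pattern (single global slot, no locking, purely illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct pi_state {
	atomic_int refcount;
};

static struct pi_state *cached;		/* one slot, cf. task->pi_state_cache */

static void put_pi_state(struct pi_state *ps)
{
	if (!ps)
		return;
	if (atomic_fetch_sub(&ps->refcount, 1) != 1)
		return;			/* not the last reference */

	if (!cached) {			/* recycle for the next allocation */
		atomic_store(&ps->refcount, 1);
		cached = ps;
		printf("cached\n");
	} else {
		free(ps);
		printf("freed\n");
	}
}

int main(void)
{
	struct pi_state *ps = calloc(1, sizeof(*ps));

	atomic_store(&ps->refcount, 2);
	put_pi_state(ps);		/* drops to 1 */
	put_pi_state(ps);		/* last reference: cached */
	return 0;
}
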
*/ -static void free_pi_state(struct futex_pi_state *pi_state) +static void put_pi_state(struct futex_pi_state *pi_state) { if (!pi_state) return; @@ -1223,7 +1191,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (pi_state->owner != current) return -EINVAL; - raw_spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -1249,22 +1217,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else if (curval != uval) ret = -EINVAL; if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); - raw_spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - raw_spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); @@ -1706,31 +1674,35 @@ retry_private: * exist yet, look it up one more time to ensure we have a * reference to it. If the lock was taken, ret contains the * vpid of the top waiter task. + * If the lock was not taken, we have pi_state and an initial + * refcount on it. In case of an error we have nothing. */ if (ret > 0) { WARN_ON(pi_state); drop_count++; task_count++; /* - * If we acquired the lock, then the user - * space value of uaddr2 should be vpid. It - * cannot be changed by the top waiter as it - * is blocked on hb2 lock if it tries to do - * so. If something fiddled with it behind our - * back the pi state lookup might unearth - * it. So we rather use the known value than - * rereading and handing potential crap to - * lookup_pi_state. + * If we acquired the lock, then the user space value + * of uaddr2 should be vpid. It cannot be changed by + * the top waiter as it is blocked on hb2 lock if it + * tries to do so. If something fiddled with it behind + * our back the pi state lookup might unearth it. So + * we rather use the known value than rereading and + * handing potential crap to lookup_pi_state. + * + * If that call succeeds then we have pi_state and an + * initial refcount on it. */ ret = lookup_pi_state(ret, hb2, &key2, &pi_state); } switch (ret) { case 0: + /* We hold a reference on the pi state. */ break; + + /* If the above failed, then pi_state is NULL */ case -EFAULT: - free_pi_state(pi_state); - pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1746,8 +1718,6 @@ retry_private: * exit to complete. * - The user space value changed. */ - free_pi_state(pi_state); - pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1801,30 +1771,58 @@ retry_private: * of requeue_pi if we couldn't acquire the lock atomically. */ if (requeue_pi) { - /* Prepare the waiter to take the rt_mutex. */ + /* + * Prepare the waiter to take the rt_mutex. Take a + * refcount on the pi_state and store the pointer in + * the futex_q object of the waiter. 
+ */ atomic_inc(&pi_state->refcount); this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, this->task); if (ret == 1) { - /* We got the lock. */ + /* + * We got the lock. We do neither drop the + * refcount on pi_state nor clear + * this->pi_state because the waiter needs the + * pi_state for cleaning up the user space + * value. It will drop the refcount after + * doing so. + */ requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; } else if (ret) { - /* -EDEADLK */ + /* + * rt_mutex_start_proxy_lock() detected a + * potential deadlock when we tried to queue + * that waiter. Drop the pi_state reference + * which we took above and remove the pointer + * to the state from the waiters futex_q + * object. + */ this->pi_state = NULL; - free_pi_state(pi_state); - goto out_unlock; + put_pi_state(pi_state); + /* + * We stop queueing more waiters and let user + * space deal with the mess. + */ + break; } } requeue_futex(this, hb1, hb2, &key2); drop_count++; } + /* + * We took an extra initial reference to the pi_state either + * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We + * need to drop it here again. + */ + put_pi_state(pi_state); + out_unlock: - free_pi_state(pi_state); double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); @@ -1973,7 +1971,7 @@ static void unqueue_me_pi(struct futex_q *q) __unqueue_futex(q); BUG_ON(!q->pi_state); - free_pi_state(q->pi_state); + put_pi_state(q->pi_state); q->pi_state = NULL; spin_unlock(q->lock_ptr); @@ -2129,11 +2127,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * we returned due to timeout or signal without taking the * rt_mutex. Too late. */ - raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); if (!owner) owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } @@ -2759,7 +2757,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * Drop the reference to the pi state which * the requeue_pi() code acquired for us. */ - free_pi_state(q.pi_state); + put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); } } else { @@ -3051,7 +3049,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (op & FUTEX_CLOCK_REALTIME) { flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ + cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; } diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 7080ae1eb..2f9df3794 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -123,11 +123,6 @@ void gcov_enable_events(void) } #ifdef CONFIG_MODULES -static inline int within(void *addr, void *start, unsigned long size) -{ - return ((addr >= start) && (addr < start + size)); -} - /* Update list and generate events when modules are unloaded. */ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, void *data) @@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. 
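
gcov's module notifier drops its private within() helper for the generic within_module(), which also covers a module's init region. The underlying test is a half-open address-range check, sketched below; within_module() itself is real kernel API, the rest is illustrative:

#include <assert.h>
#include <stdint.h>

/* Half-open containment test, start <= addr < start + size, the same
 * shape as the removed gcov helper and within_module_core(). */
static int within(uintptr_t addr, uintptr_t start, uintptr_t size)
{
	return addr >= start && addr - start < size;
}

int main(void)
{
	assert(within(0x1000, 0x1000, 0x100));	/* first byte counts */
	assert(!within(0x1100, 0x1000, 0x100));	/* one past the end does not */
	return 0;
}
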
*/ while ((info = gcov_info_next(info))) { - if (within(info, mod->module_core, mod->core_size)) { + if (within_module((unsigned long)info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 15206453b..5797909f4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -338,7 +338,6 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { @@ -346,6 +345,7 @@ void handle_nested_irq(unsigned int irq) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -412,13 +412,13 @@ void handle_simple_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); out_unlock: @@ -462,7 +462,6 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -473,6 +472,7 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); @@ -532,7 +532,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -544,6 +543,7 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; } + kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); @@ -950,6 +950,7 @@ void irq_chip_ack_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_ack(data); } +EXPORT_SYMBOL_GPL(irq_chip_ack_parent); /** * irq_chip_mask_parent - Mask the parent interrupt diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 239e2ae2c..0409da0bc 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -159,6 +159,7 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_rcu_head(&desc->rcu); desc_set_defaults(irq, desc, node, owner); @@ -171,6 +172,15 @@ err_desc: return NULL; } +static void delayed_free_desc(struct rcu_head *rhp) +{ + struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + + free_masks(desc); + free_percpu(desc->kstat_irqs); + kfree(desc); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -187,9 +197,12 @@ static void free_desc(unsigned int irq) delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); - free_masks(desc); - free_percpu(desc->kstat_irqs); - kfree(desc); + /* + * We free the descriptor, masks and stat fields via RCU. That + * allows demultiplex interrupts to do rcu based management of + * the child interrupts. 
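+ *
+ * Reader side this enables (assumed usage, for illustration):
+ *
+ *	rcu_read_lock();
+ *	desc = irq_to_desc(irq);
+ *	if (desc)
+ *		... desc cannot be freed here ...
+ *	rcu_read_unlock();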
+ */ + call_rcu(&desc->rcu, delayed_free_desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 22aa9612e..3e56d2f03 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -60,6 +60,7 @@ struct fwnode_handle *irq_domain_alloc_fwnode(void *data) fwid->fwnode.type = FWNODE_IRQCHIP; return &fwid->fwnode; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode); /** * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle @@ -70,13 +71,14 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid; - if (WARN_ON(fwnode->type != FWNODE_IRQCHIP)) + if (WARN_ON(!is_fwnode_irqchip(fwnode))) return; fwid = container_of(fwnode, struct irqchip_fwid, fwnode); kfree(fwid->name); kfree(fwid); } +EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); /** * __irq_domain_add() - Allocate a new irq_domain data structure @@ -573,10 +575,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) unsigned int type = IRQ_TYPE_NONE; int virq; - if (fwspec->fwnode) - domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); - else + if (fwspec->fwnode) { + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_WIRED); + if (!domain) + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_ANY); + } else { domain = irq_default_domain; + } if (!domain) { pr_warn("no irq domain found for %s !\n", @@ -1013,6 +1020,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return NULL; } +EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain @@ -1058,6 +1066,7 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, __irq_set_handler(virq, handler, 0, handler_name); irq_set_handler_data(virq, handler_data); } +EXPORT_SYMBOL(irq_domain_set_info); /** * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data @@ -1125,9 +1134,9 @@ static void irq_domain_free_irqs_recursive(struct irq_domain *domain, } } -static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg) { int ret = 0; struct irq_domain *parent = domain->parent; @@ -1343,6 +1352,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return (irq_data && irq_data->domain == domain) ? irq_data : NULL; } +EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_info - Set the complete data for a @virq in @domain diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6ead20037..841187239 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1743,6 +1743,31 @@ out: } EXPORT_SYMBOL_GPL(enable_percpu_irq); +/** + * irq_percpu_is_enabled - Check whether the per cpu irq is enabled + * @irq: Linux irq number to check for + * + * Must be called from a non migratable context. Returns the enable + * state of a per cpu interrupt on the current cpu. 
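+ *
+ * Example (hypothetical caller, for illustration):
+ *
+ *	get_cpu();
+ *	enabled = irq_percpu_is_enabled(irq);
+ *	put_cpu();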
+ */ +bool irq_percpu_is_enabled(unsigned int irq) +{ + unsigned int cpu = smp_processor_id(); + struct irq_desc *desc; + unsigned long flags; + bool is_enabled; + + desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return false; + + is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); + irq_put_desc_unlock(desc, flags); + + return is_enabled; +} +EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); + void disable_percpu_irq(unsigned int irq) { unsigned int cpu = smp_processor_id(); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6b0c0b74a..38e89ce7b 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -109,9 +109,11 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, if (irq_find_mapping(domain, hwirq) > 0) return -EEXIST; - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret < 0) - return ret; + if (domain->parent) { + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + } for (i = 0; i < nr_irqs; i++) { ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); @@ -252,6 +254,60 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, &msi_domain_ops, info); } +int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + int ret; + + ret = ops->msi_check(domain, info, dev); + if (ret == 0) + ret = ops->msi_prepare(domain, dev, nvec, arg); + + return ret; +} + +int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, + int virq, int nvec, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + struct msi_desc *desc; + int ret = 0; + + for_each_msi_entry(desc, dev) { + /* Don't even try the multi-MSI brain damage. */ + if (WARN_ON(!desc->irq || desc->nvec_used != 1)) { + ret = -EINVAL; + break; + } + + if (!(desc->irq >= virq && desc->irq < (virq + nvec))) + continue; + + ops->set_desc(arg, desc); + /* Assumes the domain mutex is held! */ + ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg); + if (ret) + break; + + irq_set_msi_desc_off(virq, 0, desc); + } + + if (ret) { + /* Mop up the damage */ + for_each_msi_entry(desc, dev) { + if (!(desc->irq >= virq && desc->irq < (virq + nvec))) + continue; + + irq_domain_free_irqs_common(domain, desc->irq, 1); + } + } + + return ret; +} + /** * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from @@ -270,9 +326,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_desc *desc; int i, ret, virq = -1; - ret = ops->msi_check(domain, info, dev); - if (ret == 0) - ret = ops->msi_prepare(domain, dev, nvec, &arg); + ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) return ret; diff --git a/kernel/kexec.c b/kernel/kexec.c index d873b64fb..ee70aef5c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -63,16 +63,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (ret) goto out_free_image; - ret = sanity_check_segment_list(image); - if (ret) - goto out_free_image; - - /* Enable the special crash kernel control page allocation policy. */ if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. 
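+ * (control pages must come from the reserved crashk_res range so
+ * they remain usable after the production kernel has crashed)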
*/ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 11b64a63c..8dc659144 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -310,12 +310,9 @@ static void kimage_free_pages(struct page *page) void kimage_free_page_list(struct list_head *list) { - struct list_head *pos, *next; + struct page *page, *next; - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); + list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } @@ -853,7 +850,12 @@ struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; -void crash_kexec(struct pt_regs *regs) +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel @@ -876,6 +878,29 @@ void crash_kexec(struct pt_regs *regs) } } +void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + if (old_cpu == PANIC_CPU_INVALID) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + size_t crash_get_memory_size(void) { size_t size = 0; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada002..007b791f6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -109,11 +109,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return -EINVAL; } +#ifdef CONFIG_KEXEC_VERIFY_SIG int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { return -EKEYREJECTED; } +#endif /* Apply relocations of type RELA */ int __weak diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index e4392a698..0a52315d9 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. 
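+ *
+ * buffer/bufsz describe the caller's data and mem/memsz the
+ * (possibly larger) destination in the kexec image; buf_min,
+ * buf_max and buf_align constrain where that destination may be
+ * placed.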
+ */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long mem; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; + void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e83b26464..152da4a48 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -20,7 +20,7 @@ #include <linux/capability.h> #include <linux/compiler.h> -#include <linux/rcupdate.h> /* rcu_expedited */ +#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +#ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", rcu_expedited); + return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj, } KERNEL_ATTR_RW(rcu_expedited); +int rcu_normal; +static ssize_t rcu_normal_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); +} +static ssize_t rcu_normal_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_normal)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_normal); +#endif /* #ifndef CONFIG_TINY_RCU */ + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
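* (tools such as perf read this file to obtain the kernel's
* build ID notes)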
*/ @@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif +#ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, +#endif NULL }; diff --git a/kernel/kthread.c b/kernel/kthread.c index 12d8a8f88..9ff173dca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -275,7 +275,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), - GFP_KERNEL | ___GFP_TOI_NOTRACK); + GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a02812743..b5c30d9f4 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -47,12 +47,12 @@ * of times) */ -#include <linux/latencytop.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> +#include <linux/latencytop.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/list.h> @@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } + +int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} device_initcall(init_lstats_procfs); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index db545cbcd..bc2c85c06 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -28,6 +28,7 @@ #include <linux/list.h> #include <linux/kallsyms.h> #include <linux/livepatch.h> +#include <asm/cacheflush.h> /** * struct klp_ops - structure for tracking registered ftrace ops structs @@ -135,13 +136,8 @@ struct klp_find_arg { const char *objname; const char *name; unsigned long addr; - /* - * If count == 0, the symbol was not found. If count == 1, a unique - * match was found and addr is set. If count > 1, there is - * unresolvable ambiguity among "count" number of symbols with the same - * name in the same object. - */ unsigned long count; + unsigned long pos; }; static int klp_find_callback(void *data, const char *name, @@ -158,37 +154,48 @@ static int klp_find_callback(void *data, const char *name, if (args->objname && strcmp(args->objname, mod->name)) return 0; - /* - * args->addr might be overwritten if another match is found - * but klp_find_object_symbol() handles this and only returns the - * addr if count == 1. - */ args->addr = addr; args->count++; + /* + * Finish the search when the symbol is found for the desired position + * or the position is not defined for a non-unique symbol. + */ + if ((args->pos && (args->count == args->pos)) || + (!args->pos && (args->count > 1))) + return 1; + return 0; } static int klp_find_object_symbol(const char *objname, const char *name, - unsigned long *addr) + unsigned long sympos, unsigned long *addr) { struct klp_find_arg args = { .objname = objname, .name = name, .addr = 0, - .count = 0 + .count = 0, + .pos = sympos, }; mutex_lock(&module_mutex); kallsyms_on_each_symbol(klp_find_callback, &args); mutex_unlock(&module_mutex); - if (args.count == 0) + /* + * Ensure an address was found. If sympos is 0, ensure symbol is unique; + * otherwise ensure the symbol position count matches sympos. 
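+ *
+ * Example: if "foo" occurs twice in the object, sympos == 0 fails
+ * as ambiguous, while sympos == 1 or sympos == 2 selects the
+ * first or second occurrence in kallsyms order.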
+ */ + if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); - else if (args.count > 1) + else if (args.count > 1 && sympos == 0) { pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", args.count, name, objname); - else { + } else if (sympos != args.count && sympos > 0) { + pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", + sympos, name, objname ? objname : "vmlinux"); + } else { *addr = args.addr; return 0; } @@ -197,66 +204,6 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -struct klp_verify_args { - const char *name; - const unsigned long addr; -}; - -static int klp_verify_callback(void *data, const char *name, - struct module *mod, unsigned long addr) -{ - struct klp_verify_args *args = data; - - if (!mod && - !strcmp(args->name, name) && - args->addr == addr) - return 1; - - return 0; -} - -static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) -{ - struct klp_verify_args args = { - .name = name, - .addr = addr, - }; - int ret; - - mutex_lock(&module_mutex); - ret = kallsyms_on_each_symbol(klp_verify_callback, &args); - mutex_unlock(&module_mutex); - - if (!ret) { - pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", - name, addr); - return -EINVAL; - } - - return 0; -} - -static int klp_find_verify_func_addr(struct klp_object *obj, - struct klp_func *func) -{ - int ret; - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old_addr accordingly */ - if (kaslr_enabled() && func->old_addr) - func->old_addr += kaslr_offset(); -#endif - - if (!func->old_addr || klp_is_module(obj)) - ret = klp_find_object_symbol(obj->name, func->old_name, - &func->old_addr); - else - ret = klp_verify_vmlinux_symbol(func->old_name, - func->old_addr); - - return ret; -} - /* * external symbols are located outside the parent object (where the parent * object is either vmlinux or the kmod being patched). @@ -276,14 +223,18 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, } preempt_enable(); - /* otherwise check if it's in another .o within the patch module */ - return klp_find_object_symbol(pmod->name, name, addr); + /* + * Check if it's in another .o within the patch module. This also + * checks that the external symbol is unique. 
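+ * (passing sympos == 0 below is what enforces that uniqueness)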
+ */ + return klp_find_object_symbol(pmod->name, name, 0, addr); } static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { - int ret; + int ret = 0; + unsigned long val; struct klp_reloc *reloc; if (WARN_ON(!klp_is_object_loaded(obj))) @@ -292,41 +243,38 @@ static int klp_write_object_relocations(struct module *pmod, if (WARN_ON(!obj->relocs)) return -EINVAL; + module_disable_ro(pmod); + for (reloc = obj->relocs; reloc->name; reloc++) { - if (!klp_is_module(obj)) { - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old value accordingly */ - if (kaslr_enabled()) - reloc->val += kaslr_offset(); -#endif - ret = klp_verify_vmlinux_symbol(reloc->name, - reloc->val); - if (ret) - return ret; - } else { - /* module, reloc->val needs to be discovered */ - if (reloc->external) - ret = klp_find_external_symbol(pmod, - reloc->name, - &reloc->val); - else - ret = klp_find_object_symbol(obj->mod->name, - reloc->name, - &reloc->val); - if (ret) - return ret; - } + /* discover the address of the referenced symbol */ + if (reloc->external) { + if (reloc->sympos > 0) { + pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", + reloc->name); + ret = -EINVAL; + goto out; + } + ret = klp_find_external_symbol(pmod, reloc->name, &val); + } else + ret = klp_find_object_symbol(obj->name, + reloc->name, + reloc->sympos, + &val); + if (ret) + goto out; + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, - reloc->val + reloc->addend); + val + reloc->addend); if (ret) { pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", - reloc->name, reloc->val, ret); - return ret; + reloc->name, val, ret); + goto out; } } - return 0; +out: + module_enable_ro(pmod); + return ret; } static void notrace klp_ftrace_handler(unsigned long ip, @@ -593,7 +541,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled * /sys/kernel/livepatch/<patch>/<object> - * /sys/kernel/livepatch/<patch>/<object>/<func> + * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -738,8 +686,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; + /* The format for the sysfs directory is <function,sympos> where sympos + * is the nth occurrence of this symbol in kallsyms for the patched + * object. If the user selects 0 for old_sympos, then 1 will be used + * since a unique symbol will be the first occurrence. + */ return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s", func->old_name); + &obj->kobj, "%s,%lu", func->old_name, + func->old_sympos ? 
func->old_sympos : 1); } /* parts of the initialization that is done only when the object is loaded */ @@ -756,7 +710,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, } klp_for_each_func(obj, func) { - ret = klp_find_verify_func_addr(obj, func); + ret = klp_find_object_symbol(obj->name, func->old_name, + func->old_sympos, + &func->old_addr); if (ret) return ret; } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace5661..716547fdb 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -666,7 +666,7 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; #ifdef CONFIG_DEBUG_LOCKDEP @@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? 
Did someone trample @@ -742,7 +742,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) + struct held_lock *next, int distance, int *stack_saved) { struct lock_list *entry; int ret; @@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } - if (!trylock_loop && !save_trace(&trace)) - return 0; + if (!*stack_saved) { + if (!save_trace(&trace)) + return 0; + *stack_saved = 1; + } /* * Ok, all validations passed, add the new lock @@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { + /* We drop graph lock, so another thread can overwrite trace. 
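+ * Clearing *stack_saved forces the next check_prev_add() call to
+ * save a fresh trace.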
*/ + *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1929,7 +1934,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int trylock_loop = 0; + int stack_saved = 0; struct held_lock *hlock; /* @@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) + distance, &stack_saved)) return 0; /* * Stop after the first non-trylock entry, @@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; - trylock_loop = 1; } return 1; out_bug: @@ -2017,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr; int i, j; @@ -2033,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2057,7 +2061,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2091,7 +2095,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -3875,7 +3879,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3894,7 +3898,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3917,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3930,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3962,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3987,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); 
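	/* Scan every hash bucket for classes cached by this lock. */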
for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4027,10 +4027,10 @@ void lockdep_init(void) return; for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); + INIT_HLIST_HEAD(classhash_table + i); for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); lockdep_initialized = 1; } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 87e9ce6a6..393d1874b 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -14,8 +14,9 @@ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long <waiman.long@hp.com> + * Authors: Waiman Long <waiman.long@hpe.com> * Peter Zijlstra <peterz@infradead.org> */ @@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { struct __qspinlock *l = (void *)lock; - return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; + /* + * Use release semantics to make sure that the MCS node is properly + * initialized before changing the tail code. + */ + return (u32)xchg_release(&l->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; } #else /* _Q_PENDING_BITS == 8 */ @@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) for (;;) { new = (val & _Q_LOCKED_PENDING_MASK) | tail; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Use release semantics to make sure that the MCS node is + * properly initialized before changing the tail code. + */ + old = atomic_cmpxchg_release(&lock->val, val, new); if (old == val) break; @@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock) */ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_node(struct mcs_spinlock *node, + struct mcs_spinlock *prev) { } static __always_inline void __pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_head(struct qspinlock *lock, - struct mcs_spinlock *node) { } +static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, + struct mcs_spinlock *node) + { return 0; } #define pv_enabled() false #define pv_init_node __pv_init_node #define pv_wait_node __pv_wait_node #define pv_kick_node __pv_kick_node -#define pv_wait_head __pv_wait_head +#define pv_wait_head_or_lock __pv_wait_head_or_lock #ifdef CONFIG_PARAVIRT_SPINLOCKS #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath @@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (val == new) new |= _Q_PENDING_VAL; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Acquire semantic is required here as the function may + * return immediately if the lock was free. 
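+ * (when val == 0 a successful cmpxchg takes the lock outright, so
+ * the critical section must be ordered after this access)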
+ */ + old = atomic_cmpxchg_acquire(&lock->val, val, new); if (old == val) break; @@ -382,6 +398,7 @@ queue: * p,*,* -> n,*,* */ old = xchg_tail(lock, tail); + next = NULL; /* * if there was a previous node; link it and wait until reaching the @@ -391,8 +408,18 @@ queue: prev = decode_tail(old); WRITE_ONCE(prev->next, node); - pv_wait_node(node); + pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We optimistically load + * the next pointer & prefetch the cacheline for writing + * to reduce latency in the upcoming MCS unlock operation. + */ + next = READ_ONCE(node->next); + if (next) + prefetchw(next); } /* @@ -406,11 +433,22 @@ queue: * sequentiality; this is because the set_locked() function below * does not imply a full barrier. * + * The PV pv_wait_head_or_lock function, if active, will acquire + * the lock and return a non-zero value. So we have to skip the + * smp_load_acquire() call. As the next PV queue head hasn't been + * designated yet, there is no way for the locked value to become + * _Q_SLOW_VAL. So both the set_locked() and the + * atomic_cmpxchg_relaxed() calls will be safe. + * + * If PV isn't active, 0 will be returned instead. + * */ - pv_wait_head(lock, node); - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) - cpu_relax(); + if ((val = pv_wait_head_or_lock(lock, node))) + goto locked; + smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); + +locked: /* * claim the lock: * @@ -422,11 +460,17 @@ queue: * to grab the lock. */ for (;;) { - if (val != tail) { + /* In the PV case we might already have _Q_LOCKED_VAL set */ + if ((val & _Q_TAIL_MASK) != tail) { set_locked(lock); break; } - old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); + /* + * The smp_load_acquire() call above has provided the necessary + * acquire semantics required for locking. At most two + * iterations of this loop may be ran. + */ + old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) goto release; /* No contention */ @@ -434,10 +478,12 @@ queue: } /* - * contended path; wait for next, release. + * contended path; wait for next if not observed yet, release. */ - while (!(next = READ_ONCE(node->next))) - cpu_relax(); + if (!next) { + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + } arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); @@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #undef pv_init_node #undef pv_wait_node #undef pv_kick_node -#undef pv_wait_head +#undef pv_wait_head_or_lock #undef queued_spin_lock_slowpath #define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f0450ff48..87bb235c3 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -23,6 +23,20 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) /* + * Queue Node Adaptive Spinning + * + * A queue node vCPU will stop spinning if the vCPU in the previous node is + * not running. The one lock stealing attempt allowed at slowpath entry + * mitigates the slight slowdown for non-overcommitted guest with this + * aggressive wait-early mechanism. + * + * The status of the previous node will be checked at fixed interval + * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't + * pound on the cacheline of the previous node too heavily. 
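+ *
+ * With a mask of 0xff the previous node's state is sampled once
+ * every 256 spin iterations.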
+ */ +#define PV_PREV_CHECK_MASK 0xff + +/* * Queue node uses: vcpu_running & vcpu_halted. * Queue head uses: vcpu_running & vcpu_hashed. */ @@ -41,6 +55,94 @@ struct pv_node { }; /* + * By replacing the regular queued_spin_trylock() with the function below, + * it will be called once when a lock waiter enter the PV slowpath before + * being queued. By allowing one lock stealing attempt here when the pending + * bit is off, it helps to reduce the performance impact of lock waiter + * preemption without the drawback of lock starvation. + */ +#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) +static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); +} + +/* + * The pending bit is used by the queue head vCPU to indicate that it + * is actively spinning on the lock and no lock stealing is allowed. + */ +#if _Q_PENDING_BITS == 8 +static __always_inline void set_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 1); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 0); +} + +/* + * The pending bit check in pv_queued_spin_steal_lock() isn't a memory + * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock + * just to be sure that it will get it. + */ +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !READ_ONCE(l->locked) && + (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) + == _Q_PENDING_VAL); +} +#else /* _Q_PENDING_BITS == 8 */ +static __always_inline void set_pending(struct qspinlock *lock) +{ + atomic_set_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_clear_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + int val = atomic_read(&lock->val); + + for (;;) { + int old, new; + + if (val & _Q_LOCKED_MASK) + break; + + /* + * Try to clear pending bit & set locked bit + */ + old = val; + new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + val = atomic_cmpxchg(&lock->val, old, new); + + if (val == old) + return 1; + } + return 0; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * Lock and MCS node addresses hash table for fast lookup * * Hashing is done on a per-cacheline basis to minimize the need to access @@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) { unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); struct pv_hash_entry *he; + int hopcnt = 0; for_each_hash_entry(he, offset, hash) { + hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); + qstat_hop(hopcnt); return &he->lock; } } @@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) } /* + * Return true if when it is time to check the previous node which is not + * in a running state. + */ +static inline bool +pv_wait_early(struct pv_node *prev, int loop) +{ + + if ((loop & PV_PREV_CHECK_MASK) != 0) + return false; + + return READ_ONCE(prev->state) != vcpu_running; +} + +/* * Initialize the PV part of the mcs_spinlock node. 
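+ * (struct pv_node overlays the generic mcs_spinlock slots and
+ * must not grow beyond them)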
*/ static void pv_init_node(struct mcs_spinlock *node) @@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node) * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its * behalf. */ -static void pv_wait_node(struct mcs_spinlock *node) +static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; + struct pv_node *pp = (struct pv_node *)prev; + int waitcnt = 0; int loop; + bool wait_early; - for (;;) { - for (loop = SPIN_THRESHOLD; loop; loop--) { + /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ + for (;; waitcnt++) { + for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; + if (pv_wait_early(pp, loop)) { + wait_early = true; + break; + } cpu_relax(); } @@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node) */ smp_store_mb(pn->state, vcpu_halted); - if (!READ_ONCE(node->locked)) + if (!READ_ONCE(node->locked)) { + qstat_inc(qstat_pv_wait_node, true); + qstat_inc(qstat_pv_wait_again, waitcnt); + qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); + } /* - * If pv_kick_node() changed us to vcpu_hashed, retain that value - * so that pv_wait_head() knows to not also try to hash this lock. + * If pv_kick_node() changed us to vcpu_hashed, retain that + * value so that pv_wait_head_or_lock() knows to not also try + * to hash this lock. */ cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. */ + qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); } /* @@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * Called after setting next->locked = 1 when we're the lock owner. * - * Instead of waking the waiters stuck in pv_wait_node() advance their state such - * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. + * Instead of waking the waiters stuck in pv_wait_node() advance their state + * such that they're waiting in pv_wait_head_or_lock(), this avoids a + * wake/sleep cycle. */ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { @@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) } /* - * Wait for l->locked to become clear; halt the vcpu after a short spin. + * Wait for l->locked to become clear and acquire the lock; + * halt the vcpu after a short spin. * __pv_queued_spin_unlock() will wake us. + * + * The current value of the lock will be returned for additional processing. */ -static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +static u32 +pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; + int waitcnt = 0; int loop; /* @@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; - for (;;) { + for (;; waitcnt++) { + /* + * Set correct vCPU state to be used by queue node wait-early + * mechanism. + */ + WRITE_ONCE(pn->state, vcpu_running); + + /* + * Set the pending bit in the active lock spinning loop to + * disable lock stealing before attempting to acquire the lock. 
+ */ + set_pending(lock); for (loop = SPIN_THRESHOLD; loop; loop--) { - if (!READ_ONCE(l->locked)) - return; + if (trylock_clear_pending(lock)) + goto gotlock; cpu_relax(); } + clear_pending(lock); + if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); @@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { /* - * The lock is free and _Q_SLOW_VAL has never - * been set. Therefore we need to unhash before - * getting the lock. + * The lock was free and now we own the lock. + * Change the lock value back to _Q_LOCKED_VAL + * and unhash the table. */ + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); - return; + goto gotlock; } } + WRITE_ONCE(pn->state, vcpu_halted); + qstat_inc(qstat_pv_wait_head, true); + qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); /* * The unlocker should have freed the lock before kicking the * CPU. So if the lock is still not free, it is a spurious - * wakeup and so the vCPU should wait again after spinning for - * a while. + * wakeup or another vCPU has stolen the lock. The current + * vCPU should spin again. */ + qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* - * Lock is unlocked now; the caller will acquire it without waiting. - * As with pv_wait_node() we rely on the caller to do a load-acquire - * for us. + * The cmpxchg() or xchg() call before coming here provides the + * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL + * here is to indicate to the compiler that the value will always + * be nozero to enable better code optimization. */ +gotlock: + return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL); } /* - * PV version of the unlock function to be used in stead of - * queued_spin_unlock(). + * PV versions of the unlock fastpath and slowpath functions to be used + * instead of queued_spin_unlock(). */ -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible void +__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 locked; - - /* - * We must not unlock if SLOW, because in that case we must first - * unhash. Otherwise it would be possible to have multiple @lock - * entries, which would be BAD. - */ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); - if (likely(locked == _Q_LOCKED_VAL)) - return; if (unlikely(locked != _Q_SLOW_VAL)) { WARN(!debug_locks_silent, @@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * so we need a barrier to order the read of the node data in * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. * - * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL. */ smp_rmb(); @@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * vCPU is harmless other than the additional latency in completing * the unlock. */ + qstat_inc(qstat_pv_kick_unlock, true); pv_kick(node->cpu); } + /* * Include the architecture specific callee-save thunk of the * __pv_queued_spin_unlock(). This thunk is put together with - * __pv_queued_spin_unlock() near the top of the file to make sure - * that the callee-save thunk and the real unlock function are close - * to each other sharing consecutive instruction cachelines. 
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, architecture specific version of __pv_queued_spin_unlock() + * can be defined. */ #include <asm/qspinlock_paravirt.h> +#ifndef __pv_queued_spin_unlock +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + u8 locked; + + /* + * We must not unlock if SLOW, because in that case we must first + * unhash. Otherwise it would be possible to have multiple @lock + * entries, which would be BAD. + */ + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) + return; + + __pv_queued_spin_unlock_slowpath(lock, locked); +} +#endif /* __pv_queued_spin_unlock */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h new file mode 100644 index 000000000..640dcecdd --- /dev/null +++ b/kernel/locking/qspinlock_stat.h @@ -0,0 +1,300 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <waiman.long@hpe.com> + */ + +/* + * When queued spinlock statistical counters are enabled, the following + * debugfs files will be created for reporting the counter values: + * + * <debugfs>/qlockstat/ + * pv_hash_hops - average # of hops per hashing operation + * pv_kick_unlock - # of vCPU kicks issued at unlock time + * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake + * pv_latency_kick - average latency (ns) of vCPU kick operation + * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_stealing - # of lock stealing operations + * pv_spurious_wakeup - # of spurious wakeups + * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_wait_early - # of early vCPU wait's + * pv_wait_head - # of vCPU wait's at the queue head + * pv_wait_node - # of vCPU wait's at a non-head queue node + * + * Writing to the "reset_counters" file will reset all the above counter + * values. + * + * These statistical counters are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counters usable even in a production + * environment. + * + * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
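+ * A kick that arrives while the target vCPU is not actually
+ * sleeping in pv_wait() wakes nobody, so it is counted at unlock
+ * time but never as a wakeup.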
+ */ +enum qlock_stats { + qstat_pv_hash_hops, + qstat_pv_kick_unlock, + qstat_pv_kick_wake, + qstat_pv_latency_kick, + qstat_pv_latency_wake, + qstat_pv_lock_stealing, + qstat_pv_spurious_wakeup, + qstat_pv_wait_again, + qstat_pv_wait_early, + qstat_pv_wait_head, + qstat_pv_wait_node, + qstat_num, /* Total number of statistical counters */ + qstat_reset_cnts = qstat_num, +}; + +#ifdef CONFIG_QUEUED_LOCK_STAT +/* + * Collect pvqspinlock statistics + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/fs.h> + +static const char * const qstat_names[qstat_num + 1] = { + [qstat_pv_hash_hops] = "pv_hash_hops", + [qstat_pv_kick_unlock] = "pv_kick_unlock", + [qstat_pv_kick_wake] = "pv_kick_wake", + [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", + [qstat_pv_latency_kick] = "pv_latency_kick", + [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_stealing] = "pv_lock_stealing", + [qstat_pv_wait_again] = "pv_wait_again", + [qstat_pv_wait_early] = "pv_wait_early", + [qstat_pv_wait_head] = "pv_wait_head", + [qstat_pv_wait_node] = "pv_wait_node", + [qstat_reset_cnts] = "reset_counters", +}; + +/* + * Per-cpu counters + */ +static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); +static DEFINE_PER_CPU(u64, pv_kick_time); + +/* + * Function to read and return the qlock statistical counter values + * + * The following counters are handled specially: + * 1. qstat_pv_latency_kick + * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock + * 2. qstat_pv_latency_wake + * Average wake latency (ns) = pv_latency_wake/pv_kick_wake + * 3. qstat_pv_hash_hops + * Average hops/hash = pv_hash_hops/pv_kick_unlock + */ +static ssize_t qstat_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, counter, len; + u64 stat = 0, kicks = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + counter = (long)(file->f_inode->i_private); + + if (counter >= qstat_num) + return -EBADF; + + for_each_possible_cpu(cpu) { + stat += per_cpu(qstats[counter], cpu); + /* + * Need to sum additional counter for some of them + */ + switch (counter) { + + case qstat_pv_latency_kick: + case qstat_pv_hash_hops: + kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + break; + + case qstat_pv_latency_wake: + kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + break; + } + } + + if (counter == qstat_pv_hash_hops) { + u64 frac; + + frac = 100ULL * do_div(stat, kicks); + frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + + /* + * Return a X.XX decimal number + */ + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + } else { + /* + * Round to the nearest ns + */ + if ((counter == qstat_pv_latency_kick) || + (counter == qstat_pv_latency_wake)) { + stat = 0; + if (kicks) + stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + } + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + } + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When counter = reset_cnts, reset all the counter values. + * Since the counter updates aren't atomic, the resetting is done twice + * to make sure that the counters are very likely to be all cleared. 
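+ * (an update racing with the first pass may be written back after
+ * it; the second pass shrinks that window without fully closing
+ * it)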
+ */ +static ssize_t qstat_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(qstats, cpu); + + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_qstat = { + .read = qstat_read, + .write = qstat_write, + .llseek = default_llseek, +}; + +/* + * Initialize debugfs for the qspinlock statistical counters + */ +static int __init init_qspinlock_stat(void) +{ + struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); + int i; + + if (!d_qstat) { + pr_warn("Could not create 'qlockstat' debugfs directory\n"); + return 0; + } + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. + */ + for (i = 0; i < qstat_num; i++) + debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat); + + debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat); + return 0; +} +fs_initcall(init_qspinlock_stat); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void qstat_inc(enum qlock_stats stat, bool cond) +{ + if (cond) + this_cpu_inc(qstats[stat]); +} + +/* + * PV hash hop count + */ +static inline void qstat_hop(int hopcnt) +{ + this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); +} + +/* + * Replacement function for pv_kick() + */ +static inline void __pv_kick(int cpu) +{ + u64 start = sched_clock(); + + per_cpu(pv_kick_time, cpu) = start; + pv_kick(cpu); + this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); +} + +/* + * Replacement function for pv_wait() + */ +static inline void __pv_wait(u8 *ptr, u8 val) +{ + u64 *pkick_time = this_cpu_ptr(&pv_kick_time); + + *pkick_time = 0; + pv_wait(ptr, val); + if (*pkick_time) { + this_cpu_add(qstats[qstat_pv_latency_wake], + sched_clock() - *pkick_time); + qstat_inc(qstat_pv_kick_wake, true); + } +} + +#define pv_kick(c) __pv_kick(c) +#define pv_wait(p, v) __pv_wait(p, v) + +/* + * PV unfair trylock count tracking function + */ +static inline int qstat_spin_steal_lock(struct qspinlock *lock) +{ + int ret = pv_queued_spin_steal_lock(lock); + + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; +} +#undef queued_spin_trylock +#define queued_spin_trylock(l) qstat_spin_steal_lock(l) + +#else /* CONFIG_QUEUED_LOCK_STAT */ + +static inline void qstat_inc(enum qlock_stats stat, bool cond) { } +static inline void qstat_hop(int hopcnt) { } + +#endif /* CONFIG_QUEUED_LOCK_STAT */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8251e75dd..3e746607a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); 
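	/* Clear RT_MUTEX_HAS_WAITERS while wait_lock is still held. */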
clear_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * If a new waiter comes in between the unlock and the cmpxchg * we have two situations: @@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; } #endif @@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int ret = 0, depth = 0; struct rt_mutex *lock; bool detect_deadlock; - unsigned long flags; bool requeue = true; detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); @@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * [1] Task cannot go away as we did a get_task() before ! */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irq(&task->pi_lock); /* * [2] Get the waiter on which @task is blocked on. @@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * operations. */ if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); cpu_relax(); goto retry; } @@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * No requeue[7] here. Just release @task [8] */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * If there is no owner of the lock, end of chain. */ if (!rt_mutex_owner(lock)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* * No requeue [11] here. We just do deadlock detection. @@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* If owner is not blocked, end of chain. */ if (!next_lock) @@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. 
the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { @@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop the locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* * Make the actual exit decisions [12], based on the stored @@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); out_put_task: put_task_struct(task); @@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Try to take an rt-mutex * - * Must be called with lock->wait_lock held. + * Must be called with lock->wait_lock held and interrupts disabled * * @lock: The lock to be acquired. * @task: The task which wants to acquire the lock @@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { - unsigned long flags; - /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * case, but conditionals are more expensive than a redundant * store. */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); task->pi_blocked_on = NULL; /* * Finish the lock acquisition. @task is the new owner. If @@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); takeit: /* We got the lock. */ @@ -883,7 +882,7 @@ takeit: * * Prepare waiter and propagate pi chain * - * This must be called with lock->wait_lock held. + * This must be called with lock->wait_lock held and interrupts disabled */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *top_waiter = waiter; struct rt_mutex *next_lock; int chain_walk = 0, res; - unsigned long flags; /* * Early deadlock detection. 
We really don't want the task to @@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (owner == task) return -EDEADLK; - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); if (!owner) return 0; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); @@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Even if full deadlock detection is on, if the owner is not * blocked itself, we can avoid finding this out in the chain @@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); return res; } @@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * Remove the top waiter from the current task's pi waiter tree and * queue it up. * - * Called with lock->wait_lock held. + * Called with lock->wait_lock held and interrupts disabled. */ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - unsigned long flags; - raw_spin_lock_irqsave(&current->pi_lock, flags); + raw_spin_lock(&current->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock_irqrestore(&current->pi_lock, flags); + raw_spin_unlock(&current->pi_lock); wake_q_add(wake_q, waiter->task); } @@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, /* * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held and + * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex().
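Note that wake_q_add() above only queues the top waiter; the wakeup itself is deferred until wait_lock has been dropped. A minimal sketch of that pattern (editor's illustration with a hypothetical lock and task; WAKE_Q(), wake_q_add() and wake_up_q() are the 4.5-era API from <linux/sched.h>):

#include <linux/sched.h>
#include <linux/spinlock.h>

static void demo_unlock_and_wake(raw_spinlock_t *lock, struct task_struct *t)
{
	WAKE_Q(wake_q);			/* on-stack queue of tasks to wake */

	raw_spin_lock_irq(lock);
	wake_q_add(&wake_q, t);		/* takes a task reference, no wakeup yet */
	raw_spin_unlock_irq(lock);

	wake_up_q(&wake_q);		/* wake and release the queued tasks */
}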
*/ static void remove_waiter(struct rt_mutex *lock, @@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock, bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; - unsigned long flags; - raw_spin_lock_irqsave(&current->pi_lock, flags); + raw_spin_lock(&current->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(&current->pi_lock, flags); + raw_spin_unlock(&current->pi_lock); /* * Only update priority if the waiter was the highest priority @@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock, if (!owner || !is_top_waiter) return; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); rt_mutex_dequeue_pi(owner, waiter); @@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Don't walk the chain, if the owner task is not blocked @@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, next_lock, NULL, current); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); } /* @@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task) * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * - * lock->wait_lock must be held by the caller. + * Must be called with lock->wait_lock held and interrupts disabled */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, @@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); schedule(); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); } @@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; + unsigned long flags; int ret = 0; debug_rt_mutex_init_waiter(&waiter); RB_CLEAR_NODE(&waiter.pi_tree_entry); RB_CLEAR_NODE(&waiter.tree_entry); - raw_spin_lock(&lock->wait_lock); + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants.
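To make the distinction concrete, a brief sketch of the two locking flavors the comment is contrasting (editor's illustration; demo() is hypothetical):

#include <linux/spinlock.h>

static void demo(raw_spinlock_t *lock)
{
	unsigned long flags;

	raw_spin_lock_irq(lock);		/* correct only if IRQs were on... */
	raw_spin_unlock_irq(lock);		/* ...since unlock re-enables them */

	raw_spin_lock_irqsave(lock, flags);	/* records the current IRQ state */
	raw_spin_unlock_irqrestore(lock, flags);	/* and restores it, on or off */
}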
+ */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } @@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret; /* @@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) return 0; /* - * The mutex has currently no owner. Lock the wait lock and - * try to acquire the lock. + * The mutex has currently no owner. Lock the wait lock and try to + * acquire the lock. We use irqsave here to support early boot calls. */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + + /* irqsave required to support early boot calls */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ - if (unlock_rt_mutex_safe(lock) == true) + if (unlock_rt_mutex_safe(lock, flags) == true) return false; /* Relock the rtmutex and try again */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } /* @@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* check PI boosting */ return true; @@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 1; } @@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return ret; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 25ced161e..6cf54615a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -10,8 +10,11 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * General Public License for more details. */ +#include <linux/radix-tree.h> +#include <linux/memremap.h> #include <linux/device.h> #include <linux/types.h> +#include <linux/pfn_t.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> @@ -26,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) static void *try_ram_remap(resource_size_t offset, size_t size) { - struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (!PageHighMem(page)) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); return NULL; /* fallback to ioremap_cache */ } @@ -149,25 +152,134 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); +pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) +{ + return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); +} +EXPORT_SYMBOL(phys_to_pfn_t); + #ifdef CONFIG_ZONE_DEVICE +static DEFINE_MUTEX(pgmap_lock); +static RADIX_TREE(pgmap_radix, GFP_KERNEL); +#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) +#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) + struct page_map { struct resource res; + struct percpu_ref *ref; + struct dev_pagemap pgmap; + struct vmem_altmap altmap; }; -static void devm_memremap_pages_release(struct device *dev, void *res) +void get_zone_device_page(struct page *page) +{ + percpu_ref_get(page->pgmap->ref); +} +EXPORT_SYMBOL(get_zone_device_page); + +void put_zone_device_page(struct page *page) +{ + put_dev_pagemap(page->pgmap); +} +EXPORT_SYMBOL(put_zone_device_page); + +static void pgmap_radix_release(struct resource *res) +{ + resource_size_t key, align_start, align_size, align_end; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; + + mutex_lock(&pgmap_lock); + for (key = res->start; key <= res->end; key += SECTION_SIZE) + radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&pgmap_lock); +} + +static unsigned long pfn_first(struct page_map *page_map) +{ + struct dev_pagemap *pgmap = &page_map->pgmap; + const struct resource *res = &page_map->res; + struct vmem_altmap *altmap = pgmap->altmap; + unsigned long pfn; + + pfn = res->start >> PAGE_SHIFT; + if (altmap) + pfn += vmem_altmap_offset(altmap); + return pfn; +} + +static unsigned long pfn_end(struct page_map *page_map) { - struct page_map *page_map = res; + const struct resource *res = &page_map->res; + + return (res->start + resource_size(res)) >> PAGE_SHIFT; +} + +#define for_each_device_pfn(pfn, map) \ + for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) + +static void devm_memremap_pages_release(struct device *dev, void *data) +{ + struct page_map *page_map = data; + struct resource *res = &page_map->res; + resource_size_t align_start, align_size; + struct dev_pagemap *pgmap = &page_map->pgmap; + + if (percpu_ref_tryget_live(pgmap->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(pgmap->ref); + } /* pages are dead and unused, undo the arch mapping */ - arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + arch_remove_memory(align_start, align_size); + pgmap_radix_release(res); + dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, + "%s: failed to free all reserved pages\n", 
__func__); } -void *devm_memremap_pages(struct device *dev, struct resource *res) +/* assumes rcu_read_lock() held at entry */ +struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { - int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); struct page_map *page_map; - int error, nid; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); + return page_map ? &page_map->pgmap : NULL; +} + +/** + * devm_memremap_pages - remap and provide memmap backing for the given resource + * @dev: hosting device for @res + * @res: "host memory" address range + * @ref: a live per-cpu reference count + * @altmap: optional descriptor for allocating the memmap from @res + * + * Notes: + * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time + * (or devm release event). + * + * 2/ @res is expected to be a host memory range that could feasibly be + * treated as a "System RAM" range, i.e. not a device mmio range, but + * this is not enforced. + */ +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct percpu_ref *ref, struct vmem_altmap *altmap) +{ + resource_size_t key, align_start, align_size, align_end; + struct dev_pagemap *pgmap; + struct page_map *page_map; + int error, nid, is_ram; + unsigned long pfn; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; + is_ram = region_intersects(align_start, align_size, "System RAM"); if (is_ram == REGION_MIXED) { WARN_ONCE(1, "%s attempted on mixed region %pr\n", @@ -178,25 +290,124 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (is_ram == REGION_INTERSECTS) return __va(res->start); + if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { + dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", + __func__); + return ERR_PTR(-ENXIO); + } + + if (!ref) + return ERR_PTR(-EINVAL); + page_map = devres_alloc_node(devm_memremap_pages_release, sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) return ERR_PTR(-ENOMEM); + pgmap = &page_map->pgmap; memcpy(&page_map->res, res, sizeof(*res)); + pgmap->dev = dev; + if (altmap) { + memcpy(&page_map->altmap, altmap, sizeof(*altmap)); + pgmap->altmap = &page_map->altmap; + } + pgmap->ref = ref; + pgmap->res = &page_map->res; + + mutex_lock(&pgmap_lock); + error = 0; + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += SECTION_SIZE) { + struct dev_pagemap *dup; + + rcu_read_lock(); + dup = find_dev_pagemap(key); + rcu_read_unlock(); + if (dup) { + dev_err(dev, "%s: %pr collides with mapping for %s\n", + __func__, res, dev_name(dup->dev)); + error = -EBUSY; + break; + } + error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, + page_map); + if (error) { + dev_err(dev, "%s: failed: %d\n", __func__, error); + break; + } + } + mutex_unlock(&pgmap_lock); + if (error) + goto err_radix; + nid = dev_to_node(dev); if (nid < 0) nid = numa_mem_id(); - error = arch_add_memory(nid, res->start, resource_size(res), true); - if (error) { - devres_free(page_map); - return ERR_PTR(error); - } + error = arch_add_memory(nid, align_start, align_size, true); + if (error) + goto err_add_memory; + for_each_device_pfn(pfn, page_map) { + struct page *page = pfn_to_page(pfn); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer. It is a bug if a ZONE_DEVICE page is ever + * freed or placed on a driver-private list. 
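For a caller's view of the new interface (editor's sketch; demo_map() is hypothetical, modeled on how a pmem-style driver uses it, and per the kernel-doc above @ref must already be live):

#include <linux/device.h>
#include <linux/memremap.h>
#include <linux/percpu-refcount.h>

static void *demo_map(struct device *dev, struct resource *res,
		      struct percpu_ref *ref)
{
	/* NULL altmap: the memmap pages come from regular memory */
	return devm_memremap_pages(dev, res, ref, NULL);
}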
Seed the + * storage with LIST_POISON* values. + */ + list_del(&page->lru); + page->pgmap = pgmap; + } devres_add(dev, page_map); return __va(res->start); + + err_add_memory: + err_radix: + pgmap_radix_release(res); + devres_free(page_map); + return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + return altmap->reserve + altmap->free; +} + +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + /* + * 'memmap_start' is the virtual address for the first "struct + * page" in this range of the vmemmap array. In the case of + * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * pointer arithmetic, so we can perform this to_vmem_altmap() + * conversion without concern for the initialization state of + * the struct page fields. + */ + struct page *page = (struct page *) memmap_start; + struct dev_pagemap *pgmap; + + /* + * Uncoditionally retrieve a dev_pagemap associated with the + * given physical address, this is only for use in the + * arch_{add|remove}_memory() for setting up and tearing down + * the memmap. + */ + rcu_read_lock(); + pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); + rcu_read_unlock(); + + return pgmap ? pgmap->altmap : NULL; +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module.c b/kernel/module.c index 0e5c71195..794ebe8e8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -80,15 +80,6 @@ # define debug_align(X) (X) #endif -/* - * Given BASE and SIZE this macro calculates the number of pages the - * memory regions occupies - */ -#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ - (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ - PFN_DOWN((unsigned long)BASE) + 1) \ - : (0UL)) - /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -108,13 +99,6 @@ static LIST_HEAD(modules); * Use a latched RB-tree for __module_address(); this allows us to use * RCU-sched lookups of the address from any context. * - * Because modules have two address ranges: init and core, we need two - * latch_tree_nodes entries. Therefore we need the back-pointer from - * mod_tree_node. - * - * Because init ranges are short lived we mark them unlikely and have placed - * them outside the critical cacheline in struct module. - * * This is conditional on PERF_EVENTS || TRACING because those can really hit * __module_address() hard by doing a lot of stack unwinding; potentially from * NMI context. 
@@ -122,24 +106,16 @@ static LIST_HEAD(modules); static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->module_init; - - return (unsigned long)mod->module_core; + return (unsigned long)layout->base; } static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; - - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->init_size; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - return (unsigned long)mod->core_size; + return (unsigned long)layout->size; } static __always_inline bool @@ -197,23 +173,23 @@ static void __mod_tree_remove(struct mod_tree_node *node) */ static void mod_tree_insert(struct module *mod) { - mod->mtn_core.mod = mod; - mod->mtn_init.mod = mod; + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; - __mod_tree_insert(&mod->mtn_core); - if (mod->init_size) - __mod_tree_insert(&mod->mtn_init); + __mod_tree_insert(&mod->core_layout.mtn); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn); } static void mod_tree_remove_init(struct module *mod) { - if (mod->init_size) - __mod_tree_remove(&mod->mtn_init); + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn); } static void mod_tree_remove(struct module *mod) { - __mod_tree_remove(&mod->mtn_core); + __mod_tree_remove(&mod->core_layout.mtn); mod_tree_remove_init(mod); } @@ -267,9 +243,9 @@ static void __mod_update_bounds(void *base, unsigned int size) static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->module_core, mod->core_size); - if (mod->init_size) - __mod_update_bounds(mod->module_init, mod->init_size); + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + if (mod->init_layout.size) + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); } #ifdef CONFIG_KGDB_KDB @@ -1008,6 +984,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); + async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ @@ -1217,7 +1195,7 @@ struct module_attribute module_uevent = static ssize_t show_coresize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->core_size); + return sprintf(buffer, "%u\n", mk->mod->core_layout.size); } static struct module_attribute modinfo_coresize = @@ -1226,7 +1204,7 @@ static struct module_attribute modinfo_coresize = static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->init_size); + return sprintf(buffer, "%u\n", mk->mod->init_layout.size); } static struct module_attribute modinfo_initsize = @@ -1876,64 +1854,75 @@ static void mod_sysfs_teardown(struct module *mod) /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. 
+ * + * General layout of module is: + * [text] [read-only-data] [writable data] + * text_size -----^ ^ ^ + * ro_size ------------------------| | + * size -------------------------------------------| + * + * These values are always page-aligned (as is base) */ -void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - unsigned long begin_pfn = PFN_DOWN((unsigned long)start); - unsigned long end_pfn = PFN_DOWN((unsigned long)end); + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base, + layout->text_size >> PAGE_SHIFT); +} - if (end_pfn > begin_pfn) - set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } -static void set_section_ro_nx(void *base, - unsigned long text_size, - unsigned long ro_size, - unsigned long total_size) +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - /* begin and end PFNs of the current subsection */ - unsigned long begin_pfn; - unsigned long end_pfn; + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->size - layout->ro_size) >> PAGE_SHIFT); +} - /* - * Set RO for module text and RO-data: - * - Always protect first page. - * - Do not protect last partial page. - */ - if (ro_size > 0) - set_page_attributes(base, base + ro_size, set_memory_ro); +/* livepatching wants to disable read-only so it can frob module. */ +void module_disable_ro(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); + frob_rodata(&mod->init_layout, set_memory_rw); +} - /* - * Set NX permissions for module data: - * - Do not protect first partial page. - * - Always protect last page. 
- */ - if (total_size > text_size) { - begin_pfn = PFN_UP((unsigned long)base + text_size); - end_pfn = PFN_UP((unsigned long)base + total_size); - if (end_pfn > begin_pfn) - set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); - } +void module_enable_ro(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_ro); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); } -static void unset_module_core_ro_nx(struct module *mod) +static void module_enable_nx(const struct module *mod) { - set_page_attributes(mod->module_core + mod->core_text_size, - mod->module_core + mod->core_size, - set_memory_x); - set_page_attributes(mod->module_core, - mod->module_core + mod->core_ro_size, - set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_nx); + frob_writable_data(&mod->core_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); } -static void unset_module_init_ro_nx(struct module *mod) +static void module_disable_nx(const struct module *mod) { - set_page_attributes(mod->module_init + mod->init_text_size, - mod->module_init + mod->init_size, - set_memory_x); - set_page_attributes(mod->module_init, - mod->module_init + mod->init_ro_size, - set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_x); + frob_writable_data(&mod->core_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_x); + frob_writable_data(&mod->init_layout, set_memory_x); } /* Iterate through all modules and set each module's text as RW */ @@ -1945,16 +1934,9 @@ void set_all_modules_text_rw(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_rw); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_rw); - } + + frob_text(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); } mutex_unlock(&module_mutex); } @@ -1968,23 +1950,25 @@ void set_all_modules_text_ro(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_ro); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_ro); - } + + frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); } mutex_unlock(&module_mutex); } + +static void disable_ro_nx(const struct module_layout *layout) +{ + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_rodata(layout, set_memory_x); + frob_writable_data(layout, set_memory_x); +} + #else -static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } -static void unset_module_core_ro_nx(struct module *mod) { } -static void unset_module_init_ro_nx(struct module *mod) { } +static void disable_ro_nx(const struct module_layout *layout) { } +static void module_enable_nx(const struct module *mod) { } +static void module_disable_nx(const struct module *mod) { } #endif void __weak module_memfree(void 
*module_region) @@ -2036,19 +2020,19 @@ static void free_module(struct module *mod) synchronize_sched(); mutex_unlock(&module_mutex); - /* This may be NULL, but that's OK */ - unset_module_init_ro_nx(mod); + /* This may be empty, but that's OK */ + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); - module_memfree(mod->module_init); + module_memfree(mod->init_layout.base); kfree(mod->args); percpu_modfree(mod); /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - unset_module_core_ro_nx(mod); - module_memfree(mod->module_core); + disable_ro_nx(&mod->core_layout); + module_memfree(mod->core_layout.base); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2251,20 +2235,20 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || strstarts(sname, ".init")) continue; - s->sh_entsize = get_offset(mod, &mod->core_size, s, i); + s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->core_size = debug_align(mod->core_size); - mod->core_text_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_size = debug_align(mod->core_size); - mod->core_ro_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_size = mod->core_layout.size; break; case 3: /* whole core */ - mod->core_size = debug_align(mod->core_size); + mod->core_layout.size = debug_align(mod->core_layout.size); break; } } @@ -2280,21 +2264,21 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || !strstarts(sname, ".init")) continue; - s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) + s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->init_size = debug_align(mod->init_size); - mod->init_text_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.text_size = mod->init_layout.size; break; case 1: /* RO: text and ro-data */ - mod->init_size = debug_align(mod->init_size); - mod->init_ro_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.ro_size = mod->init_layout.size; break; case 3: /* whole init */ - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = debug_align(mod->init_layout.size); break; } } @@ -2404,7 +2388,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } if (sym->st_shndx == SHN_UNDEF) return 'U'; - if (sym->st_shndx == SHN_ABS) + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) return 'a'; if (sym->st_shndx >= SHN_LORESERVE) return '?'; @@ -2433,7 +2417,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum, unsigned int pcpundx) { const Elf_Shdr *sec; @@ -2442,6 +2426,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, || !src->st_name) return false; +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + 
return true; +#endif + sec = sechdrs + src->st_shndx; if (!(sec->sh_flags & SHF_ALLOC) #ifndef CONFIG_KALLSYMS_ALL @@ -2469,7 +2458,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, info->index.sym) | INIT_OFFSET_MASK; pr_debug("\t%s\n", info->secstrings + symsect->sh_name); @@ -2479,30 +2468,31 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_size += strtab_size; - mod->core_size = debug_align(mod->core_size); + info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_layout.size += strtab_size; + mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; pr_debug("\t%s\n", info->secstrings + strsect->sh_name); /* We'll tack temporary mod_kallsyms on the end. */ - mod->init_size = ALIGN(mod->init_size, - __alignof__(struct mod_kallsyms)); - info->mod_kallsyms_init_off = mod->init_size; - mod->init_size += sizeof(struct mod_kallsyms); - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + mod->init_layout.size = debug_align(mod->init_layout.size); } /* @@ -2519,7 +2509,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; /* Set up to point into init section. */ - mod->kallsyms = mod->module_init + info->mod_kallsyms_init_off; + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; mod->kallsyms->symtab = (void *)symsec->sh_addr; mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); @@ -2532,12 +2522,13 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) = elf_type(&mod->kallsyms->symtab[i], info); /* Now populate the cut down core kallsyms for after init. 
*/ - mod->core_kallsyms.symtab = dst = mod->module_core + info->symoffs; - mod->core_kallsyms.strtab = s = mod->module_core + info->stroffs; + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], @@ -2983,7 +2974,7 @@ static int move_module(struct module *mod, struct load_info *info) void *ptr; /* Do the allocs. */ - ptr = module_alloc(mod->core_size); + ptr = module_alloc(mod->core_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2993,11 +2984,11 @@ static int move_module(struct module *mod, struct load_info *info) if (!ptr) return -ENOMEM; - memset(ptr, 0, mod->core_size); - mod->module_core = ptr; + memset(ptr, 0, mod->core_layout.size); + mod->core_layout.base = ptr; - if (mod->init_size) { - ptr = module_alloc(mod->init_size); + if (mod->init_layout.size) { + ptr = module_alloc(mod->init_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. This block doesn't need to be @@ -3006,13 +2997,13 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_memfree(mod->module_core); + module_memfree(mod->core_layout.base); return -ENOMEM; } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + memset(ptr, 0, mod->init_layout.size); + mod->init_layout.base = ptr; } else - mod->module_init = NULL; + mod->init_layout.base = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -3024,10 +3015,10 @@ static int move_module(struct module *mod, struct load_info *info) continue; if (shdr->sh_entsize & INIT_OFFSET_MASK) - dest = mod->module_init + dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + shdr->sh_entsize; + dest = mod->core_layout.base + shdr->sh_entsize; if (shdr->sh_type != SHT_NOBITS) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); @@ -3089,12 +3080,12 @@ static void flush_module_icache(const struct module *mod) * Do it before processing of module parameters, so the module * can provide parameter accessor functions of its own. 
*/ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); + if (mod->init_layout.base) + flush_icache_range((unsigned long)mod->init_layout.base, + (unsigned long)mod->init_layout.base + + mod->init_layout.size); + flush_icache_range((unsigned long)mod->core_layout.base, + (unsigned long)mod->core_layout.base + mod->core_layout.size); set_fs(old_fs); } @@ -3152,8 +3143,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); - module_memfree(mod->module_core); + module_memfree(mod->init_layout.base); + module_memfree(mod->core_layout.base); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3240,7 +3231,7 @@ static noinline int do_init_module(struct module *mod) ret = -ENOMEM; goto fail; } - freeinit->module_init = mod->module_init; + freeinit->module_init = mod->init_layout.base; /* * We want to find out whether @mod uses async during init. Clear @@ -3297,12 +3288,12 @@ static noinline int do_init_module(struct module *mod) rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif mod_tree_remove_init(mod); - unset_module_init_ro_nx(mod); + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; + mod->init_layout.base = NULL; + mod->init_layout.size = 0; + mod->init_layout.ro_size = 0; + mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we @@ -3324,6 +3315,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); return ret; @@ -3391,23 +3383,16 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); + /* Set RO and NX regions */ + module_enable_ro(mod); + module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. 
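The notifier chain fired a few lines below is how other subsystems observe these state transitions; a minimal consumer sketch (demo_* names are hypothetical, register_module_notifier() is the real API):

#include <linux/module.h>
#include <linux/notifier.h>

static int demo_module_cb(struct notifier_block *nb, unsigned long state,
			  void *data)
{
	struct module *mod = data;

	if (state == MODULE_STATE_COMING)	/* fully formed, init about to run */
		pr_info("%s: coming\n", mod->name);
	else if (state == MODULE_STATE_GOING)	/* init failed or being unloaded */
		pr_info("%s: going\n", mod->name);
	return NOTIFY_OK;
}

static struct notifier_block demo_module_nb = {
	.notifier_call = demo_module_cb,
};

/* from some subsystem init: register_module_notifier(&demo_module_nb); */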
*/ mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); + ftrace_module_enable(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; @@ -3566,8 +3551,8 @@ static int load_module(struct load_info *info, const char __user *uargs, MODULE_STATE_GOING, mod); /* we can't deallocate the module until we clear memory protection */ - unset_module_init_ro_nx(mod); - unset_module_core_ro_nx(mod); + module_disable_ro(mod); + module_disable_nx(mod); ddebug_cleanup: dynamic_debug_remove(info->debug); @@ -3596,7 +3581,7 @@ static int load_module(struct load_info *info, const char __user *uargs, */ ftrace_release_mod(mod); /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); module_deallocate(mod, info); free_copy: @@ -3680,9 +3665,9 @@ static const char *get_ksymbol(struct module *mod, /* At worst, next value is at end of module */ if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->module_init+mod->init_text_size; + nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; else - nextval = (unsigned long)mod->module_core+mod->core_text_size; + nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ @@ -3935,7 +3920,7 @@ static int m_show(struct seq_file *m, void *p) return 0; seq_printf(m, "%s %u", - mod->name, mod->init_size + mod->core_size); + mod->name, mod->init_layout.size + mod->core_layout.size); print_unload_info(m, mod); /* Informative for users. */ @@ -3944,7 +3929,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->module_core); + seq_printf(m, " 0x%pK", mod->core_layout.base); /* Taints info */ if (mod->taints) @@ -4087,8 +4072,8 @@ struct module *__module_text_address(unsigned long addr) struct module *mod = __module_address(addr); if (mod) { /* Make sure it's within the text section. */ - if (!within(addr, mod->module_init, mod->init_text_size) - && !within(addr, mod->module_core, mod->core_text_size)) + if (!within(addr, mod->init_layout.base, mod->init_layout.text_size) + && !within(addr, mod->core_layout.base, mod->core_layout.text_size)) mod = NULL; } return mod; } diff --git a/kernel/panic.c b/kernel/panic.c index 41e2b54f3..d96469de7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -61,6 +61,17 @@ void __weak panic_smp_self_stop(void) cpu_relax(); } +/* + * Stop ourselves in NMI context if another CPU has already panicked. Arch code + * may override this to prepare for crash dumping, e.g. save regs info. + */ +void __weak nmi_panic_self_stop(struct pt_regs *regs) +{ + panic_smp_self_stop(); +} + +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); + /** * panic - halt the system * @fmt: The text string to print * @@ -71,17 +82,17 @@ void __weak panic_smp_self_stop(void) */ void panic(const char *fmt, ...) { - static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; int state = 0; + int old_cpu, this_cpu; /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since * there is nothing to prevent an interrupt handler (that runs - * after the panic_lock is acquired) from invoking panic again.
+ * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); @@ -94,8 +105,16 @@ void panic(const char *fmt, ...) * multiple parallel invocations of panic, all other CPUs either * stop themself or will wait until they are stopped by the 1st CPU * with smp_send_stop(). + * + * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which + * comes here, so go ahead. + * `old_cpu == this_cpu' means we came from nmi_panic() which sets + * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - if (!spin_trylock(&panic_lock)) + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + + if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) panic_smp_self_stop(); console_verbose(); @@ -117,9 +136,11 @@ void panic(const char *fmt, ...) * everything else. * If we want to run this after calling panic_notifiers, pass * the "crash_kexec_post_notifiers" option to the kernel. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -142,9 +163,11 @@ void panic(const char *fmt, ...) * panic_notifiers and dumping kmsg before kdump. * Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); bust_spinlocks(0); diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80..4d73a834c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -588,7 +588,7 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - /* Veryify no one has done anything silly */ + /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); /* bump default and minimum pid_max based on number of cpus */ @@ -604,5 +604,5 @@ void __init pidmap_init(void) atomic_dec(&init_pid_ns.pidmap[0].nr_free); init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9e2ee0cb1..68d3ebc12 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -101,284 +101,6 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. -menuconfig TOI_CORE - bool "Enhanced Hibernation (TuxOnIce)" - depends on HIBERNATION - default y - ---help--- - TuxOnIce is the 'new and improved' suspend support. - - See the TuxOnIce home page (tuxonice.net) - for FAQs, HOWTOs and other documentation. - - comment "Image Storage (you need at least one allocator)" - depends on TOI_CORE - - config TOI_FILE - bool "File Allocator" - depends on TOI_CORE - default y - ---help--- - This option enables support for storing an image in a - simple file. You might want this if your swap is - sometimes full enough that you don't have enough spare - space to store an image. - - config TOI_SWAP - bool "Swap Allocator" - depends on TOI_CORE && SWAP - default y - ---help--- - This option enables support for storing an image in your - swap space. - - comment "General Options" - depends on TOI_CORE - - config TOI_PRUNE - bool "Image pruning support" - depends on TOI_CORE && CRYPTO && BROKEN - default y - ---help--- - This option adds support for using cryptoapi hashing - algorithms to identify pages with the same content. 
We - then write a much smaller pointer to the first copy of - the data instead of a complete (perhaps compressed) - additional copy. - - You probably want this, so say Y here. - - comment "No image pruning support available without Cryptoapi support." - depends on TOI_CORE && !CRYPTO - - config TOI_CRYPTO - bool "Compression support" - depends on TOI_CORE && CRYPTO - default y - ---help--- - This option adds support for using cryptoapi compression - algorithms. Compression is particularly useful as it can - more than double your suspend and resume speed (depending - upon how well your image compresses). - - You probably want this, so say Y here. - - comment "No compression support available without Cryptoapi support." - depends on TOI_CORE && !CRYPTO - - config TOI_USERUI - bool "Userspace User Interface support" - depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE) - default y - ---help--- - This option enabled support for a userspace based user interface - to TuxOnIce, which allows you to have a nice display while suspending - and resuming, and also enables features such as pressing escape to - cancel a cycle or interactive debugging. - - config TOI_USERUI_DEFAULT_PATH - string "Default userui program location" - default "/usr/local/sbin/tuxoniceui_text" - depends on TOI_USERUI - ---help--- - This entry allows you to specify a default path to the userui binary. - - config TOI_DEFAULT_IMAGE_SIZE_LIMIT - int "Default image size limit" - range -2 65536 - default "-2" - depends on TOI_CORE - ---help--- - This entry allows you to specify a default image size limit. It can - be overridden at run-time using /sys/power/tuxonice/image_size_limit. - - config TOI_KEEP_IMAGE - bool "Allow Keep Image Mode" - depends on TOI_CORE - ---help--- - This option allows you to keep and image and reuse it. It is intended - __ONLY__ for use with systems where all filesystems are mounted read- - only (kiosks, for example). To use it, compile this option in and boot - normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend. - When you resume, the image will not be removed. You will be unable to turn - off swap partitions (assuming you are using the swap allocator), but future - suspends simply do a power-down. The image can be updated using the - kernel command line parameter suspend_act= to turn off the keep image - bit. Keep image mode is a little less user friendly on purpose - it - should not be used without thought! - - config TOI_INCREMENTAL - bool "Incremental Image Support" - depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE - default n - ---help--- - This option enables the work in progress toward using the dirty page - tracking to record changes to pages. It is hoped that - this will be an initial step toward implementing storing just - the differences between consecutive images, which will - increase the amount of storage needed for the image, but also - increase the speed at which writing an image occurs and - reduce the wear and tear on drives. - - At the moment, all that is implemented is the first step of keeping - an existing image and then comparing it to the contents in memory - (by setting /sys/power/tuxonice/verify_image to 1 and triggering a - (fake) resume) to see what the page change tracking should find to be - different. If you have verify_image set to 1, TuxOnIce will automatically - invalidate the old image when you next try to hibernate, so there's no - greater chance of disk corruption than normal. 
- - comment "No incremental image support available without Keep Image support." - depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT - - config TOI_REPLACE_SWSUSP - bool "Replace swsusp by default" - default y - depends on TOI_CORE - ---help--- - TuxOnIce can replace swsusp. This option makes that the default state, - requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want - to use the vanilla kernel functionality. Note that your initrd/ramfs will - need to do this before trying to resume, too. - With overriding swsusp enabled, echoing disk to /sys/power/state will - start a TuxOnIce cycle. If resume= doesn't specify an allocator and both - the swap and file allocators are compiled in, the swap allocator will be - used by default. - - config TOI_IGNORE_LATE_INITCALL - bool "Wait for initrd/ramfs to run, by default" - default n - depends on TOI_CORE - ---help--- - When booting, TuxOnIce can check for an image and start to resume prior - to any initrd/ramfs running (via a late initcall). - - If you don't have an initrd/ramfs, this is what you want to happen - - otherwise you won't be able to safely resume. You should set this option - to 'No'. - - If, however, you want your initrd/ramfs to run anyway before resuming, - you need to tell TuxOnIce to ignore that earlier opportunity to resume. - This can be done either by using this compile time option, or by - overriding this option with the boot-time parameter toi_initramfs_resume_only=1. - - Note that if TuxOnIce can't resume at the earlier opportunity, the - value of this option won't matter - the initramfs/initrd (if any) will - run anyway. - - menuconfig TOI_CLUSTER - bool "Cluster support" - default n - depends on TOI_CORE && NET && BROKEN - ---help--- - Support for linking multiple machines in a cluster so that they suspend - and resume together. - - config TOI_DEFAULT_CLUSTER_INTERFACE - string "Default cluster interface" - depends on TOI_CLUSTER - ---help--- - The default interface on which to communicate with other nodes in - the cluster. - - If no value is set here, cluster support will be disabled by default. - - config TOI_DEFAULT_CLUSTER_KEY - string "Default cluster key" - default "Default" - depends on TOI_CLUSTER - ---help--- - The default key used by this node. All nodes in the same cluster - have the same key. Multiple clusters may coexist on the same lan - by using different values for this key. - - config TOI_CLUSTER_IMAGE_TIMEOUT - int "Timeout when checking for image" - default 15 - depends on TOI_CLUSTER - ---help--- - Timeout (seconds) before continuing to boot when waiting to see - whether other nodes might have an image. Set to -1 to wait - indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue - booting sooner than this timeout. - - config TOI_CLUSTER_WAIT_UNTIL_NODES - int "Nodes without image before continuing" - default 0 - depends on TOI_CLUSTER - ---help--- - When booting and no image is found, we wait to see if other nodes - have an image before continuing to boot. This value lets us - continue after seeing a certain number of nodes without an image, - instead of continuing to wait for the timeout. Set to 0 to only - use the timeout. - - config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE - string "Default pre-hibernate script" - depends on TOI_CLUSTER - ---help--- - The default script to be called when starting to hibernate. 
- - config TOI_DEFAULT_CLUSTER_POST_HIBERNATE - string "Default post-hibernate script" - depends on TOI_CLUSTER - ---help--- - The default script to be called after resuming from hibernation. - - config TOI_DEFAULT_WAIT - int "Default waiting time for emergency boot messages" - default "25" - range -1 32768 - depends on TOI_CORE - help - TuxOnIce can display warnings very early in the process of resuming, - if (for example) it appears that you have booted a kernel that doesn't - match an image on disk. It can then give you the opportunity to either - continue booting that kernel, or reboot the machine. This option can be - used to control how long to wait in such circumstances. -1 means wait - forever. 0 means don't wait at all (do the default action, which will - generally be to continue booting and remove the image). Values of 1 or - more indicate a number of seconds (up to 255) to wait before doing the - default. - - config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE - int "Default extra pages allowance" - default "2000" - range 500 32768 - depends on TOI_CORE - help - This value controls the default for the allowance TuxOnIce makes for - drivers to allocate extra memory during the atomic copy. The default - value of 2000 will be okay in most cases. If you are using - DRI, the easiest way to find what value to use is to try to hibernate - and look at how many pages were actually needed in the sysfs entry - /sys/power/tuxonice/debug_info (first number on the last line), adding - a little extra because the value is not always the same. - - config TOI_CHECKSUM - bool "Checksum pageset2" - default n - depends on TOI_CORE - select CRYPTO - select CRYPTO_ALGAPI - select CRYPTO_MD4 - ---help--- - Adds support for checksumming pageset2 pages, to ensure you really get an - atomic copy. Since some filesystems (XFS especially) change metadata even - when there's no other activity, we need this to check for pages that have - been changed while we were saving the page cache. If your debugging output - always says no pages were resaved, you may be able to safely disable this - option. - -config TOI - bool - depends on TOI_CORE!=n - default y - -config TOI_ZRAM_SUPPORT - def_bool y - depends on TOI && ZRAM!=n - config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS @@ -513,7 +235,7 @@ config PM_TRACE_RTC config APM_EMULATION tristate "Advanced Power Management Emulation" - depends on PM && SYS_SUPPORTS_APM_EMULATION + depends on SYS_SUPPORTS_APM_EMULATION help APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 82c4795e8..cb880a14c 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,38 +1,6 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -tuxonice_core-y := tuxonice_modules.o - -obj-$(CONFIG_TOI) += tuxonice_builtin.o -obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o \ - tuxonice_copy_before_write.o - -tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o - -# Compile these in after allocation debugging, if used. 
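The TOI_CHECKSUM option above selects CRYPTO_MD4 and checksums pageset2 pages so that pages modified during the save (XFS metadata being the cited culprit) can be detected and resaved. As a hedged sketch of how a single page could be digested with the kernel crypto API of this era (illustrative only; checksum_page() is an invented name, not the TuxOnIce code):

#include <crypto/hash.h>
#include <linux/highmem.h>
#include <linux/err.h>

/*
 * Hypothetical sketch: MD4-digest one page so that a later pass can
 * detect modification. @out must hold 16 bytes (the MD4 digest size).
 */
static int checksum_page(struct page *page, u8 *out)
{
	struct crypto_shash *tfm = crypto_alloc_shash("md4", 0, 0);
	void *vaddr;
	int ret;

	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = 0;	/* field present in 4.5-era kernels */
		vaddr = kmap(page);	/* process context, so kmap() is fine */
		ret = crypto_shash_digest(desc, vaddr, PAGE_SIZE, out);
		kunmap(page);
	}

	crypto_free_shash(tfm);
	return ret;
}

A page whose digest differs between the save pass and the atomic copy would need to be resaved, which is what the "no pages were resaved" debug output mentioned in the help text reports on.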
- -tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \ - tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \ - tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \ - tuxonice_power_off.o tuxonice_atomic_copy.o - -tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o - -tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o - -obj-$(CONFIG_TOI_CORE) += tuxonice_core.o -obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o -obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o - -tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \ - tuxonice_bio_signature.o - -obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o -obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o -obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o - -obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o - obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 153e51db5..b7342a24f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -31,7 +31,7 @@ #include <linux/ktime.h> #include <trace/events/power.h> -#include "tuxonice.h" +#include "power.h" static int nocompress; @@ -39,7 +39,7 @@ static int noresume; static int nohibernate; static int resume_wait; static unsigned int resume_delay; -char resume_file[256] = CONFIG_PM_STD_PARTITION; +static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; __visible int in_suspend __nosavedata; @@ -123,7 +123,7 @@ static int hibernation_test(int level) { return 0; } * platform_begin - Call platform to start hibernation. * @platform_mode: Whether or not to use the platform driver. */ -int platform_begin(int platform_mode) +static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->begin() : 0; @@ -133,7 +133,7 @@ int platform_begin(int platform_mode) * platform_end - Call platform to finish transition to the working state. * @platform_mode: Whether or not to use the platform driver. */ -void platform_end(int platform_mode) +static void platform_end(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->end(); @@ -147,7 +147,7 @@ void platform_end(int platform_mode) * if so configured, and return an error code if that fails. */ -int platform_pre_snapshot(int platform_mode) +static int platform_pre_snapshot(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_snapshot() : 0; @@ -162,7 +162,7 @@ int platform_pre_snapshot(int platform_mode) * * This routine is called on one CPU with interrupts disabled. */ -void platform_leave(int platform_mode) +static void platform_leave(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->leave(); @@ -177,7 +177,7 @@ void platform_leave(int platform_mode) * * This routine must be called after platform_prepare(). */ -void platform_finish(int platform_mode) +static void platform_finish(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->finish(); @@ -193,7 +193,7 @@ void platform_finish(int platform_mode) * If the restore fails after this function has been called, * platform_restore_cleanup() must be called. */ -int platform_pre_restore(int platform_mode) +static int platform_pre_restore(int platform_mode) { return (platform_mode && hibernation_ops) ? 
hibernation_ops->pre_restore() : 0; @@ -210,7 +210,7 @@ int platform_pre_restore(int platform_mode) * function must be called too, regardless of the result of * platform_pre_restore(). */ -void platform_restore_cleanup(int platform_mode) +static void platform_restore_cleanup(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->restore_cleanup(); @@ -220,7 +220,7 @@ void platform_restore_cleanup(int platform_mode) * platform_recover - Recover from a failure to suspend devices. * @platform_mode: Whether or not to use the platform driver. */ -void platform_recover(int platform_mode) +static void platform_recover(int platform_mode) { if (platform_mode && hibernation_ops && hibernation_ops->recover) hibernation_ops->recover(); @@ -648,9 +648,6 @@ int hibernate(void) { int error; - if (test_action_state(TOI_REPLACE_SWSUSP)) - return try_tuxonice_hibernate(); - if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); return -EPERM; @@ -740,19 +737,11 @@ int hibernate(void) * attempts to recover gracefully and make the kernel return to the normal mode * of operation. */ -int software_resume(void) +static int software_resume(void) { int error; unsigned int flags; - resume_attempted = 1; - - /* - * We can't know (until an image header - if any - is loaded), whether - * we did override swsusp. We therefore ensure that both are tried. - */ - try_tuxonice_resume(); - /* * If the user said "noresume".. bail out early. */ @@ -1139,7 +1128,6 @@ static int __init hibernate_setup(char *str) static int __init noresume_setup(char *str) { noresume = 1; - set_toi_state(TOI_NORESUME_SPECIFIED); return 1; } diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999..27946975e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; } -static ssize_t pm_wakeup_irq_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} -power_attr(pm_wakeup_irq); +power_attr_ro(pm_wakeup_irq); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} @@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/power/power.h b/kernel/power/power.h index b5c9efb36..efe1b3b17 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -36,12 +36,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) return arch_hibernation_header_restore(info) ? "architecture specific data" : NULL; } -#else -extern char *check_image_kernel(struct swsusp_info *info); #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ -extern int init_header(struct swsusp_info *info); -extern char resume_file[256]; /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] 
@@ -81,7 +77,14 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } -extern struct pbe *restore_pblist; +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; @@ -266,31 +269,6 @@ static inline void suspend_thaw_processes(void) } #endif -extern struct page *saveable_page(struct zone *z, unsigned long p); -#ifdef CONFIG_HIGHMEM -struct page *saveable_highmem_page(struct zone *z, unsigned long p); -#else -static -inline void *saveable_highmem_page(struct zone *z, unsigned long p) -{ - return NULL; -} -#endif - -#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe)) -extern struct list_head nosave_regions; - -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. - */ - -struct nosave_region { - struct list_head list; - unsigned long start_pfn; - unsigned long end_pfn; -}; - #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ @@ -317,10 +295,3 @@ extern int pm_wake_lock(const char *buf); extern int pm_wake_unlock(const char *buf); #endif /* !CONFIG_PM_WAKELOCKS */ - -#ifdef CONFIG_TOI -unsigned long toi_get_nonconflicting_page(void); -#define BM_END_OF_MAP (~0UL) -#else -#define toi_get_nonconflicting_page() (0) -#endif diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 542163a01..3a9706043 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -36,9 +36,6 @@ #include <asm/tlbflush.h> #include <asm/io.h> -#include "tuxonice_modules.h" -#include "tuxonice_builtin.h" -#include "tuxonice_alloc.h" #include "power.h" static int swsusp_page_is_free(struct page *); @@ -101,9 +98,6 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; - if (toi_running) - return (void *) toi_get_nonconflicting_page(); - res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) while (res && swsusp_page_is_free(virt_to_page(res))) { @@ -149,11 +143,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free) page = virt_to_page(addr); - if (toi_running) { - toi__free_page(29, page); - return; - } - swsusp_unset_page_forbidden(page); if (clear_nosave_free) swsusp_unset_page_free(page); @@ -313,15 +302,13 @@ struct bm_position { int node_bit; }; -#define BM_POSITION_SLOTS (NR_CPUS * 2) - struct memory_bitmap { struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects */ - struct bm_position cur[BM_POSITION_SLOTS]; /* most recently used bit position */ + struct bm_position cur; /* most recently used bit position */ }; /* Functions that operate on memory bitmaps */ @@ -486,39 +473,16 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, free_image_page(node->data, clear_nosave_free); } -void memory_bm_position_reset(struct memory_bitmap *bm) +static void memory_bm_position_reset(struct memory_bitmap *bm) { - int index; - - for (index = 0; index < BM_POSITION_SLOTS; index++) { - bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, list); - bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, + bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); - bm->cur[index].node_pfn = 0; - bm->cur[index].node_bit = 0; - } + bm->cur.node_pfn = 
0; + bm->cur.node_bit = 0; } -static void memory_bm_clear_current(struct memory_bitmap *bm, int index); -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); - -/** - * memory_bm_clear - * @param bm - The bitmap to clear - * - * Only run while single threaded - locking not needed - */ -void memory_bm_clear(struct memory_bitmap *bm) -{ - memory_bm_position_reset(bm); - - while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) { - memory_bm_clear_current(bm, 0); - } - - memory_bm_position_reset(bm); -} static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); struct mem_extent { @@ -631,8 +595,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) } bm->p_list = ca.chain; - - memory_bm_position_reset(bm); + memory_bm_position_reset(bm); Exit: free_mem_extents(&mem_extents); return error; @@ -668,24 +631,14 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * It walks the radix tree to find the page which contains the bit for * pfn and returns the bit position in **addr and *bit_nr. */ -int memory_bm_find_bit(struct memory_bitmap *bm, int index, - unsigned long pfn, void **addr, unsigned int *bit_nr) +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) { struct mem_zone_bm_rtree *curr, *zone; struct rtree_node *node; int i, block_nr; - if (!bm->cur[index].zone) { - // Reset - bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, - list); - bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, - struct rtree_node, list); - bm->cur[index].node_pfn = 0; - bm->cur[index].node_bit = 0; - } - - zone = bm->cur[index].zone; + zone = bm->cur.zone; if (pfn >= zone->start_pfn && pfn < zone->end_pfn) goto zone_found; @@ -709,8 +662,8 @@ zone_found: * node for our pfn. 
*/ - node = bm->cur[index].node; - if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn) + node = bm->cur.node; + if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; node = zone->rtree; @@ -727,9 +680,9 @@ zone_found: node_found: /* Update last position */ - bm->cur[index].zone = zone; - bm->cur[index].node = node; - bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + bm->cur.zone = zone; + bm->cur.node = node; + bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; /* Set return values */ *addr = node->data; @@ -738,66 +691,66 @@ node_found: return 0; } -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn) +static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); set_bit(bit, addr); } -int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn) +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); if (!error) set_bit(bit, addr); return error; } -void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn) +static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); clear_bit(bit, addr); } -static void memory_bm_clear_current(struct memory_bitmap *bm, int index) +static void memory_bm_clear_current(struct memory_bitmap *bm) { int bit; - bit = max(bm->cur[index].node_bit - 1, 0); - clear_bit(bit, bm->cur[index].node->data); + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); } -int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn) +static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); BUG_ON(error); return test_bit(bit, addr); } -static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn) +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; - return !memory_bm_find_bit(bm, index, pfn, &addr, &bit); + return !memory_bm_find_bit(bm, pfn, &addr, &bit); } /* @@ -810,25 +763,25 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned * * Returns true if there is a next node, false otherwise. 
*/ -static bool rtree_next_node(struct memory_bitmap *bm, int index) +static bool rtree_next_node(struct memory_bitmap *bm) { - bm->cur[index].node = list_entry(bm->cur[index].node->list.next, + bm->cur.node = list_entry(bm->cur.node->list.next, struct rtree_node, list); - if (&bm->cur[index].node->list != &bm->cur[index].zone->leaves) { - bm->cur[index].node_pfn += BM_BITS_PER_BLOCK; - bm->cur[index].node_bit = 0; + if (&bm->cur.node->list != &bm->cur.zone->leaves) { + bm->cur.node_pfn += BM_BITS_PER_BLOCK; + bm->cur.node_bit = 0; touch_softlockup_watchdog(); return true; } /* No more nodes, goto next zone */ - bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next, + bm->cur.zone = list_entry(bm->cur.zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur[index].zone->list != &bm->zones) { - bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, + if (&bm->cur.zone->list != &bm->zones) { + bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); - bm->cur[index].node_pfn = 0; - bm->cur[index].node_bit = 0; + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; return true; } @@ -846,29 +799,38 @@ static bool rtree_next_node(struct memory_bitmap *bm, int index) * It is required to run memory_bm_position_reset() before the * first call to this function. */ -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index) +static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { unsigned long bits, pfn, pages; int bit; - index += NR_CPUS; /* Iteration state is separated from get/set/test */ - do { - pages = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn; - bits = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK); - bit = find_next_bit(bm->cur[index].node->data, bits, - bm->cur[index].node_bit); + pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; + bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur.node->data, bits, + bm->cur.node_bit); if (bit < bits) { - pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit; - bm->cur[index].node_bit = bit + 1; + pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; + bm->cur.node_bit = bit + 1; return pfn; } - } while (rtree_next_node(bm, index)); + } while (rtree_next_node(bm)); return BM_END_OF_MAP; } -LIST_HEAD(nosave_regions); +/** + * This structure represents a range of page frames the contents of which + * should not be saved during the suspend. + */ + +struct nosave_region { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + +static LIST_HEAD(nosave_regions); /** * register_nosave_region - register a range of page frames the contents @@ -927,37 +889,37 @@ static struct memory_bitmap *free_pages_map; void swsusp_set_page_free(struct page *page) { if (free_pages_map) - memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page)); + memory_bm_set_bit(free_pages_map, page_to_pfn(page)); } static int swsusp_page_is_free(struct page *page) { return free_pages_map ? 
- memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0; + memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; } void swsusp_unset_page_free(struct page *page) { if (free_pages_map) - memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page)); + memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); } static void swsusp_set_page_forbidden(struct page *page) { if (forbidden_pages_map) - memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page)); + memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); } int swsusp_page_is_forbidden(struct page *page) { return forbidden_pages_map ? - memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0; + memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; } static void swsusp_unset_page_forbidden(struct page *page) { if (forbidden_pages_map) - memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page)); + memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); } /** @@ -988,7 +950,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) * touch the PFNs for which the error is * returned anyway. */ - mem_bm_set_bit_check(bm, 0, pfn); + mem_bm_set_bit_check(bm, pfn); } } } @@ -1116,7 +1078,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ -struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -1163,6 +1125,11 @@ static unsigned int count_highmem_pages(void) } return n; } +#else +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -1173,7 +1140,7 @@ static unsigned int count_highmem_pages(void) * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. */ -struct page *saveable_page(struct zone *zone, unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -1311,15 +1278,15 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) - memory_bm_set_bit(orig_bm, 0, pfn); + memory_bm_set_bit(orig_bm, pfn); } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); for(;;) { - pfn = memory_bm_next_pfn(orig_bm, 0); + pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; - copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn); + copy_data_page(memory_bm_next_pfn(copy_bm), pfn); } } @@ -1365,8 +1332,8 @@ void swsusp_free(void) memory_bm_position_reset(free_pages_map); loop: - fr_pfn = memory_bm_next_pfn(free_pages_map, 0); - fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0); + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); /* * Find the next bit set in both bitmaps. 
This is guaranteed to @@ -1374,16 +1341,16 @@ loop: */ do { if (fb_pfn < fr_pfn) - fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); if (fr_pfn < fb_pfn) - fr_pfn = memory_bm_next_pfn(free_pages_map, 0); + fr_pfn = memory_bm_next_pfn(free_pages_map); } while (fb_pfn != fr_pfn); if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { struct page *page = pfn_to_page(fr_pfn); - memory_bm_clear_current(forbidden_pages_map, 0); - memory_bm_clear_current(free_pages_map, 0); + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); __free_page(page); goto loop; } @@ -1418,7 +1385,7 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) page = alloc_image_page(mask); if (!page) break; - memory_bm_set_bit(©_bm, 0, page_to_pfn(page)); + memory_bm_set_bit(©_bm, page_to_pfn(page)); if (PageHighMem(page)) alloc_highmem++; else @@ -1514,7 +1481,7 @@ static unsigned long free_unnecessary_pages(void) memory_bm_position_reset(©_bm); while (to_free_normal > 0 || to_free_highmem > 0) { - unsigned long pfn = memory_bm_next_pfn(©_bm, 0); + unsigned long pfn = memory_bm_next_pfn(©_bm); struct page *page = pfn_to_page(pfn); if (PageHighMem(page)) { @@ -1528,7 +1495,7 @@ static unsigned long free_unnecessary_pages(void) to_free_normal--; alloc_normal--; } - memory_bm_clear_bit(©_bm, 0, pfn); + memory_bm_clear_bit(©_bm, pfn); swsusp_unset_page_forbidden(page); swsusp_unset_page_free(page); __free_page(page); @@ -1813,7 +1780,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) struct page *page; page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); - memory_bm_set_bit(bm, 0, page_to_pfn(page)); + memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; } @@ -1856,7 +1823,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); if (!page) goto err_out; - memory_bm_set_bit(copy_bm, 0, page_to_pfn(page)); + memory_bm_set_bit(copy_bm, page_to_pfn(page)); } } @@ -1871,9 +1838,6 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - if (toi_running) - return toi_post_context_save(); - printk(KERN_INFO "PM: Creating hibernation image:\n"); drain_local_pages(NULL); @@ -1921,7 +1885,7 @@ static int init_header_complete(struct swsusp_info *info) return 0; } -char *check_image_kernel(struct swsusp_info *info) +static char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; @@ -1942,7 +1906,7 @@ unsigned long snapshot_get_image_size(void) return nr_copy_pages + nr_meta_pages + 1; } -int init_header(struct swsusp_info *info) +static int init_header(struct swsusp_info *info) { memset(info, 0, sizeof(struct swsusp_info)); info->num_physpages = get_num_physpages(); @@ -1964,7 +1928,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { - buf[j] = memory_bm_next_pfn(bm, 0); + buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; /* Save page key for data page (s390 only). 
*/ @@ -2015,7 +1979,7 @@ int snapshot_read_next(struct snapshot_handle *handle) } else { struct page *page; - page = pfn_to_page(memory_bm_next_pfn(©_bm, 0)); + page = pfn_to_page(memory_bm_next_pfn(©_bm)); if (PageHighMem(page)) { /* Highmem pages are copied to the buffer, * because we can't return with a kmapped @@ -2057,7 +2021,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) /* Mark pages that correspond to the "original" pfns as "unsafe" */ memory_bm_position_reset(bm); do { - pfn = memory_bm_next_pfn(bm, 0); + pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { if (likely(pfn_valid(pfn))) swsusp_set_page_free(pfn_to_page(pfn)); @@ -2077,10 +2041,10 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) unsigned long pfn; memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src, 0); + pfn = memory_bm_next_pfn(src); while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, 0, pfn); - pfn = memory_bm_next_pfn(src, 0); + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); } } @@ -2131,8 +2095,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) /* Extract and buffer page key for data page (s390 only). */ page_key_memorize(buf + j); - if (memory_bm_pfn_present(bm, 0, buf[j])) - memory_bm_set_bit(bm, 0, buf[j]); + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); else return -EFAULT; } @@ -2175,12 +2139,12 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) unsigned int cnt = 0; memory_bm_position_reset(bm); - pfn = memory_bm_next_pfn(bm, 0); + pfn = memory_bm_next_pfn(bm); while (pfn != BM_END_OF_MAP) { if (PageHighMem(pfn_to_page(pfn))) cnt++; - pfn = memory_bm_next_pfn(bm, 0); + pfn = memory_bm_next_pfn(bm); } return cnt; } @@ -2225,7 +2189,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) page = alloc_page(__GFP_HIGHMEM); if (!swsusp_page_is_free(page)) { /* The page is "safe", set its bit the bitmap */ - memory_bm_set_bit(bm, 0, page_to_pfn(page)); + memory_bm_set_bit(bm, page_to_pfn(page)); safe_highmem_pages++; } /* Mark the page as allocated */ @@ -2283,7 +2247,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) /* Copy of the page will be stored in high memory */ kaddr = buffer; - tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0)); + tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); safe_highmem_pages--; last_highmem_page = tmp; pbe->copy_page = tmp; @@ -2454,7 +2418,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; struct page *page; - unsigned long pfn = memory_bm_next_pfn(bm, 0); + unsigned long pfn = memory_bm_next_pfn(bm); if (pfn == BM_END_OF_MAP) return ERR_PTR(-EFAULT); @@ -2641,82 +2605,3 @@ int restore_highmem(void) return 0; } #endif /* CONFIG_HIGHMEM */ - -struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map, - *pageset1_copy_map, *io_map, *page_resave_map, *compare_map; - -int resume_attempted; - -int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) -{ - int result; - - memory_bm_position_reset(bm); - - do { - result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE); - - if (result) - return result; - } while (rtree_next_node(bm, 0)); - return 0; -} - -int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) -{ - int result; 
- - memory_bm_position_reset(bm); - - do { - result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE); - - if (result) - return result; - - } while (rtree_next_node(bm, 0)); - return 0; -} - -int memory_bm_space_needed(struct memory_bitmap *bm) -{ - unsigned long bytes = 0; - - memory_bm_position_reset(bm); - do { - bytes += PAGE_SIZE; - } while (rtree_next_node(bm, 0)); - return bytes; -} - -int toi_alloc_bitmap(struct memory_bitmap **bm) -{ - int error; - struct memory_bitmap *bm1; - - bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); - if (!bm1) - return -ENOMEM; - - error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); - if (error) { - printk("Error returned - %d.\n", error); - kfree(bm1); - return -ENOMEM; - } - - *bm = bm1; - return 0; -} - -void toi_free_bitmap(struct memory_bitmap **bm) -{ - if (!*bm) - return; - - memory_bm_free(*bm, 0); - kfree(*bm); - *bm = NULL; -} diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h deleted file mode 100644 index 10b65633f..000000000 --- a/kernel/power/tuxonice.h +++ /dev/null @@ -1,260 +0,0 @@ -/* - * kernel/power/tuxonice.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations used throughout swsusp. - * - */ - -#ifndef KERNEL_POWER_TOI_H -#define KERNEL_POWER_TOI_H - -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/suspend.h> -#include <linux/fs.h> -#include <asm/setup.h> -#include "tuxonice_pageflags.h" -#include "power.h" - -#define TOI_CORE_VERSION "3.3" -#define TOI_HEADER_VERSION 3 -#define MY_BOOT_KERNEL_DATA_VERSION 4 - -struct toi_boot_kernel_data { - int version; - int size; - unsigned long toi_action; - unsigned long toi_debug_state; - u32 toi_default_console_level; - int toi_io_time[2][2]; - char toi_nosave_commandline[COMMAND_LINE_SIZE]; - unsigned long pages_used[33]; - unsigned long incremental_bytes_in; - unsigned long incremental_bytes_out; - unsigned long compress_bytes_in; - unsigned long compress_bytes_out; - unsigned long pruned_pages; -}; - -extern struct toi_boot_kernel_data toi_bkd; - -/* Location of book kernel data struct in kernel being resumed */ -extern unsigned long boot_kernel_data_buffer; - -/* == Action states == */ - -enum { - TOI_REBOOT, - TOI_PAUSE, - TOI_LOGALL, - TOI_CAN_CANCEL, - TOI_KEEP_IMAGE, - TOI_FREEZER_TEST, - TOI_SINGLESTEP, - TOI_PAUSE_NEAR_PAGESET_END, - TOI_TEST_FILTER_SPEED, - TOI_TEST_BIO, - TOI_NO_PAGESET2, - TOI_IGNORE_ROOTFS, - TOI_REPLACE_SWSUSP, - TOI_PAGESET2_FULL, - TOI_ABORT_ON_RESAVE_NEEDED, - TOI_NO_MULTITHREADED_IO, - TOI_NO_DIRECT_LOAD, /* Obsolete */ - TOI_LATE_CPU_HOTPLUG, /* Obsolete */ - TOI_GET_MAX_MEM_ALLOCD, - TOI_NO_FLUSHER_THREAD, - TOI_NO_PS2_IF_UNNEEDED, - TOI_POST_RESUME_BREAKPOINT, - TOI_NO_READAHEAD, - TOI_TRACE_DEBUG_ON, - TOI_INCREMENTAL_IMAGE, -}; - -extern unsigned long toi_bootflags_mask; - -#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action)) - -/* == Result states == */ - -enum { - TOI_ABORTED, - TOI_ABORT_REQUESTED, - TOI_NOSTORAGE_AVAILABLE, - TOI_INSUFFICIENT_STORAGE, - TOI_FREEZING_FAILED, - TOI_KEPT_IMAGE, - TOI_WOULD_EAT_MEMORY, - TOI_UNABLE_TO_FREE_ENOUGH_MEMORY, - TOI_PM_SEM, - TOI_DEVICE_REFUSED, - TOI_SYSDEV_REFUSED, - TOI_EXTRA_PAGES_ALLOW_TOO_SMALL, - TOI_UNABLE_TO_PREPARE_IMAGE, - TOI_FAILED_MODULE_INIT, - TOI_FAILED_MODULE_CLEANUP, - TOI_FAILED_IO, - TOI_OUT_OF_MEMORY, - TOI_IMAGE_ERROR, - TOI_PLATFORM_PREP_FAILED, - TOI_CPU_HOTPLUG_FAILED, - 
TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */ - TOI_RESAVE_NEEDED, - TOI_CANT_SUSPEND, - TOI_NOTIFIERS_PREPARE_FAILED, - TOI_PRE_SNAPSHOT_FAILED, - TOI_PRE_RESTORE_FAILED, - TOI_USERMODE_HELPERS_ERR, - TOI_CANT_USE_ALT_RESUME, - TOI_HEADER_TOO_BIG, - TOI_WAKEUP_EVENT, - TOI_SYSCORE_REFUSED, - TOI_DPM_PREPARE_FAILED, - TOI_DPM_SUSPEND_FAILED, - TOI_NUM_RESULT_STATES /* Used in printing debug info only */ -}; - -extern unsigned long toi_result; - -#define set_result_state(bit) (test_and_set_bit(bit, &toi_result)) -#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \ - test_and_set_bit(bit, &toi_result)) -#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result)) -#define test_result_state(bit) (test_bit(bit, &toi_result)) - -/* == Debug sections and levels == */ - -/* debugging levels. */ -enum { - TOI_STATUS = 0, - TOI_ERROR = 2, - TOI_LOW, - TOI_MEDIUM, - TOI_HIGH, - TOI_VERBOSE, -}; - -enum { - TOI_ANY_SECTION, - TOI_EAT_MEMORY, - TOI_IO, - TOI_HEADER, - TOI_WRITER, - TOI_MEMORY, - TOI_PAGEDIR, - TOI_COMPRESS, - TOI_BIO, -}; - -#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state)) -#define clear_debug_state(bit) \ - (test_and_clear_bit(bit, &toi_bkd.toi_debug_state)) -#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state)) - -/* == Steps in hibernating == */ - -enum { - STEP_HIBERNATE_PREPARE_IMAGE, - STEP_HIBERNATE_SAVE_IMAGE, - STEP_HIBERNATE_POWERDOWN, - STEP_RESUME_CAN_RESUME, - STEP_RESUME_LOAD_PS1, - STEP_RESUME_DO_RESTORE, - STEP_RESUME_READ_PS2, - STEP_RESUME_GO, - STEP_RESUME_ALT_IMAGE, - STEP_CLEANUP, - STEP_QUIET_CLEANUP -}; - -/* == TuxOnIce states == - (see also include/linux/suspend.h) */ - -#define get_toi_state() (toi_state) -#define restore_toi_state(saved_state) \ - do { toi_state = saved_state; } while (0) - -/* == Module support == */ - -struct toi_core_fns { - int (*post_context_save)(void); - unsigned long (*get_nonconflicting_page)(void); - int (*try_hibernate)(void); - void (*try_resume)(void); -}; - -extern struct toi_core_fns *toi_core_fns; - -/* == All else == */ -#define KB(x) ((x) << (PAGE_SHIFT - 10)) -#define MB(x) ((x) >> (20 - PAGE_SHIFT)) - -extern int toi_start_anything(int toi_or_resume); -extern void toi_finish_anything(int toi_or_resume); - -extern int save_image_part1(void); -extern int toi_atomic_restore(void); - -extern int toi_try_hibernate(void); -extern void toi_try_resume(void); - -extern int __toi_post_context_save(void); - -extern unsigned int nr_hibernates; -extern char alt_resume_param[256]; - -extern void copyback_post(void); -extern int toi_hibernate(void); -extern unsigned long extra_pd1_pages_used; - -#define SECTOR_SIZE 512 - -extern void toi_early_boot_message(int can_erase_image, int default_answer, - char *warning_reason, ...); - -extern int do_check_can_resume(void); -extern int do_toi_step(int step); -extern int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug); - -extern char tuxonice_signature[9]; - -extern int toi_start_other_threads(void); -extern void toi_stop_other_threads(void); - -extern int toi_trace_index; -#define TOI_TRACE_DEBUG(PFN, DESC, ...) 
\ - do { \ - if (test_action_state(TOI_TRACE_DEBUG_ON)) { \ - printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \ - } \ - } while(0) - -#ifdef CONFIG_TOI_KEEP_IMAGE -#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE)) -#else -#define toi_keeping_image (0) -#endif - -#ifdef CONFIG_TOI_INCREMENTAL -extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose); -extern int toi_reset_dirtiness(int verbose); -extern void toi_cbw_write(void); -extern void toi_cbw_restore(void); -extern int toi_allocate_cbw_data(void); -extern void toi_free_cbw_data(void); -extern int toi_cbw_init(void); -extern void toi_mark_tasks_cbw(void); -#else -static inline int toi_reset_dirtiness(int verbose) { return 0; } -#define toi_cbw_write() do { } while(0) -#define toi_cbw_restore() do { } while(0) -#define toi_allocate_cbw_data() do { } while(0) -#define toi_free_cbw_data() do { } while(0) -static inline int toi_cbw_init(void) { return 0; } -#endif -#endif diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c deleted file mode 100644 index 1d8b1cbda..000000000 --- a/kernel/power/tuxonice_alloc.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.c - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include <linux/export.h> -#include <linux/slab.h> -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -#define TOI_ALLOC_PATHS 41 - -static DEFINE_MUTEX(toi_alloc_mutex); - -static struct toi_module_ops toi_alloc_ops; - -static int toi_fail_num; - -static atomic_t toi_alloc_count[TOI_ALLOC_PATHS], - toi_free_count[TOI_ALLOC_PATHS], - toi_test_count[TOI_ALLOC_PATHS], - toi_fail_count[TOI_ALLOC_PATHS]; -static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS]; -static int cur_allocd, max_allocd; - -static char *toi_alloc_desc[TOI_ALLOC_PATHS] = { - "", /* 0 */ - "get_io_info_struct", - "extent", - "extent (loading chain)", - "userui channel", - "userui arg", /* 5 */ - "attention list metadata", - "extra pagedir memory metadata", - "bdev metadata", - "extra pagedir memory", - "header_locations_read", /* 10 */ - "bio queue", - "prepare_readahead", - "i/o buffer", - "writer buffer in bio_init", - "checksum buffer", /* 15 */ - "compression buffer", - "filewriter signature op", - "set resume param alloc1", - "set resume param alloc2", - "debugging info buffer", /* 20 */ - "check can resume buffer", - "write module config buffer", - "read module config buffer", - "write image header buffer", - "read pageset1 buffer", /* 25 */ - "get_have_image_data buffer", - "checksum page", - "worker rw loop", - "get nonconflicting page", - "ps1 load addresses", /* 30 */ - "remove swap image", - "swap image exists", - "swap parse sig location", - "sysfs kobj", - "swap mark resume attempted buffer", /* 35 */ - "cluster member", - "boot kernel data buffer", - "setting swap signature", - "block i/o bdev struct", - "copy before write", /* 40 */ -}; - -#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \ - do { \ - BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \ - \ - if (FAIL_NUM == toi_fail_num) { \ - atomic_inc(&toi_test_count[FAIL_NUM]); \ - toi_fail_num = 0; \ - return FAIL_VAL; \ - } \ - } while (0) - -static void alloc_update_stats(int fail_num, void *result, int size) -{ - if (!result) { - atomic_inc(&toi_fail_count[fail_num]); - return; - } - - 
atomic_inc(&toi_alloc_count[fail_num]); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - toi_cur_allocd[fail_num]++; - cur_allocd += size; - if (unlikely(cur_allocd > max_allocd)) { - int i; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - toi_max_allocd[i] = toi_cur_allocd[i]; - max_allocd = cur_allocd; - } - mutex_unlock(&toi_alloc_mutex); - } -} - -static void free_update_stats(int fail_num, int size) -{ - BUG_ON(fail_num >= TOI_ALLOC_PATHS); - atomic_inc(&toi_free_count[fail_num]); - if (unlikely(atomic_read(&toi_free_count[fail_num]) > - atomic_read(&toi_alloc_count[fail_num]))) - dump_stack(); - if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { - mutex_lock(&toi_alloc_mutex); - cur_allocd -= size; - toi_cur_allocd[fail_num]--; - mutex_unlock(&toi_alloc_mutex); - } -} - -void *toi_kzalloc(int fail_num, size_t size, gfp_t flags) -{ - void *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - result = kzalloc(size, flags); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, result, size); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order) -{ - unsigned long result; - - mask |= ___GFP_TOI_NOTRACK; - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - result = __get_free_pages(mask, order); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, - PAGE_SIZE << order); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -struct page *toi_alloc_page(int fail_num, gfp_t mask) -{ - struct page *result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, NULL); - mask |= ___GFP_TOI_NOTRACK; - result = alloc_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask) -{ - unsigned long result; - - if (toi_alloc_ops.enabled) - MIGHT_FAIL(fail_num, 0); - mask |= ___GFP_TOI_NOTRACK; - result = get_zeroed_page(mask); - if (toi_alloc_ops.enabled) - alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); - if (fail_num == toi_trace_allocs) - dump_stack(); - return result; -} - -void toi_kfree(int fail_num, const void *arg, int size) -{ - if (arg && toi_alloc_ops.enabled) - free_update_stats(fail_num, size); - - if (fail_num == toi_trace_allocs) - dump_stack(); - kfree(arg); -} - -void toi_free_page(int fail_num, unsigned long virt) -{ - if (virt && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - free_page(virt); -} - -void toi__free_page(int fail_num, struct page *page) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_page(page); -} - -void toi_free_pages(int fail_num, struct page *page, int order) -{ - if (page && toi_alloc_ops.enabled) - free_update_stats(fail_num, PAGE_SIZE << order); - - if (fail_num == toi_trace_allocs) - dump_stack(); - __free_pages(page, order); -} - -void toi_alloc_print_debug_stats(void) -{ - int i, header_done = 0; - - if (!toi_alloc_ops.enabled) - return; - - for (i = 0; i < TOI_ALLOC_PATHS; i++) - if (atomic_read(&toi_alloc_count[i]) != - atomic_read(&toi_free_count[i])) { - if (!header_done) { - printk(KERN_INFO "Idx Allocs Frees Tests " - " Fails Max Description\n"); - header_done = 1; - } - - 
printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i, - atomic_read(&toi_alloc_count[i]), - atomic_read(&toi_free_count[i]), - atomic_read(&toi_test_count[i]), - atomic_read(&toi_fail_count[i]), - toi_max_allocd[i], - toi_alloc_desc[i]); - } -} - -static int toi_alloc_initialise(int starting_cycle) -{ - int i; - - if (!starting_cycle) - return 0; - - if (toi_trace_allocs) - dump_stack(); - - for (i = 0; i < TOI_ALLOC_PATHS; i++) { - atomic_set(&toi_alloc_count[i], 0); - atomic_set(&toi_free_count[i], 0); - atomic_set(&toi_test_count[i], 0); - atomic_set(&toi_fail_count[i], 0); - toi_cur_allocd[i] = 0; - toi_max_allocd[i] = 0; - }; - - max_allocd = 0; - cur_allocd = 0; - return 0; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL), - SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0, - NULL), - SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action, - TOI_GET_MAX_MEM_ALLOCD, 0), - SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0, - NULL) -}; - -static struct toi_module_ops toi_alloc_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "allocation debugging", - .directory = "alloc", - .module = THIS_MODULE, - .early = 1, - .initialise = toi_alloc_initialise, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_alloc_init(void) -{ - int result = toi_register_module(&toi_alloc_ops); - return result; -} - -void toi_alloc_exit(void) -{ - toi_unregister_module(&toi_alloc_ops); -} diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h deleted file mode 100644 index 0cd6b686f..000000000 --- a/kernel/power/tuxonice_alloc.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * kernel/power/tuxonice_alloc.h - * - * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - */ - -#include <linux/slab.h> -#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN) -#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN) - -#ifdef CONFIG_PM_DEBUG -extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags); -extern void toi_kfree(int fail_num, const void *arg, int size); - -extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask, - unsigned int order); -#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0) -extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask); -extern void toi_free_page(int fail_num, unsigned long buf); -extern void toi__free_page(int fail_num, struct page *page); -extern void toi_free_pages(int fail_num, struct page *page, int order); -extern struct page *toi_alloc_page(int fail_num, gfp_t mask); -extern int toi_alloc_init(void); -extern void toi_alloc_exit(void); - -extern void toi_alloc_print_debug_stats(void); - -#else /* CONFIG_PM_DEBUG */ - -#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS)) -#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN)) - -#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER) -#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS) -#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS) -#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0) -#define toi__free_page(FAIL, PAGE) __free_page(PAGE) -#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER) -#define toi_alloc_page(FAIL, MASK) alloc_page(MASK) -static inline int toi_alloc_init(void) -{ - return 0; -} - -static inline void toi_alloc_exit(void) { } - -static inline void toi_alloc_print_debug_stats(void) { } - -#endif - -extern int toi_trace_allocs; diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c deleted file mode 100644 index 5845217f8..000000000 --- a/kernel/power/tuxonice_atomic_copy.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.c - * - * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/cpu.h> -#include <linux/freezer.h> -#include <linux/console.h> -#include <linux/syscore_ops.h> -#include <linux/ftrace.h> -#include <asm/suspend.h> -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_power_off.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" - -unsigned long extra_pd1_pages_used; - -/** - * free_pbe_list - free page backup entries used by the atomic copy code. - * @list: List to free. - * @highmem: Whether the list is in highmem. - * - * Normally, this function isn't used. If, however, we need to abort before - * doing the atomic copy, we use this to free the pbes previously allocated. 
- **/ -static void free_pbe_list(struct pbe **list, int highmem) -{ - while (*list) { - int i; - struct pbe *free_pbe, *next_page = NULL; - struct page *page; - - if (highmem) { - page = (struct page *) *list; - free_pbe = (struct pbe *) kmap(page); - } else { - page = virt_to_page(*list); - free_pbe = *list; - } - - for (i = 0; i < PBES_PER_PAGE; i++) { - if (!free_pbe) - break; - if (highmem) - toi__free_page(29, free_pbe->address); - else - toi_free_page(29, - (unsigned long) free_pbe->address); - free_pbe = free_pbe->next; - } - - if (highmem) { - if (free_pbe) - next_page = free_pbe; - kunmap(page); - } else { - if (free_pbe) - next_page = free_pbe; - } - - toi__free_page(29, page); - *list = (struct pbe *) next_page; - }; -} - -/** - * copyback_post - post atomic-restore actions - * - * After doing the atomic restore, we have a few more things to do: - * 1) We want to retain some values across the restore, so we now copy - * these from the nosave variables to the normal ones. - * 2) Set the status flags. - * 3) Resume devices. - * 4) Tell userui so it can redraw & restore settings. - * 5) Reread the page cache. - **/ -void copyback_post(void) -{ - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - toi_post_atomic_restore_modules(bkd); - - toi_cond_pause(1, "About to reload secondary pagedir."); - - if (read_pageset2(0)) - panic("Unable to successfully reread the page cache."); - - /* - * If the user wants to sleep again after resuming from full-off, - * it's most likely to be in order to suspend to ram, so we'll - * do this check after loading pageset2, to give them the fastest - * wakeup when they are ready to use the computer again. - */ - toi_check_resleep(); - - if (test_action_state(TOI_INCREMENTAL_IMAGE)) - toi_reset_dirtiness(1); -} - -/** - * toi_copy_pageset1 - do the atomic copy of pageset1 - * - * Make the atomic copy of pageset1. We can't use copy_page (as we once did) - * because we can't be sure what side effects it has. On my old Duron, with - * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt - * count at resume time 4 instead of 3. - * - * We don't want to call kmap_atomic unconditionally because it has the side - * effect of incrementing the preempt count, which will leave it one too high - * post resume (the page containing the preempt count will be copied after - * its incremented. This is essentially the same problem. - **/ -void toi_copy_pageset1(void) -{ - int i; - unsigned long source_index, dest_index; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - - for (i = 0; i < pagedir1.size; i++) { - unsigned long *origvirt, *copyvirt; - struct page *origpage, *copypage; - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1, - was_present1, was_present2; - - origpage = pfn_to_page(source_index); - copypage = pfn_to_page(dest_index); - - origvirt = PageHighMem(origpage) ? - kmap_atomic(origpage) : - page_address(origpage); - - copyvirt = PageHighMem(copypage) ? 
- kmap_atomic(copypage) : - page_address(copypage); - - was_present1 = kernel_page_present(origpage); - if (!was_present1) - kernel_map_pages(origpage, 1, 1); - - was_present2 = kernel_page_present(copypage); - if (!was_present2) - kernel_map_pages(copypage, 1, 1); - - while (loop >= 0) { - *(copyvirt + loop) = *(origvirt + loop); - loop--; - } - - if (!was_present1) - kernel_map_pages(origpage, 1, 0); - - if (!was_present2) - kernel_map_pages(copypage, 1, 0); - - if (PageHighMem(origpage)) - kunmap_atomic(origvirt); - - if (PageHighMem(copypage)) - kunmap_atomic(copyvirt); - - source_index = memory_bm_next_pfn(pageset1_map, 0); - dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); - } -} - -/** - * __toi_post_context_save - steps after saving the cpu context - * - * Steps taken after saving the CPU state to make the actual - * atomic copy. - * - * Called from swsusp_save in snapshot.c via toi_post_context_save. - **/ -int __toi_post_context_save(void) -{ - unsigned long old_ps1_size = pagedir1.size; - - check_checksums(); - - free_checksum_pages(); - - toi_recalculate_image_contents(1); - - extra_pd1_pages_used = pagedir1.size > old_ps1_size ? - pagedir1.size - old_ps1_size : 0; - - if (extra_pd1_pages_used > extra_pd1_pages_allowance) { - printk(KERN_INFO "Pageset1 has grown by %lu pages. " - "extra_pages_allowance is currently only %lu.\n", - pagedir1.size - old_ps1_size, - extra_pd1_pages_allowance); - - /* - * Highlevel code will see this, clear the state and - * retry if we haven't already done so twice. - */ - if (any_to_free(1)) { - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - if (try_allocate_extra_memory()) { - printk(KERN_INFO "Failed to allocate the extra memory" - " needed. Restarting the process."); - set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - return 1; - } - printk(KERN_INFO "However it looks like there's enough" - " free ram and storage to handle this, so " - " continuing anyway."); - /* - * What if try_allocate_extra_memory above calls - * toi_allocate_extra_pagedir_memory and it allocs a new - * slab page via toi_kzalloc which should be in ps1? So... - */ - toi_recalculate_image_contents(1); - } - - if (!test_action_state(TOI_TEST_FILTER_SPEED) && - !test_action_state(TOI_TEST_BIO)) - toi_copy_pageset1(); - - return 0; -} - -/** - * toi_hibernate - high level code for doing the atomic copy - * - * High-level code which prepares to do the atomic copy. Loosely based - * on the swsusp version, but with the following twists: - * - We set toi_running so the swsusp code uses our code paths. - * - We give better feedback regarding what goes wrong if there is a - * problem. - * - We use an extra function to call the assembly, just in case this code - * is in a module (return address). - **/ -int toi_hibernate(void) -{ - int error; - - error = toi_lowlevel_builtin(); - - if (!error) { - struct toi_boot_kernel_data *bkd = - (struct toi_boot_kernel_data *) boot_kernel_data_buffer; - - /* - * The boot kernel's data may be larger (newer version) or - * smaller (older version) than ours. Copy the minimum - * of the two sizes, so that we don't overwrite valid values - * from pre-atomic copy. - */ - - memcpy(&toi_bkd, (char *) boot_kernel_data_buffer, - min_t(int, sizeof(struct toi_boot_kernel_data), - bkd->size)); - } - - return error; -} - -/** - * toi_atomic_restore - prepare to do the atomic restore - * - * Get ready to do the atomic restore. 
This part gets us into the same - * state we are in prior to do calling do_toi_lowlevel while - * hibernating: hot-unplugging secondary cpus and freeze processes, - * before starting the thread that will do the restore. - **/ -int toi_atomic_restore(void) -{ - int error; - - toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore."); - - memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line, - strlen(saved_command_line)); - - toi_pre_atomic_restore_modules(&toi_bkd); - - if (add_boot_kernel_data_pbe()) - goto Failed; - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - if (toi_go_atomic(PMSG_QUIESCE, 0)) - goto Failed; - - /* We'll ignore saved state, but this gets preempt count (etc) right */ - save_processor_state(); - - error = swsusp_arch_resume(); - /* - * Code below is only ever reached in case of failure. Otherwise - * execution continues at place where swsusp_arch_suspend was called. - * - * We don't know whether it's safe to continue (this shouldn't happen), - * so lets err on the side of caution. - */ - BUG(); - -Failed: - free_pbe_list(&restore_pblist, 0); -#ifdef CONFIG_HIGHMEM - free_pbe_list(&restore_highmem_pblist, 1); -#endif - return 1; -} - -/** - * toi_go_atomic - do the actual atomic copy/restore - * @state: The state to use for dpm_suspend_start & power_down calls. - * @suspend_time: Whether we're suspending or resuming. - **/ -int toi_go_atomic(pm_message_t state, int suspend_time) -{ - if (suspend_time) { - if (platform_begin(1)) { - set_abort_result(TOI_PLATFORM_PREP_FAILED); - toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); - return 1; - } - - if (dpm_prepare(PMSG_FREEZE)) { - set_abort_result(TOI_DPM_PREPARE_FAILED); - dpm_complete(PMSG_RECOVER); - toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); - return 1; - } - } - - suspend_console(); - pm_restrict_gfp_mask(); - - if (suspend_time) { - if (dpm_suspend(state)) { - set_abort_result(TOI_DPM_SUSPEND_FAILED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); - return 1; - } - } else { - if (dpm_suspend_start(state)) { - set_abort_result(TOI_DPM_SUSPEND_FAILED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); - return 1; - } - } - - /* At this point, dpm_suspend_start() has been called, but *not* - * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. 
- */ - - if (dpm_suspend_end(state)) { - set_abort_result(TOI_DEVICE_REFUSED); - toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1); - return 1; - } - - if (suspend_time) { - if (platform_pre_snapshot(1)) - set_abort_result(TOI_PRE_SNAPSHOT_FAILED); - } else { - if (platform_pre_restore(1)) - set_abort_result(TOI_PRE_RESTORE_FAILED); - } - - if (test_result_state(TOI_ABORTED)) { - toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1); - return 1; - } - - if (disable_nonboot_cpus()) { - set_abort_result(TOI_CPU_HOTPLUG_FAILED); - toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, - suspend_time, 1); - return 1; - } - - local_irq_disable(); - - if (syscore_suspend()) { - set_abort_result(TOI_SYSCORE_REFUSED); - toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1); - return 1; - } - - if (suspend_time && pm_wakeup_pending()) { - set_abort_result(TOI_WAKEUP_EVENT); - toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1); - return 1; - } - return 0; -} - -/** - * toi_end_atomic - post atomic copy/restore routines - * @stage: What step to start at. - * @suspend_time: Whether we're suspending or resuming. - * @error: Whether we're recovering from an error. - **/ -void toi_end_atomic(int stage, int suspend_time, int error) -{ - pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) : - PMSG_RESTORE; - - switch (stage) { - case ATOMIC_ALL_STEPS: - if (!suspend_time) { - events_check_enabled = false; - } - platform_leave(1); - case ATOMIC_STEP_SYSCORE_RESUME: - syscore_resume(); - case ATOMIC_STEP_IRQS: - local_irq_enable(); - case ATOMIC_STEP_CPU_HOTPLUG: - enable_nonboot_cpus(); - case ATOMIC_STEP_PLATFORM_FINISH: - if (!suspend_time && error & 2) - platform_restore_cleanup(1); - else - platform_finish(1); - dpm_resume_start(msg); - case ATOMIC_STEP_DEVICE_RESUME: - if (suspend_time && (error & 2)) - platform_recover(1); - dpm_resume(msg); - if (!toi_in_suspend()) { - dpm_resume_end(PMSG_RECOVER); - } - if (error || !toi_in_suspend()) { - pm_restore_gfp_mask(); - } - resume_console(); - case ATOMIC_STEP_DPM_COMPLETE: - dpm_complete(msg); - case ATOMIC_STEP_PLATFORM_END: - platform_end(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Post atomic."); - } -} diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h deleted file mode 100644 index e2d2b4fb3..000000000 --- a/kernel/power/tuxonice_atomic_copy.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * kernel/power/tuxonice_atomic_copy.h - * - * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * Routines for doing the atomic save/restore. - */ - -enum { - ATOMIC_ALL_STEPS, - ATOMIC_STEP_SYSCORE_RESUME, - ATOMIC_STEP_IRQS, - ATOMIC_STEP_CPU_HOTPLUG, - ATOMIC_STEP_PLATFORM_FINISH, - ATOMIC_STEP_DEVICE_RESUME, - ATOMIC_STEP_DPM_COMPLETE, - ATOMIC_STEP_PLATFORM_END, -}; - -int toi_go_atomic(pm_message_t state, int toi_time); -void toi_end_atomic(int stage, int toi_time, int error); - -extern void platform_recover(int platform_mode); diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h deleted file mode 100644 index 9d52a3b69..000000000 --- a/kernel/power/tuxonice_bio.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * kernel/power/tuxonice_bio.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
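One structural note before leaving the atomic-copy code: the switch in toi_end_atomic() above intentionally contains no break statements, so entering at any stage also runs every later teardown stage. The idiom reduced to a sketch (the undo_* helpers are hypothetical stand-ins):

static void undo_step3(void) { /* e.g. syscore_resume() */ }
static void undo_step2(void) { /* e.g. local_irq_enable() */ }
static void undo_step1(void) { /* e.g. enable_nonboot_cpus() */ }

static void sketch_unwind(int from_stage)
{
        switch (from_stage) {
        case 3:
                undo_step3();   /* fall through */
        case 2:
                undo_step2();   /* fall through */
        case 1:
                undo_step1();
        }
}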
- */ - -#include <linux/buffer_head.h> -#include "tuxonice_extent.h" - -void toi_put_extent_chain(struct hibernate_extent_chain *chain); -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end); - -struct hibernate_extent_saved_state { - int extent_num; - struct hibernate_extent *extent_ptr; - unsigned long offset; -}; - -struct toi_bdev_info { - struct toi_bdev_info *next; - struct hibernate_extent_chain blocks; - struct block_device *bdev; - struct toi_module_ops *allocator; - int allocator_index; - struct hibernate_extent_chain allocations; - char name[266]; /* "swap on " or "file " + up to 256 chars */ - - /* Saved in header */ - char uuid[17]; - dev_t dev_t; - int prio; - int bmap_shift; - int blocks_per_page; - unsigned long pages_used; - struct hibernate_extent_saved_state saved_state[4]; -}; - -struct toi_extent_iterate_state { - struct toi_bdev_info *current_chain; - int num_chains; - int saved_chain_number[4]; - struct toi_bdev_info *saved_chain_ptr[4]; -}; - -/* - * Our exported interface so the swapwriter and filewriter don't - * need these functions duplicated. - */ -struct toi_bio_ops { - int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, - struct page *page); - int (*register_storage)(struct toi_bdev_info *new); - void (*free_storage)(void); -}; - -struct toi_allocator_ops { - unsigned long (*toi_swap_storage_available) (void); -}; - -extern struct toi_bio_ops toi_bio_ops; - -extern char *toi_writer_buffer; -extern int toi_writer_buffer_posn; - -struct toi_bio_allocator_ops { - int (*register_storage) (void); - unsigned long (*storage_available)(void); - int (*allocate_storage) (struct toi_bdev_info *, unsigned long); - int (*bmap) (struct toi_bdev_info *); - void (*free_storage) (struct toi_bdev_info *); - unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used); -}; diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c deleted file mode 100644 index 086a5527d..000000000 --- a/kernel/power/tuxonice_bio_chains.c +++ /dev/null @@ -1,1126 +0,0 @@ -/* - * kernel/power/tuxonice_bio_devinfo.c - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - */ - -#include <linux/mm_types.h> -#include "tuxonice_bio.h" -#include "tuxonice_bio_internal.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" -#include "tuxonice_io.h" - -static struct toi_bdev_info *prio_chain_head; -static int num_chains; - -/* Pointer to current entry being loaded/saved. */ -struct toi_extent_iterate_state toi_writer_posn; - -#define metadata_size (sizeof(struct toi_bdev_info) - \ - offsetof(struct toi_bdev_info, uuid)) - -/* - * After section 0 (header) comes 2 => next_section[0] = 2 - */ -static int next_section[3] = { 2, 3, 1 }; - -/** - * dump_block_chains - print the contents of the bdev info array. - **/ -void dump_block_chains(void) -{ - int i = 0; - int j; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - struct hibernate_extent *this = cur_chain->blocks.first; - - printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio); - - while (this) { - printk(KERN_CONT " [%lu-%lu]%s", this->start, - this->end, this->next ? 
"," : ""); - this = this->next; - } - - printk("\n"); - cur_chain = cur_chain->next; - i++; - } - - printk(KERN_DEBUG "Saved states:\n"); - for (i = 0; i < 4; i++) { - printk(KERN_DEBUG "Slot %d: Chain %d.\n", - i, toi_writer_posn.saved_chain_number[i]); - - cur_chain = prio_chain_head; - j = 0; - while (cur_chain) { - printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n", - j, cur_chain->saved_state[i].extent_num, - cur_chain->saved_state[i].offset); - cur_chain = cur_chain->next; - j++; - } - printk(KERN_CONT "\n"); - } -} - -/** - * - **/ -static void toi_extent_chain_next(void) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain; - - if (!this->blocks.current_extent) - return; - - if (this->blocks.current_offset == this->blocks.current_extent->end) { - if (this->blocks.current_extent->next) { - this->blocks.current_extent = - this->blocks.current_extent->next; - this->blocks.current_offset = - this->blocks.current_extent->start; - } else { - this->blocks.current_extent = NULL; - this->blocks.current_offset = 0; - } - } else - this->blocks.current_offset++; -} - -/** - * - */ - -static struct toi_bdev_info *__find_next_chain_same_prio(void) -{ - struct toi_bdev_info *start_chain = toi_writer_posn.current_chain; - struct toi_bdev_info *this = start_chain; - int orig_prio = this->prio; - - do { - this = this->next; - - if (!this) - this = prio_chain_head; - - /* Back on original chain? Use it again. */ - if (this == start_chain) - return start_chain; - - } while (!this->blocks.current_extent || this->prio != orig_prio); - - return this; -} - -static void find_next_chain(void) -{ - struct toi_bdev_info *this; - - this = __find_next_chain_same_prio(); - - /* - * If we didn't get another chain of the same priority that we - * can use, look for the next priority. - */ - while (this && !this->blocks.current_extent) - this = this->next; - - toi_writer_posn.current_chain = this; -} - -/** - * toi_extent_state_next - go to the next extent - * @blocks: The number of values to progress. - * @stripe_mode: Whether to spread usage across all chains. - * - * Given a state, progress to the next valid entry. We may begin in an - * invalid state, as we do when invoked after extent_state_goto_start below. - * - * When using compression and expected_compression > 0, we let the image size - * be larger than storage, so we can validly run out of data to return. - **/ -static unsigned long toi_extent_state_next(int blocks, int current_stream) -{ - int i; - - if (!toi_writer_posn.current_chain) - return -ENOSPC; - - /* Assume chains always have lengths that are multiples of @blocks */ - for (i = 0; i < blocks; i++) - toi_extent_chain_next(); - - /* The header stream is not striped */ - if (current_stream || - !toi_writer_posn.current_chain->blocks.current_extent) - find_next_chain(); - - return toi_writer_posn.current_chain ? 
0 : -ENOSPC; -} - -static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this) -{ - struct toi_bdev_info **prev_ptr; - struct toi_bdev_info *cur; - - /* Loop through the existing chain, finding where to insert it */ - prev_ptr = &prio_chain_head; - cur = prio_chain_head; - - while (cur && cur->prio >= this->prio) { - prev_ptr = &cur->next; - cur = cur->next; - } - - this->next = *prev_ptr; - *prev_ptr = this; - - this = prio_chain_head; - while (this) - this = this->next; - num_chains++; -} - -/** - * toi_extent_state_goto_start - reinitialize an extent chain iterator - * @state: Iterator to reinitialize - **/ -void toi_extent_state_goto_start(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current extent to %p.", this->blocks.first); - this->blocks.current_extent = this->blocks.first; - if (this->blocks.current_extent) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Setting current offset to %lu.", - this->blocks.current_extent->start); - this->blocks.current_offset = - this->blocks.current_extent->start; - } - - this = this->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.", - prio_chain_head); - toi_writer_posn.current_chain = prio_chain_head; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start."); -} - -/** - * toi_extent_state_save - save state of the iterator - * @state: Current state of the chain - * @saved_state: Iterator to populate - * - * Given a state and a struct hibernate_extent_state_store, save the current - * position in a format that can be used with relocated chains (at - * resume time). - **/ -void toi_extent_state_save(int slot) -{ - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent *extent; - struct hibernate_extent_saved_state *chain_state; - int i = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.", - slot); - - if (!toi_writer_posn.current_chain) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => " - "chain_num = -1."); - toi_writer_posn.saved_chain_number[slot] = -1; - return; - } - - while (cur_chain) { - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - chain_state->offset = cur_chain->blocks.current_offset; - - if (toi_writer_posn.current_chain == cur_chain) { - toi_writer_posn.saved_chain_number[slot] = i; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain " - "we were on => chain_num is %d.", i); - } - - if (!cur_chain->blocks.current_extent) { - chain_state->extent_num = 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent " - "for this chain => extent_num %d is 0.", - i); - cur_chain = cur_chain->next; - continue; - } - - extent = cur_chain->blocks.first; - chain_state->extent_num = 1; - - while (extent != cur_chain->blocks.current_extent) { - chain_state->extent_num++; - extent = extent->next; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i, - chain_state->extent_num); - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Completed saving extent state slot %d.", slot); -} - -/** - * toi_extent_state_restore - restore the position saved by extent_state_save - * @state: State to populate - * @saved_state: Iterator saved to restore - **/ -void toi_extent_state_restore(int slot) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - struct hibernate_extent_saved_state 
*chain_state; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "toi_extent_state_restore - slot %d.", slot); - - if (toi_writer_posn.saved_chain_number[slot] == -1) { - toi_writer_posn.current_chain = NULL; - return; - } - - while (cur_chain) { - int posn; - int j; - i++; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) " - "state, slot %d.", i, cur_chain, slot); - - chain_state = &cur_chain->saved_state[slot]; - - posn = chain_state->extent_num; - - cur_chain->blocks.current_extent = cur_chain->blocks.first; - cur_chain->blocks.current_offset = chain_state->offset; - - if (i == toi_writer_posn.saved_chain_number[slot]) { - toi_writer_posn.current_chain = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found current chain."); - } - - for (j = 0; j < 4; j++) - if (i == toi_writer_posn.saved_chain_number[j]) { - toi_writer_posn.saved_chain_ptr[j] = cur_chain; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Found saved chain ptr %d (%p) (offset" - " %d).", j, cur_chain, - cur_chain->saved_state[j].offset); - } - - if (posn) { - while (--posn) - cur_chain->blocks.current_extent = - cur_chain->blocks.current_extent->next; - } else - cur_chain->blocks.current_extent = NULL; - - cur_chain = cur_chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done."); - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); -} - -/* - * Storage needed - * - * Returns amount of space in the image header required - * for the chain data. This ignores the links between - * pages, which we factor in when allocating the space. - */ -int toi_bio_devinfo_storage_needed(void) -{ - int result = sizeof(num_chains); - struct toi_bdev_info *chain = prio_chain_head; - - while (chain) { - result += metadata_size; - - /* Chain size */ - result += sizeof(int); - - /* Extents */ - result += (2 * sizeof(unsigned long) * - chain->blocks.num_extents); - - chain = chain->next; - } - - result += 4 * sizeof(int); - return result; -} - -static unsigned long chain_pages_used(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this = chain->blocks.first; - struct hibernate_extent_saved_state *state = &chain->saved_state[3]; - unsigned long size = 0; - int extent_idx = 1; - - if (!state->extent_num) { - if (!this) - return 0; - else - return chain->blocks.size; - } - - while (extent_idx < state->extent_num) { - size += (this->end - this->start + 1); - this = this->next; - extent_idx++; - } - - /* We didn't use the one we're sitting on, so don't count it */ - return size + state->offset - this->start; -} - -void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain) -{ - unsigned long used = chain_pages_used(chain); - - /* Free the storage */ - unsigned long first_freed = 0; - - if (chain->allocator->bio_allocator_ops->free_unused_storage) - first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used); - - printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed); - - /* Adjust / free the extents. */ - toi_put_extent_chain_from(&chain->blocks, first_freed); - - { - struct hibernate_extent *this = chain->blocks.first; - while (this) { - printk("Extent %lx-%lx.\n", this->start, this->end); - this = this->next; - } - } -} - -/** - * toi_serialise_extent_chain - write a chain in the image - * @chain: Chain to write. 
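Taking the writer below together with its reader, toi_load_extent_chain(), each chain's on-disk record works out to the following layout (an editorial summary inferred from the code, not a declaration from the source):

        [uuid .. end of struct toi_bdev_info]    metadata_size bytes
        [num_extents]                            sizeof(int) bytes
        [start][end] ... [start][end]            2 * sizeof(unsigned long) per extent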
- **/ -static int toi_serialise_extent_chain(struct toi_bdev_info *chain) -{ - struct hibernate_extent *this; - int ret; - int i = 1; - - chain->pages_used = chain_pages_used(chain); - - if (test_action_state(TOI_LOGALL)) - dump_block_chains(); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).", - chain->dev_t); - /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->uuid, metadata_size); - if (ret) - return ret; - - /* Num extents */ - ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) - return ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - this = chain->blocks.first; - while (this) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i); - ret = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) this, 2 * sizeof(this->start)); - if (ret) - return ret; - this = this->next; - i++; - } - - return ret; -} - -int toi_serialise_extent_chains(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - /* Write the number of chains */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)", - num_chains); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, (char *) &num_chains, - sizeof(int)); - if (result) - return result; - - /* Then the chains themselves */ - while (this) { - result = toi_serialise_extent_chain(this); - if (result) - return result; - this = this->next; - } - - /* - * Finally, the chain we should be on at the start of each - * section. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers."); - result = toiActiveAllocator->rw_header_chunk(WRITE, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -int toi_register_storage_chain(struct toi_bdev_info *new) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.", - new); - toi_insert_chain_in_prio_list(new); - return 0; -} - -static void free_bdev_info(struct toi_bdev_info *chain) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents."); - toi_put_extent_chain(&chain->blocks); - - /* - * The allocator may need to do more than just free the chains - * (swap_free, for example). Don't call from boot kernel. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents."); - if (chain->allocator) - chain->allocator->bio_allocator_ops->free_storage(chain); - - /* - * Dropping out of reading atomic copy? Need to undo - * toi_open_by_devnum. 
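The conditional close that follows reads more easily as an ownership predicate. A hedged restatement (the helper name is invented; the real test below additionally requires test_toi_state(TOI_TRYING_TO_RESUME), since only the resume path opened these devices):

static bool sketch_chain_owns_bdev(struct toi_bdev_info *chain)
{
        /* resume_block_device and header_block_device are shared,
         * borrowed references; anything else was opened by
         * toi_open_bdev() for this chain alone and must be closed. */
        return chain->bdev && !IS_ERR(chain->bdev) &&
               chain->bdev != resume_block_device &&
               chain->bdev != header_block_device;
}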
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev."); - if (chain->bdev && !IS_ERR(chain->bdev) && - chain->bdev != resume_block_device && - chain->bdev != header_block_device && - test_toi_state(TOI_TRYING_TO_RESUME)) - toi_close_bdev(chain->bdev); - - /* Poison */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct."); - toi_kfree(39, chain, sizeof(*chain)); - - if (prio_chain_head == chain) - prio_chain_head = NULL; - - num_chains--; -} - -void free_all_bdev_info(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - struct toi_bdev_info *next = this->next; - free_bdev_info(this); - this = next; - } - - memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn)); - prio_chain_head = NULL; -} - -static void set_up_start_position(void) -{ - toi_writer_posn.current_chain = prio_chain_head; - go_next_page(0, 0); -} - -/** - * toi_load_extent_chain - read back a chain saved in the image - * @chain: Chain to load - * - * The linked list of extents is reconstructed from the disk. chain will point - * to the first entry. - **/ -int toi_load_extent_chain(int index, int *num_loaded) -{ - struct toi_bdev_info *chain = toi_kzalloc(39, - sizeof(struct toi_bdev_info), GFP_ATOMIC); - struct hibernate_extent *this, *last = NULL; - int i, ret; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index); - /* Get dev_t, prio, bmap_shift, blocks per page, positions */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->uuid, metadata_size); - - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_bkd.pages_used[index] = chain->pages_used; - - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &chain->blocks.num_extents, sizeof(int)); - if (ret) { - printk(KERN_ERR "Failed to read the size of extent chain.\n"); - toi_kfree(39, chain, sizeof(*chain)); - return 1; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", - chain->blocks.num_extents); - - for (i = 0; i < chain->blocks.num_extents; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1); - - this = toi_kzalloc(2, sizeof(struct hibernate_extent), - TOI_ATOMIC_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate a new extent.\n"); - free_bdev_info(chain); - return -ENOMEM; - } - this->next = NULL; - /* Get the next page */ - ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - NULL, (char *) this, 2 * sizeof(this->start)); - if (ret) { - printk(KERN_INFO "Failed to read an extent.\n"); - toi_kfree(2, this, sizeof(struct hibernate_extent)); - free_bdev_info(chain); - return 1; - } - - if (last) - last->next = this; - else { - char b1[32], b2[32], b3[32]; - /* - * Open the bdev - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Chain dev_t is %s. Resume dev t is %s. 
Header" - " bdev_t is %s.\n", - format_dev_t(b1, chain->dev_t), - format_dev_t(b2, resume_dev_t), - format_dev_t(b3, toi_sig_data->header_dev_t)); - - if (chain->dev_t == resume_dev_t) - chain->bdev = resume_block_device; - else if (chain->dev_t == toi_sig_data->header_dev_t) - chain->bdev = header_block_device; - else { - chain->bdev = toi_open_bdev(chain->uuid, - chain->dev_t, 1); - if (IS_ERR(chain->bdev)) { - free_bdev_info(chain); - return -ENODEV; - } - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift " - "is %d and blocks per page is %d.", - chain->bmap_shift, - chain->blocks_per_page); - - chain->blocks.first = this; - - /* - * Couldn't do this earlier, but can't do - * goto_start now - we may have already used blocks - * in the first chain. - */ - chain->blocks.current_extent = this; - chain->blocks.current_offset = this->start; - - /* - * Can't wait until we've read the whole chain - * before we insert it in the list. We might need - * this chain to read the next page in the header - */ - toi_insert_chain_in_prio_list(chain); - } - - /* - * We have to wait until 2 extents are loaded before setting up - * properly because if the first extent has only one page, we - * will need to put the position on the second extent. Sounds - * obvious, but it wasn't! - */ - (*num_loaded)++; - if ((*num_loaded) == 2) - set_up_start_position(); - last = this; - } - - /* - * Shouldn't get empty chains, but it's not impossible. Link them in so - * they get freed properly later. - */ - if (!chain->blocks.num_extents) - toi_insert_chain_in_prio_list(chain); - - if (!chain->blocks.current_extent) { - chain->blocks.current_extent = chain->blocks.first; - if (chain->blocks.current_extent) - chain->blocks.current_offset = - chain->blocks.current_extent->start; - } - return 0; -} - -int toi_load_extent_chains(void) -{ - int result; - int to_load; - int i; - int extents_loaded = 0; - - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, - (char *) &to_load, - sizeof(int)); - if (result) - return result; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load); - - for (i = 0; i < to_load; i++) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.", - i, to_load); - result = toi_load_extent_chain(i, &extents_loaded); - if (result) - return result; - } - - /* If we never got to a second extent, we still need to do this. */ - if (extents_loaded == 1) - set_up_start_position(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers."); - result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, - &toi_blockwriter_ops, - (char *) &toi_writer_posn.saved_chain_number[0], - 4 * sizeof(int)); - - return result; -} - -static int toi_end_of_stream(int writing, int section_barrier) -{ - struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; - int compare_to = next_section[current_stream]; - struct toi_bdev_info *compare_chain = - toi_writer_posn.saved_chain_ptr[compare_to]; - int compare_offset = compare_chain ? 
-                compare_chain->saved_state[compare_to].offset : 0;
-
-        if (!section_barrier)
-                return 0;
-
-        if (!cur_chain)
-                return 1;
-
-        if (cur_chain == compare_chain &&
-            cur_chain->blocks.current_offset == compare_offset) {
-                if (writing) {
-                        if (!current_stream) {
-                                debug_broken_header();
-                                return 1;
-                        }
-                } else {
-                        more_readahead = 0;
-                        toi_message(TOI_BIO, TOI_VERBOSE, 0,
-                                        "Reached the end of stream %d "
-                                        "(not an error).", current_stream);
-                        return 1;
-                }
-        }
-
-        return 0;
-}
-
-/**
- * go_next_page - skip blocks to the start of the next page
- * @writing: Whether we're reading or writing the image.
- *
- * Go forward one page.
- **/
-int go_next_page(int writing, int section_barrier)
-{
-        struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
-        int max = cur_chain ? cur_chain->blocks_per_page : 1;
-
-        /* Nope. Go forward a page - or maybe two. Don't stripe the header,
-         * so that bad fragmentation doesn't put the extent data containing
-         * the location of the second page out of the first header page.
-         */
-        if (toi_extent_state_next(max, current_stream)) {
-                /* Don't complain if readahead falls off the end */
-                if (writing && section_barrier) {
-                        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. "
-                                "Expected compression ratio too optimistic?");
-                        if (test_action_state(TOI_LOGALL))
-                                dump_block_chains();
-                }
-                toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to "
-                        "read/write. (Not necessarily a fatal error.)");
-                return -ENOSPC;
-        }
-
-        return 0;
-}
-
-int devices_of_same_priority(struct toi_bdev_info *this)
-{
-        struct toi_bdev_info *check = prio_chain_head;
-        int i = 0;
-
-        while (check) {
-                if (check->prio == this->prio)
-                        i++;
-                check = check->next;
-        }
-
-        return i;
-}
-
-/**
- * toi_bio_rw_page - do i/o on the next disk page in the image
- * @writing: Whether reading or writing.
- * @page: Page to do i/o on.
- * @is_readahead: Whether we're doing readahead
- * @free_group: The group used in allocating the page
- *
- * Submit a page for reading or writing, possibly readahead.
- * Pass the group used in allocating the page as well, as it should
- * be freed on completion of the bio if we're writing the page.
- **/
-int toi_bio_rw_page(int writing, struct page *page,
-                int is_readahead, int free_group)
-{
-        int result = toi_end_of_stream(writing, 1);
-        struct toi_bdev_info *dev_info = toi_writer_posn.current_chain;
-
-        if (result) {
-                if (writing)
-                        abort_hibernate(TOI_INSUFFICIENT_STORAGE,
-                                        "Insufficient storage for your image.");
-                else
-                        toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to "
-                                        "read/write another page when stream has "
-                                        "ended.");
-                return -ENOSPC;
-        }
-
-        toi_message(TOI_BIO, TOI_VERBOSE, 0,
-                        "%s %lx:%ld",
-                        writing ?
"Write" : "Read", - dev_info->dev_t, dev_info->blocks.current_offset); - - result = toi_do_io(writing, dev_info->bdev, - dev_info->blocks.current_offset << dev_info->bmap_shift, - page, is_readahead, 0, free_group); - - /* Ignore the result here - will check end of stream if come in again */ - go_next_page(writing, 1); - - if (result) - printk(KERN_ERR "toi_do_io returned %d.\n", result); - return result; -} - -dev_t get_header_dev_t(void) -{ - return prio_chain_head->dev_t; -} - -struct block_device *get_header_bdev(void) -{ - return prio_chain_head->bdev; -} - -unsigned long get_headerblock(void) -{ - return prio_chain_head->blocks.first->start << - prio_chain_head->bmap_shift; -} - -int get_main_pool_phys_params(void) -{ - struct toi_bdev_info *this = prio_chain_head; - int result; - - while (this) { - result = this->allocator->bio_allocator_ops->bmap(this); - if (result) - return result; - this = this->next; - } - - return 0; -} - -static int apply_header_reservation(void) -{ - int i; - - if (!header_pages_reserved) { - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "No header pages reserved at the moment."); - return 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation."); - - /* Apply header space reservation */ - toi_extent_state_goto_start(); - - for (i = 0; i < header_pages_reserved; i++) - if (go_next_page(1, 0)) - return -ENOSPC; - - /* The end of header pages will be the start of pageset 2 */ - toi_extent_state_save(2); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Finished applying header reservation."); - return 0; -} - -static int toi_bio_register_storage(void) -{ - int result = 0; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Registering storage from %s.", - this_module->name); - result = this_module->bio_allocator_ops->register_storage(); - if (result) - break; - } - - return result; -} - -void toi_bio_free_unused_storage(void) -{ - struct toi_bdev_info *this = prio_chain_head; - - while (this) { - toi_bio_free_unused_storage_chain(this); - this = this->next; - } -} - -int toi_bio_allocate_storage(unsigned long request) -{ - struct toi_bdev_info *chain = prio_chain_head; - unsigned long to_get = request; - unsigned long extra_pages, needed; - int no_free = 0; - - if (!chain) { - int result = toi_bio_register_storage(); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Registering storage."); - if (result) - return 0; - chain = prio_chain_head; - if (!chain) { - printk("TuxOnIce: No storage was registered.\n"); - return 0; - } - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " - "Request is %lu pages.", request); - extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long) - + sizeof(int)), PAGE_SIZE); - needed = request + extra_pages + header_pages_reserved; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu " - "for header => %lu.", - extra_pages, header_pages_reserved, needed); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.", - raw_pages_allocd); - - to_get = needed > raw_pages_allocd ? 
needed - raw_pages_allocd : 0; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get); - - if (!to_get) - return apply_header_reservation(); - - while (to_get && chain) { - int num_group = devices_of_same_priority(chain); - int divisor = num_group - no_free; - int i; - unsigned long portion = DIV_ROUND_UP(to_get, divisor); - unsigned long got = 0; - unsigned long got_this_round = 0; - struct toi_bdev_info *top = chain; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Start of loop. To get is %lu. Divisor is %d.", - to_get, divisor); - no_free = 0; - - /* - * We're aiming to spread the allocated storage as evenly - * as possible, but we also want to get all the storage we - * can off this priority. - */ - for (i = 0; i < num_group; i++) { - struct toi_bio_allocator_ops *ops = - chain->allocator->bio_allocator_ops; - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Asking for %lu pages from chain %p.", - portion, chain); - got = ops->allocate_storage(chain, portion); - toi_message(TOI_BIO, TOI_VERBOSE, 0, - " Got %lu pages from allocator %p.", - got, chain); - if (!got) - no_free++; - got_this_round += got; - chain = chain->next; - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a " - "total of %lu pages from %d allocators.", - got_this_round, divisor - no_free); - - raw_pages_allocd += got_this_round; - to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : - 0; - - /* - * If we got anything from chains of this priority and we - * still have storage to allocate, go over this priority - * again. - */ - if (got_this_round && to_get) - chain = top; - else - no_free = 0; - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling " - "get_main_pool_phys_params"); - /* Now let swap allocator bmap the pages */ - get_main_pool_phys_params(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. 
Reserving header."); - return apply_header_reservation(); -} - -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd) -{ - int i = 0; - struct toi_bdev_info *cur_chain = prio_chain_head; - - while (cur_chain) { - cur_chain->pages_used = bkd->pages_used[i]; - cur_chain = cur_chain->next; - i++; - } -} - -int toi_bio_chains_debug_info(char *buffer, int size) -{ - /* Show what we actually used */ - struct toi_bdev_info *cur_chain = prio_chain_head; - int len = 0; - - while (cur_chain) { - len += scnprintf(buffer + len, size - len, " Used %lu pages " - "from %s.\n", cur_chain->pages_used, - cur_chain->name); - cur_chain = cur_chain->next; - } - - return len; -} - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - struct toi_bdev_info *this = toi_writer_posn.current_chain, - *cmp = prio_chain_head; - - ptr->save.chain = 1; - while (this != cmp) { - ptr->save.chain++; - cmp = cmp->next; - } - ptr->save.block = this->blocks.current_offset; - - /* Save the raw info internally for quicker access when updating pointers */ - ptr->bdev = this->bdev; - ptr->block = this->blocks.current_offset << this->bmap_shift; -} - -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr) -{ - int i = ptr->save.chain - 1; - struct toi_bdev_info *this; - struct hibernate_extent *hib; - - /* Find chain by stored index */ - this = prio_chain_head; - while (i) { - this = this->next; - i--; - } - toi_writer_posn.current_chain = this; - - /* Restore block */ - this->blocks.current_offset = ptr->save.block; - - /* Find current offset from block number */ - hib = this->blocks.first; - - while (hib->start > ptr->save.block) { - hib = hib->next; - } - - this->blocks.last_touched = this->blocks.current_extent = hib; -} diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c deleted file mode 100644 index 87aa4c96e..000000000 --- a/kernel/power/tuxonice_bio_core.c +++ /dev/null @@ -1,1932 +0,0 @@ -/* - * kernel/power/tuxonice_bio.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains block io functions for TuxOnIce. These are - * used by the swapwriter and it is planned that they will also - * be used by the NFSwriter. 
- * - */ - -#include <linux/blkdev.h> -#include <linux/syscalls.h> -#include <linux/suspend.h> -#include <linux/ctype.h> -#include <linux/mount.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -#define MEMORY_ONLY 1 -#define THROTTLE_WAIT 2 - -/* #define MEASURE_MUTEX_CONTENTION */ -#ifndef MEASURE_MUTEX_CONTENTION -#define my_mutex_lock(index, the_lock) mutex_lock(the_lock) -#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock) -#else -unsigned long mutex_times[2][2][NR_CPUS]; -#define my_mutex_lock(index, the_lock) do { \ - int have_mutex; \ - have_mutex = mutex_trylock(the_lock); \ - if (!have_mutex) { \ - mutex_lock(the_lock); \ - mutex_times[index][0][smp_processor_id()]++; \ - } else { \ - mutex_times[index][1][smp_processor_id()]++; \ - } - -#define my_mutex_unlock(index, the_lock) \ - mutex_unlock(the_lock); \ -} while (0) -#endif - -static int page_idx, reset_idx; - -static int target_outstanding_io = 1024; -static int max_outstanding_writes, max_outstanding_reads; - -static struct page *bio_queue_head, *bio_queue_tail; -static atomic_t toi_bio_queue_size; -static DEFINE_SPINLOCK(bio_queue_lock); - -static int free_mem_throttle, throughput_throttle; -int more_readahead = 1; -static struct page *readahead_list_head, *readahead_list_tail; - -static struct page *waiting_on; - -static atomic_t toi_io_in_progress, toi_io_done; -static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait); - -int current_stream; -/* Not static, so that the allocators can setup and complete - * writing the header */ -char *toi_writer_buffer; -int toi_writer_buffer_posn; - -static DEFINE_MUTEX(toi_bio_mutex); -static DEFINE_MUTEX(toi_bio_readahead_mutex); - -static struct task_struct *toi_queue_flusher; -static int toi_bio_queue_flush_pages(int dedicated_thread); - -struct toi_module_ops toi_blockwriter_ops; - -struct toi_incremental_image_pointer toi_inc_ptr[2][2]; - -#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \ - atomic_read(&toi_bio_queue_size)) - -unsigned long raw_pages_allocd, header_pages_reserved; - -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead); - -/** - * set_free_mem_throttle - set the point where we pause to avoid oom. - * - * Initially, this value is zero, but when we first fail to allocate memory, - * we set it (plus a buffer) and thereafter throttle i/o once that limit is - * reached. - **/ -static void set_free_mem_throttle(void) -{ - int new_throttle = nr_free_buffer_pages() + 256; - - if (new_throttle > free_mem_throttle) - free_mem_throttle = new_throttle; -} - -#define NUM_REASONS 7 -static atomic_t reasons[NUM_REASONS]; -static char *reason_name[NUM_REASONS] = { - "readahead not ready", - "bio allocation", - "synchronous I/O", - "toi_bio_get_new_page", - "memory low", - "readahead buffer allocation", - "throughput_throttle", -}; - -/* User Specified Parameters. */ -unsigned long resume_firstblock; -dev_t resume_dev_t; -struct block_device *resume_block_device; -static atomic_t resume_bdev_open_count; - -struct block_device *header_block_device; - -/** - * toi_open_bdev: Open a bdev at resume time. - * - * index: The swap index. 
May be MAX_SWAPFILES for the resume_dev_t - * (the user can have resume= pointing at a swap partition/file that isn't - * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the - * header. It will be from a swap partition that was enabled when we hibernated, - * but we don't know it's real index until we read that first page. - * dev_t: The device major/minor. - * display_errs: Whether to try to do this quietly. - * - * We stored a dev_t in the image header. Open the matching device without - * requiring /dev/<whatever> in most cases and record the details needed - * to close it later and avoid duplicating work. - */ -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs) -{ - struct block_device *bdev; - dev_t device = default_device; - char buf[32]; - int retried = 0; - -retry: - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = 0; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (!device) { - device = default_device; - printk(KERN_DEBUG "Unable to resolve uuid. Falling back" - " to dev_t.\n"); - } else - printk(KERN_DEBUG "Resolved uuid to device %s.\n", - format_dev_t(buf, device)); - } - - if (!device) { - printk(KERN_ERR "TuxOnIce attempting to open a " - "blank dev_t!\n"); - dump_stack(); - return NULL; - } - bdev = toi_open_by_devnum(device); - - if (IS_ERR(bdev) || !bdev) { - if (!retried) { - retried = 1; - wait_for_device_probe(); - goto retry; - } - if (display_errs) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to get access to block device " - "\"%x\" (error %d).\n Maybe you need " - "to run mknod and/or lvmsetup in an " - "initrd/ramfs?", device, bdev); - return ERR_PTR(-EINVAL); - } - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "TuxOnIce got bdev %p for dev_t %x.", - bdev, device); - - return bdev; -} - -static void toi_bio_reserve_header_space(unsigned long request) -{ - header_pages_reserved = request; -} - -/** - * do_bio_wait - wait for some TuxOnIce I/O to complete - * @reason: The array index of the reason we're waiting. - * - * Wait for a particular page of I/O if we're after a particular page. - * If we're not after a particular page, wait instead for all in flight - * I/O to be completed or for us to have enough free memory to be able - * to submit more I/O. - * - * If we wait, we also update our statistics regarding why we waited. - **/ -static void do_bio_wait(int reason) -{ - struct page *was_waiting_on = waiting_on; - - /* On SMP, waiting_on can be reset, so we make a copy */ - if (was_waiting_on) { - wait_on_page_locked(was_waiting_on); - atomic_inc(&reasons[reason]); - } else { - atomic_inc(&reasons[reason]); - - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - nr_free_buffer_pages() > free_mem_throttle); - } -} - -/** - * throttle_if_needed - wait for I/O completion if throttle points are reached - * @flags: What to check and how to act. - * - * Check whether we need to wait for some I/O to complete. We always check - * whether we have enough memory available, but may also (depending upon - * @reason) check if the throughput throttle limit has been reached. - **/ -static int throttle_if_needed(int flags) -{ - int free_pages = nr_free_buffer_pages(); - - /* Getting low on memory and I/O is in progress? 
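A little further down, update_throughput_throttle() packs its reasoning into one expression. The derivation, assuming as its comment states that it is called four times per second (so jif_index counts quarter-second ticks):

        I/O per second       = done / (jif_index / 4) = 4 * done / jif_index
        a tenth of a second  = (4 * done / jif_index) / 10
                             = done * 2 / 5 / jif_index

which is exactly the expression in the function.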
*/ - while (unlikely(free_pages < free_mem_throttle) && - atomic_read(&toi_io_in_progress) && - !test_result_state(TOI_ABORTED)) { - if (!(flags & THROTTLE_WAIT)) - return -ENOMEM; - do_bio_wait(4); - free_pages = nr_free_buffer_pages(); - } - - while (!(flags & MEMORY_ONLY) && throughput_throttle && - TOTAL_OUTSTANDING_IO >= throughput_throttle && - !test_result_state(TOI_ABORTED)) { - int result = toi_bio_queue_flush_pages(0); - if (result) - return result; - atomic_inc(&reasons[6]); - wait_event(num_in_progress_wait, - !atomic_read(&toi_io_in_progress) || - TOTAL_OUTSTANDING_IO < throughput_throttle); - } - - return 0; -} - -/** - * update_throughput_throttle - update the raw throughput throttle - * @jif_index: The number of times this function has been called. - * - * This function is called four times per second by the core, and used to limit - * the amount of I/O we submit at once, spreading out our waiting through the - * whole job and letting userui get an opportunity to do its work. - * - * We don't start limiting I/O until 1/4s has gone so that we get a - * decent sample for our initial limit, and keep updating it because - * throughput may vary (on rotating media, eg) with our block number. - * - * We throttle to 1/10s worth of I/O. - **/ -static void update_throughput_throttle(int jif_index) -{ - int done = atomic_read(&toi_io_done); - throughput_throttle = done * 2 / 5 / jif_index; -} - -/** - * toi_finish_all_io - wait for all outstanding i/o to complete - * - * Flush any queued but unsubmitted I/O and wait for it all to complete. - **/ -static int toi_finish_all_io(void) -{ - int result = toi_bio_queue_flush_pages(0); - toi_bio_queue_flusher_should_finish = 1; - wake_up(&toi_io_queue_flusher); - wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO); - return result; -} - -/** - * toi_end_bio - bio completion function. - * @bio: bio that has completed. - * - * Function called by the block driver from interrupt context when I/O is - * completed. If we were writing the page, we want to free it and will have - * set bio->bi_private to the parameter we should use in telling the page - * allocation accounting code what the page was allocated for. If we're - * reading the page, it will be in the singly linked list made from - * page->private pointers. - **/ -static void toi_end_bio(struct bio *bio) -{ - struct page *page = bio->bi_io_vec[0].bv_page; - - BUG_ON(bio->bi_error); - - unlock_page(page); - bio_put(bio); - - if (waiting_on == page) - waiting_on = NULL; - - put_page(page); - - if (bio->bi_private) - toi__free_page((int) ((unsigned long) bio->bi_private) , page); - - bio_put(bio); - - atomic_dec(&toi_io_in_progress); - atomic_inc(&toi_io_done); - - wake_up(&num_in_progress_wait); -} - -/** - * submit - submit BIO request - * @writing: READ or WRITE. - * @dev: The block device we're using. - * @first_block: The first sector we're using. - * @page: The page being used for I/O. - * @free_group: If writing, the group that was used in allocating the page - * and which will be used in freeing the page from the completion - * routine. - * - * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the - * textbook - allocate and initialize the bio. If we're writing, make sure - * the page is marked as dirty. Then submit it and carry on." - * - * If we're just testing the speed of our own code, we fake having done all - * the hard work and all toi_end_bio immediately. 
- **/ -static int submit(int writing, struct block_device *dev, sector_t first_block, - struct page *page, int free_group) -{ - struct bio *bio = NULL; - int cur_outstanding_io, result; - - /* - * Shouldn't throttle if reading - can deadlock in the single - * threaded case as pages are only freed when we use the - * readahead. - */ - if (writing) { - result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT); - if (result) - return result; - } - - while (!bio) { - bio = bio_alloc(TOI_ATOMIC_GFP, 1); - if (!bio) { - set_free_mem_throttle(); - do_bio_wait(1); - } - } - - bio->bi_bdev = dev; - bio->bi_iter.bi_sector = first_block; - bio->bi_private = (void *) ((unsigned long) free_group); - bio->bi_end_io = toi_end_bio; - bio_set_flag(bio, BIO_TOI); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n", - (unsigned long long) first_block); - bio_put(bio); - return -EFAULT; - } - - bio_get(bio); - - cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress); - if (writing) { - if (cur_outstanding_io > max_outstanding_writes) - max_outstanding_writes = cur_outstanding_io; - } else { - if (cur_outstanding_io > max_outstanding_reads) - max_outstanding_reads = cur_outstanding_io; - } - - /* Still read the header! */ - if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) { - /* Fake having done the hard work */ - bio->bi_error = 0; - toi_end_bio(bio); - } else - submit_bio(writing | REQ_SYNC, bio); - - return 0; -} - -/** - * toi_do_io: Prepare to do some i/o on a page and submit or batch it. - * - * @writing: Whether reading or writing. - * @bdev: The block device which we're using. - * @block0: The first sector we're reading or writing. - * @page: The page on which I/O is being done. - * @readahead_index: If doing readahead, the index (reset this flag when done). - * @syncio: Whether the i/o is being done synchronously. - * - * Prepare and start a read or write operation. - * - * Note that we always work with our own page. If writing, we might be given a - * compression buffer that will immediately be used to start compressing the - * next page. For reading, we do readahead and therefore don't know the final - * address where the data needs to go. - **/ -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group) -{ - page->private = 0; - - /* Do here so we don't race against toi_bio_get_next_page_read */ - lock_page(page); - - if (is_readahead) { - if (readahead_list_head) - readahead_list_tail->private = (unsigned long) page; - else - readahead_list_head = page; - - readahead_list_tail = page; - } - - /* Done before submitting to avoid races. */ - if (syncio) - waiting_on = page; - - /* Submit the page */ - get_page(page); - - if (submit(writing, bdev, block0, page, free_group)) - return -EFAULT; - - if (syncio) - do_bio_wait(2); - - return 0; -} - -/** - * toi_bdev_page_io - simpler interface to do directly i/o on a single page - * @writing: Whether reading or writing. - * @bdev: Block device on which we're operating. - * @pos: Sector at which page to read or write starts. - * @page: Page to be read/written. - * - * A simple interface to submit a page of I/O and wait for its completion. - * The caller must free the page used. 
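A hedged usage sketch for the helper defined below (assuming same-file placement, since toi_bdev_page_io() is static; the caller allocates and frees the page, as the comment requires):

static int sketch_read_one_page(struct block_device *bdev, long pos)
{
        struct page *page = alloc_page(GFP_KERNEL);
        int ret;

        if (!page)
                return -ENOMEM;

        ret = toi_bdev_page_io(READ, bdev, pos, page);
        if (!ret) {
                /* ... consume page_address(page) here ... */
        }

        __free_page(page);
        return ret;
}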
- **/ -static int toi_bdev_page_io(int writing, struct block_device *bdev, - long pos, struct page *page) -{ - return toi_do_io(writing, bdev, pos, page, 0, 1, 0); -} - -/** - * toi_bio_memory_needed - report the amount of memory needed for block i/o - * - * We want to have at least enough memory so as to have target_outstanding_io - * or more transactions on the fly at once. If we can do more, fine. - **/ -static int toi_bio_memory_needed(void) -{ - return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) + - sizeof(struct bio)); -} - -/** - * toi_bio_print_debug_stats - put out debugging info in the buffer provided - * @buffer: A buffer of size @size into which text should be placed. - * @size: The size of @buffer. - * - * Fill a buffer with debugging info. This is used for both our debug_info sysfs - * entry and for recording the same info in dmesg. - **/ -static int toi_bio_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - if (toiActiveAllocator != &toi_blockwriter_ops) { - len = scnprintf(buffer, size, - "- Block I/O inactive.\n"); - return len; - } - - len = scnprintf(buffer, size, "- Block I/O active.\n"); - - len += toi_bio_chains_debug_info(buffer + len, size - len); - - len += scnprintf(buffer + len, size - len, - "- Max outstanding reads %d. Max writes %d.\n", - max_outstanding_reads, max_outstanding_writes); - - len += scnprintf(buffer + len, size - len, - " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n", - target_outstanding_io, - PAGE_SIZE, (unsigned int) sizeof(struct request), - (unsigned int) sizeof(struct bio), toi_bio_memory_needed()); - -#ifdef MEASURE_MUTEX_CONTENTION - { - int i; - - len += scnprintf(buffer + len, size - len, - " Mutex contention while reading:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[0][0][i], mutex_times[0][1][i]); - - len += scnprintf(buffer + len, size - len, - " Mutex contention while writing:\n Contended Free\n"); - - for_each_online_cpu(i) - len += scnprintf(buffer + len, size - len, - " %9lu %9lu\n", - mutex_times[1][0][i], mutex_times[1][1][i]); - - } -#endif - - return len + scnprintf(buffer + len, size - len, - " Free mem throttle point reached %d.\n", free_mem_throttle); -} - -static int total_header_bytes; -static int unowned; - -void debug_broken_header(void) -{ - printk(KERN_DEBUG "Image header too big for size allocated!\n"); - print_toi_header_storage_for_modules(); - printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed()); - printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header)); - printk(KERN_DEBUG "Total unowned : %d.\n", unowned); - printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes, - DIV_ROUND_UP(total_header_bytes, PAGE_SIZE)); - printk(KERN_DEBUG "Space needed now : %ld.\n", - get_header_storage_needed()); - dump_block_chains(); - abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small."); -} - -static int toi_bio_update_previous_inc_img_ptr(int stream) -{ - int result; - char * buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP); - struct page *page; - struct toi_incremental_image_pointer *prev, *this; - - prev = &toi_inc_ptr[stream][0]; - this = &toi_inc_ptr[stream][1]; - - if (!buffer) { - // We're at the start of writing a pageset. Memory should not be that scarce. 
- return -ENOMEM; - } - - page = virt_to_page(buffer); - result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0); - - if (result) - goto out; - - memcpy(buffer, (char *) this, sizeof(this->save)); - - result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12); - - // If the IO is successfully submitted (!result), the page will be freed - // asynchronously on completion. -out: - if (result) - toi__free_page(12, virt_to_page(buffer)); - return result; -} - -/** - * toi_rw_init_incremental - incremental image part of setting up to write new section - */ -static int toi_write_init_incremental(int stream) -{ - int result = 0; - - // Remember the location of this block so we can link to it. - toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Update the pointer at the start of the last pageset with the same stream number. - result = toi_bio_update_previous_inc_img_ptr(stream); - if (result) - return result; - - // Move the current to the previous slot. - memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1])); - - // Store a blank pointer at the start of this incremental pageset - memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1])); - result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - if (result) - return result; - - // Serialise extent chains if this is an incremental pageset - return toi_serialise_extent_chains(); -} - -/** - * toi_read_init_incremental - incremental image part of setting up to read new section - */ -static int toi_read_init_incremental(int stream) -{ - int result; - - // Set our position to the start of the next pageset - toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]); - - // Read the start of the next incremental pageset (if any) - result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); - - if (!result) - result = toi_load_extent_chains(); - - return result; -} - -/** - * toi_rw_init - prepare to read or write a stream in the image - * @writing: Whether reading or writing. - * @stream number: Section of the image being processed. - * - * Prepare to read or write a section ('stream') in the image. - **/ -static int toi_rw_init(int writing, int stream_number) -{ - if (stream_number) - toi_extent_state_restore(stream_number); - else - toi_extent_state_goto_start(); - - if (writing) { - reset_idx = 0; - if (!current_stream) - page_idx = 0; - } else { - reset_idx = 1; - } - - atomic_set(&toi_io_done, 0); - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE; - - current_stream = stream_number; - - more_readahead = 1; - - if (test_result_state(TOI_KEPT_IMAGE)) { - int result; - - if (writing) { - result = toi_write_init_incremental(stream_number); - } else { - result = toi_read_init_incremental(stream_number); - } - - if (result) - return result; - } - - return toi_writer_buffer ? 0 : -ENOMEM; -} - -/** - * toi_bio_queue_write - queue a page for writing - * @full_buffer: Pointer to a page to be queued - * - * Add a page to the queue to be submitted. If we're the queue flusher, - * we'll do this once we've dropped toi_bio_mutex, so other threads can - * continue to submit I/O while we're on the slow path doing the actual - * submission. 
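The queue that toi_bio_queue_write() below feeds is a singly linked list threaded through page->private and guarded by bio_queue_lock, a common way to queue pages without allocating list nodes. The linking step in isolation (a sketch; the real code operates on the file-scope bio_queue_head/bio_queue_tail under the spinlock):

static void sketch_enqueue_page(struct page **head, struct page **tail,
                                struct page *page)
{
        page->private = 0;                      /* new list terminator */
        if (*head)
                (*tail)->private = (unsigned long) page;
        else
                *head = page;
        *tail = page;
}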
- **/ -static void toi_bio_queue_write(char **full_buffer) -{ - struct page *page = virt_to_page(*full_buffer); - unsigned long flags; - - *full_buffer = NULL; - page->private = 0; - - spin_lock_irqsave(&bio_queue_lock, flags); - if (!bio_queue_head) - bio_queue_head = page; - else - bio_queue_tail->private = (unsigned long) page; - - bio_queue_tail = page; - atomic_inc(&toi_bio_queue_size); - - spin_unlock_irqrestore(&bio_queue_lock, flags); - wake_up(&toi_io_queue_flusher); -} - -/** - * toi_rw_cleanup - Cleanup after i/o. - * @writing: Whether we were reading or writing. - * - * Flush all I/O and clean everything up after reading or writing a - * section of the image. - **/ -static int toi_rw_cleanup(int writing) -{ - int i, result = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup."); - if (writing) { - if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED)) - toi_bio_queue_write(&toi_writer_buffer); - - while (bio_queue_head && !result) - result = toi_bio_queue_flush_pages(0); - - if (result) - return result; - - if (current_stream == 2) - toi_extent_state_save(1); - else if (current_stream == 1) - toi_extent_state_save(3); - } - - result = toi_finish_all_io(); - - while (readahead_list_head) { - void *next = (void *) readahead_list_head->private; - toi__free_page(12, readahead_list_head); - readahead_list_head = next; - } - - readahead_list_tail = NULL; - - if (!current_stream) - return result; - - for (i = 0; i < NUM_REASONS; i++) { - if (!atomic_read(&reasons[i])) - continue; - printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n", - reason_name[i], atomic_read(&reasons[i])); - atomic_set(&reasons[i], 0); - } - - current_stream = 0; - return result; -} - -/** - * toi_start_one_readahead - start one page of readahead - * @dedicated_thread: Is this a thread dedicated to doing readahead? - * - * Start one new page of readahead. If this is being called by a thread - * whose only just is to submit readahead, don't quit because we failed - * to allocate a page. - **/ -static int toi_start_one_readahead(int dedicated_thread) -{ - char *buffer = NULL; - int oom = 0, result; - - result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0); - if (result) { - printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result); - return result; - } - - mutex_lock(&toi_bio_readahead_mutex); - - while (!buffer) { - buffer = (char *) toi_get_zeroed_page(12, - TOI_ATOMIC_GFP); - if (!buffer) { - if (oom && !dedicated_thread) { - mutex_unlock(&toi_bio_readahead_mutex); - printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result); - return -ENOMEM; - } - - oom = 1; - set_free_mem_throttle(); - do_bio_wait(5); - } - } - - result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0); - if (result) { - printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result); - } - if (result == -ENOSPC) - toi__free_page(12, virt_to_page(buffer)); - mutex_unlock(&toi_bio_readahead_mutex); - if (result) { - if (result == -ENOSPC) - toi_message(TOI_BIO, TOI_VERBOSE, 0, - "Last readahead page submitted."); - else - printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n", - result); - } - return result; -} - -/** - * toi_start_new_readahead - start new readahead - * @dedicated_thread: Are we dedicated to this task? - * - * Start readahead of image pages. - * - * We can be called as a thread dedicated to this task (may be helpful on - * systems with lots of CPUs), in which case we don't exit until there's no - * more readahead. 
- * - * If this is not called by a dedicated thread, we top up our queue until - * there's no more readahead to submit, we've submitted the number given - * in target_outstanding_io or the number in progress exceeds the target - * outstanding I/O value. - * - * No mutex needed because this is only ever called by the first cpu. - **/ -static int toi_start_new_readahead(int dedicated_thread) -{ - int last_result, num_submitted = 0; - - /* Start a new readahead? */ - if (!more_readahead) - return 0; - - do { - last_result = toi_start_one_readahead(dedicated_thread); - - if (last_result) { - if (last_result == -ENOMEM || last_result == -ENOSPC) - return 0; - - printk(KERN_DEBUG - "Begin read chunk returned %d.\n", - last_result); - } else - num_submitted++; - - } while (more_readahead && !last_result && - (dedicated_thread || - (num_submitted < target_outstanding_io && - atomic_read(&toi_io_in_progress) < target_outstanding_io))); - - return last_result; -} - -/** - * bio_io_flusher - start the dedicated I/O flushing routine - * @writing: Whether we're writing the image. - **/ -static int bio_io_flusher(int writing) -{ - - if (writing) - return toi_bio_queue_flush_pages(1); - else - return toi_start_new_readahead(1); -} - -/** - * toi_bio_get_next_page_read - read a disk page, perhaps with readahead - * @no_readahead: Whether we can use readahead - * - * Read a page from disk, submitting readahead and cleaning up finished i/o - * while we wait for the page we're after. - **/ -static int toi_bio_get_next_page_read(int no_readahead) -{ - char *virt; - struct page *old_readahead_list_head; - - /* - * When reading the second page of the header, we have to - * delay submitting the read until after we've gotten the - * extents out of the first page. - */ - if (unlikely(no_readahead)) { - int result = toi_start_one_readahead(0); - if (result) { - printk(KERN_EMERG "No readahead and toi_start_one_readahead " - "returned non-zero.\n"); - return -EIO; - } - } - - if (unlikely(!readahead_list_head)) { - /* - * If the last page finishes exactly on the page - * boundary, we will be called one extra time and - * have no data to return. In this case, we should - * not BUG(), like we used to! - */ - if (!more_readahead) { - printk(KERN_EMERG "No more readahead.\n"); - return -ENOSPC; - } - if (unlikely(toi_start_one_readahead(0))) { - printk(KERN_EMERG "No readahead and " - "toi_start_one_readahead returned non-zero.\n"); - return -EIO; - } - } - - if (PageLocked(readahead_list_head)) { - waiting_on = readahead_list_head; - do_bio_wait(0); - } - - virt = page_address(readahead_list_head); - memcpy(toi_writer_buffer, virt, PAGE_SIZE); - - mutex_lock(&toi_bio_readahead_mutex); - old_readahead_list_head = readahead_list_head; - readahead_list_head = (struct page *) readahead_list_head->private; - mutex_unlock(&toi_bio_readahead_mutex); - toi__free_page(12, old_readahead_list_head); - return 0; -} - -/** - * toi_bio_queue_flush_pages - flush the queue of pages queued for writing - * @dedicated_thread: Whether we're a dedicated thread - * - * Flush the queue of pages ready to be written to disk. - * - * If we're a dedicated thread, stay in here until told to leave, - * sleeping in wait_event. - * - * The first thread is normally the only one to come in here. Another - * thread can enter this routine too, though, via throttle_if_needed. - * Since that's the case, we must be careful to only have one thread - * doing this work at a time. Otherwise we have a race and could save - * pages out of order. 
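The comment above stresses that only one thread may drain the write queue at a time. The function that follows enforces that with a static mutex taken via trylock, so a second caller (arriving through throttle_if_needed) backs off immediately instead of blocking. A user-space sketch of the gate, assuming pthreads:

#include <pthread.h>

static pthread_mutex_t busy = PTHREAD_MUTEX_INITIALIZER;

/* First caller drains the queue; concurrent callers return at once,
 * which is what keeps the written pages in order. */
static int flush_once(void (*drain)(void))
{
	if (pthread_mutex_trylock(&busy))
		return 0;	/* someone else is already flushing */
	drain();
	pthread_mutex_unlock(&busy);
	return 1;
}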
- * - * If an error occurs, free all remaining pages without submitting them - * for I/O. - **/ - -int toi_bio_queue_flush_pages(int dedicated_thread) -{ - unsigned long flags; - int result = 0; - static DEFINE_MUTEX(busy); - - if (!mutex_trylock(&busy)) - return 0; - -top: - spin_lock_irqsave(&bio_queue_lock, flags); - while (bio_queue_head) { - struct page *page = bio_queue_head; - bio_queue_head = (struct page *) page->private; - if (bio_queue_tail == page) - bio_queue_tail = NULL; - atomic_dec(&toi_bio_queue_size); - spin_unlock_irqrestore(&bio_queue_lock, flags); - - /* Don't generate more error messages if already had one */ - if (!result) - result = toi_bio_rw_page(WRITE, page, 0, 11); - /* - * If writing the page failed, don't drop out. - * Flush the rest of the queue too. - */ - if (result) - toi__free_page(11 , page); - spin_lock_irqsave(&bio_queue_lock, flags); - } - spin_unlock_irqrestore(&bio_queue_lock, flags); - - if (dedicated_thread) { - wait_event(toi_io_queue_flusher, bio_queue_head || - toi_bio_queue_flusher_should_finish); - if (likely(!toi_bio_queue_flusher_should_finish)) - goto top; - toi_bio_queue_flusher_should_finish = 0; - } - - mutex_unlock(&busy); - return result; -} - -/** - * toi_bio_get_new_page - get a new page for I/O - * @full_buffer: Pointer to a page to allocate. - **/ -static int toi_bio_get_new_page(char **full_buffer) -{ - int result = throttle_if_needed(THROTTLE_WAIT); - if (result) - return result; - - while (!*full_buffer) { - *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP); - if (!*full_buffer) { - set_free_mem_throttle(); - do_bio_wait(3); - } - } - - return 0; -} - -/** - * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O - * @writing: Bool - whether writing (or reading). - * @buffer: The start of the buffer to write or fill. - * @buffer_size: The size of the buffer to write or fill. - * @no_readahead: Don't try to start readhead (when getting extents). - **/ -static int toi_rw_buffer(int writing, char *buffer, int buffer_size, - int no_readahead) -{ - int bytes_left = buffer_size, result = 0; - - while (bytes_left) { - char *source_start = buffer + buffer_size - bytes_left; - char *dest_start = toi_writer_buffer + toi_writer_buffer_posn; - int capacity = PAGE_SIZE - toi_writer_buffer_posn; - char *to = writing ? dest_start : source_start; - char *from = writing ? source_start : dest_start; - - if (bytes_left <= capacity) { - memcpy(to, from, bytes_left); - toi_writer_buffer_posn += bytes_left; - return 0; - } - - /* Complete this page and start a new one */ - memcpy(to, from, capacity); - bytes_left -= capacity; - - if (!writing) { - /* - * Perform actual I/O: - * read readahead_list_head into toi_writer_buffer - */ - int result = toi_bio_get_next_page_read(no_readahead); - if (result && bytes_left) { - printk("toi_bio_get_next_page_read " - "returned %d. Expecting to read %d bytes.\n", result, bytes_left); - return result; - } - } else { - toi_bio_queue_write(&toi_writer_buffer); - result = toi_bio_get_new_page(&toi_writer_buffer); - if (result) { - printk(KERN_ERR "toi_bio_get_new_page returned " - "%d.\n", result); - return result; - } - } - - toi_writer_buffer_posn = 0; - toi_cond_pause(0, NULL); - } - - return 0; -} - -/** - * toi_bio_read_page - read a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE). 
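toi_rw_buffer() above packs arbitrarily sized transfers into page-sized I/O by tracking a byte position inside one staging page. A self-contained sketch of the write direction, where flush_page() is a stand-in for handing the full page to toi_bio_queue_write() (the names and the 4096-byte page size are assumptions):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static char staging[PAGE_SIZE];
static int staging_posn;

/* Stand-in for queueing the completed page for I/O. */
static void flush_page(void)
{
	printf("page full, queueing for write\n");
}

static void buffered_write(const char *buffer, int buffer_size)
{
	int bytes_left = buffer_size;

	while (bytes_left) {
		const char *src = buffer + buffer_size - bytes_left;
		int capacity = PAGE_SIZE - staging_posn;

		if (bytes_left <= capacity) {
			memcpy(staging + staging_posn, src, bytes_left);
			staging_posn += bytes_left;
			return;
		}
		/* Complete this page and start a new one. */
		memcpy(staging + staging_posn, src, capacity);
		bytes_left -= capacity;
		flush_page();
		staging_posn = 0;
	}
}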
- * - * Read a (possibly compressed) page from the image, into buffer_page, - * returning its pfn and the buffer size. - **/ -static int toi_bio_read_page(unsigned long *pfn, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int result = 0; - int this_idx; - char *buffer_virt = TOI_MAP(buf_type, buffer_page); - - /* - * Only call start_new_readahead if we don't have a dedicated thread - * and we're the queue flusher. - */ - if (current == toi_queue_flusher && more_readahead && - !test_action_state(TOI_NO_READAHEAD)) { - int result2 = toi_start_new_readahead(0); - if (result2) { - printk(KERN_DEBUG "Queue flusher and " - "toi_start_one_readahead returned non-zero.\n"); - result = -EIO; - goto out; - } - } - - my_mutex_lock(0, &toi_bio_mutex); - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - * We can validly find there's nothing to read in a multithreaded - * situation. - */ - if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) || - toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) || - toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) { - result = -ENODATA; - goto out_unlock; - } - - if (reset_idx) { - page_idx = this_idx; - reset_idx = 0; - } else { - page_idx++; - if (!this_idx) - result = -ENODATA; - else if (page_idx != this_idx) - printk(KERN_ERR "Got page index %d, expected %d.\n", - this_idx, page_idx); - } - -out_unlock: - my_mutex_unlock(0, &toi_bio_mutex); -out: - TOI_UNMAP(buf_type, buffer_page); - return result; -} - -/** - * toi_bio_write_page - write a page of the image - * @pfn: The pfn where the data belongs. - * @buffer_page: The page containing the (possibly compressed) data. - * @buf_size: The number of bytes on @buffer_page used. - * - * Write a (possibly compressed) page to the image from the buffer, together - * with it's index and buffer size. - **/ -static int toi_bio_write_page(unsigned long pfn, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - char *buffer_virt; - int result = 0, result2 = 0; - - if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) - return 0; - - my_mutex_lock(1, &toi_bio_mutex); - - if (test_result_state(TOI_ABORTED)) { - my_mutex_unlock(1, &toi_bio_mutex); - return 0; - } - - buffer_virt = TOI_MAP(buf_type, buffer_page); - page_idx++; - - /* - * Structure in the image: - * [destination pfn|page size|page data] - * buf_size is PAGE_SIZE - */ - if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) || - toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) || - toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) || - toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) { - printk(KERN_DEBUG "toi_rw_buffer returned non-zero to " - "toi_bio_write_page.\n"); - result = -EIO; - } - - TOI_UNMAP(buf_type, buffer_page); - my_mutex_unlock(1, &toi_bio_mutex); - - if (current == toi_queue_flusher) - result2 = toi_bio_queue_flush_pages(0); - - return result ? result : result2; -} - -/** - * _toi_rw_header_chunk - read or write a portion of the image header - * @writing: Whether reading or writing. - * @owner: The module for which we're writing. - * Used for confirming that modules - * don't use more header space than they asked for. - * @buffer: Address of the data to write. - * @buffer_size: Size of the data buffer. - * @no_readahead: Don't try to start readhead (when getting extents). - * - * Perform PAGE_SIZE I/O. Start readahead if needed. 
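toi_bio_read_page() and toi_bio_write_page() above move each saved page as four consecutive toi_rw_buffer() transfers. Spelled out as a struct purely for illustration; no such struct exists in the image, since the fields are written back to back and a compiler might pad this layout differently:

/* On-image record implied by toi_bio_write_page(), illustrative only. */
struct image_page_record {
	int page_idx;		/* sequence number; the read side checks
				 * these increase by one and reports gaps */
	unsigned long pfn;	/* where the data belongs on restore */
	unsigned int buf_size;	/* bytes of (possibly compressed) data */
	/* followed immediately by buf_size bytes of page data */
};

On the read side, a zero index signals end of data (-ENODATA), and an index that does not match the expected counter is logged as an ordering error.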
- **/ -static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int buffer_size, int no_readahead) -{ - int result = 0; - - if (owner) { - owner->header_used += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: %s : %d bytes (%d/%d) from offset %d.", - owner->name, - buffer_size, owner->header_used, - owner->header_requested, - toi_writer_buffer_posn); - if (owner->header_used > owner->header_requested && writing) { - printk(KERN_EMERG "TuxOnIce module %s is using more " - "header space (%u) than it requested (%u).\n", - owner->name, - owner->header_used, - owner->header_requested); - return buffer_size; - } - } else { - unowned += buffer_size; - toi_message(TOI_HEADER, TOI_LOW, 1, - "Header: (No owner): %d bytes (%d total so far) from " - "offset %d.", buffer_size, unowned, - toi_writer_buffer_posn); - } - - if (!writing && !no_readahead && more_readahead) { - result = toi_start_new_readahead(0); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead " - "returned %d.", result); - } - - if (!result) { - result = toi_rw_buffer(writing, buffer, buffer_size, - no_readahead); - toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned " - "%d.", result); - } - - total_header_bytes += buffer_size; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning " - "%d.", result); - return result; -} - -static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner, - char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -static int toi_rw_header_chunk_noreadahead(int writing, - struct toi_module_ops *owner, char *buffer, int size) -{ - return _toi_rw_header_chunk(writing, owner, buffer, size, 1); -} - -/** - * toi_bio_storage_needed - get the amount of storage needed for my fns - **/ -static int toi_bio_storage_needed(void) -{ - return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed(); -} - -/** - * toi_bio_save_config_info - save block I/O config to image header - * @buf: PAGE_SIZE'd buffer into which data should be saved. - **/ -static int toi_bio_save_config_info(char *buf) -{ - int *ints = (int *) buf; - ints[0] = target_outstanding_io; - return sizeof(int); -} - -/** - * toi_bio_load_config_info - restore block I/O config - * @buf: Data to be reloaded. - * @size: Size of the buffer saved. - **/ -static void toi_bio_load_config_info(char *buf, int size) -{ - int *ints = (int *) buf; - target_outstanding_io = ints[0]; -} - -void close_resume_dev_t(int force) -{ - if (!resume_block_device) - return; - - if (force) - atomic_set(&resume_bdev_open_count, 0); - else - atomic_dec(&resume_bdev_open_count); - - if (!atomic_read(&resume_bdev_open_count)) { - toi_close_bdev(resume_block_device); - resume_block_device = NULL; - } -} - -int open_resume_dev_t(int force, int quiet) -{ - if (force) { - close_resume_dev_t(1); - atomic_set(&resume_bdev_open_count, 1); - } else - atomic_inc(&resume_bdev_open_count); - - if (resume_block_device) - return 0; - - resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0); - if (IS_ERR(resume_block_device)) { - if (!quiet) - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "Failed to open device %x, where" - " the header should be found.", - resume_dev_t); - resume_block_device = NULL; - atomic_set(&resume_bdev_open_count, 0); - return 1; - } - - return 0; -} - -/** - * toi_bio_initialise - initialise bio code at start of some action - * @starting_cycle: Whether starting a hibernation cycle, or just reading or - * writing a sysfs value. 
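open_resume_dev_t() and close_resume_dev_t() above share one open of the resume device between users with a plain atomic counter, plus a force path that resets it. The shape of the pattern as a user-space sketch, with do_open()/do_close() as assumed stand-ins for toi_open_bdev()/toi_close_bdev():

#include <stdatomic.h>
#include <stddef.h>

static atomic_int open_count;
static void *device;

extern void *do_open(void);
extern void do_close(void *dev);

static int dev_get(void)
{
	atomic_fetch_add(&open_count, 1);
	if (device)
		return 0;	/* already open, just share it */
	device = do_open();
	if (!device) {
		atomic_store(&open_count, 0);
		return 1;
	}
	return 0;
}

static void dev_put(void)
{
	/* Last user out closes the device. */
	if (atomic_fetch_sub(&open_count, 1) == 1) {
		do_close(device);
		device = NULL;
	}
}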
- **/ -static int toi_bio_initialise(int starting_cycle) -{ - int result; - - if (!starting_cycle || !resume_dev_t) - return 0; - - max_outstanding_writes = 0; - max_outstanding_reads = 0; - current_stream = 0; - toi_queue_flusher = current; -#ifdef MEASURE_MUTEX_CONTENTION - { - int i, j, k; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - for_each_online_cpu(k) - mutex_times[i][j][k] = 0; - } -#endif - result = open_resume_dev_t(0, 1); - - if (result) - return result; - - return get_signature_page(); -} - -static unsigned long raw_to_real(unsigned long raw) -{ - unsigned long extra; - - extra = (raw * (sizeof(unsigned long) + sizeof(int)) + - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) / - (PAGE_SIZE + sizeof(unsigned long) + sizeof(int)); - - return raw > extra ? raw - extra : 0; -} - -static unsigned long toi_bio_storage_available(void) -{ - unsigned long sum = 0; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type != BIO_ALLOCATOR_MODULE) - continue; - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage " - "available from %s.", this_module->name); - sum += this_module->bio_allocator_ops->storage_available(); - } - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu " - "pages (%d header pages).", sum, header_pages_reserved); - - return sum > header_pages_reserved ? - raw_to_real(sum - header_pages_reserved) : 0; - -} - -static unsigned long toi_bio_storage_allocated(void) -{ - return raw_pages_allocd > header_pages_reserved ? - raw_to_real(raw_pages_allocd - header_pages_reserved) : 0; -} - -/* - * If we have read part of the image, we might have filled memory with - * data that should be zeroed out. - */ -static void toi_bio_noresume_reset(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset."); - toi_rw_cleanup(READ); - free_all_bdev_info(); -} - -/** - * toi_bio_cleanup - cleanup after some action - * @finishing_cycle: Whether completing a cycle. - **/ -static void toi_bio_cleanup(int finishing_cycle) -{ - if (!finishing_cycle) - return; - - if (toi_writer_buffer) { - toi_free_page(11, (unsigned long) toi_writer_buffer); - toi_writer_buffer = NULL; - } - - forget_signature_page(); - - if (header_block_device && toi_sig_data && - toi_sig_data->header_dev_t != resume_dev_t) - toi_close_bdev(header_block_device); - - header_block_device = NULL; - - close_resume_dev_t(0); -} - -static int toi_bio_write_header_init(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init"); - toi_rw_init(WRITE, 0); - toi_writer_buffer_posn = 0; - - /* Info needed to bootstrap goes at the start of the header. - * First we save the positions and devinfo, including the number - * of header pages. Then we save the structs containing data needed - * for reading the header pages back. - * Note that even if header pages take more than one page, when we - * read back the info, we will have restored the location of the - * next header page by the time we go to use it. - */ - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains."); - result = toi_serialise_extent_chains(); - - if (result) - return result; - - /* - * Signature page hasn't been modified at this point. Write it in - * the header so we can restore it later. 
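raw_to_real() above discounts the metadata the image charges for each saved page, sizeof(unsigned long) + sizeof(int) bytes, from the raw page count, rounding the overhead up to whole pages. A worked example, assuming 4 KiB pages and 64-bit longs:

#include <stdio.h>

int main(void)
{
	unsigned long raw = 1000;	/* raw pages of storage (example) */
	unsigned long meta = sizeof(unsigned long) + sizeof(int); /* 12 */
	unsigned long page = 4096;
	unsigned long extra = (raw * meta + (page + meta + 1)) / (page + meta);

	/* 1000 pages cost 12000 bytes of metadata, just under three
	 * 4 KiB pages; rounded up, extra = 3 and 997 pages stay usable. */
	printf("extra = %lu, usable = %lu\n", extra, raw - extra);
	return 0;
}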
- */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page."); - return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops, - (char *) toi_cur_sig_page, - PAGE_SIZE); -} - -static int toi_bio_write_header_cleanup(void) -{ - int result = 0; - - if (toi_writer_buffer_posn) - toi_bio_queue_write(&toi_writer_buffer); - - result = toi_finish_all_io(); - - unowned = 0; - total_header_bytes = 0; - - /* Set signature to save we have an image */ - if (!result) - result = toi_bio_mark_have_image(); - - return result; -} - -/* - * toi_bio_read_header_init() - * - * Description: - * 1. Attempt to read the device specified with resume=. - * 2. Check the contents of the swap header for our signature. - * 3. Warn, ignore, reset and/or continue as appropriate. - * 4. If continuing, read the toi_swap configuration section - * of the header and set up block device info so we can read - * the rest of the header & image. - * - * Returns: - * May not return if user choose to reboot at a warning. - * -EINVAL if cannot resume at this time. Booting should continue - * normally. - */ - -static int toi_bio_read_header_init(void) -{ - int result = 0; - char buf[32]; - - toi_writer_buffer_posn = 0; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init"); - - if (!toi_sig_data) { - printk(KERN_INFO "toi_bio_read_header_init called when we " - "haven't verified there is an image!\n"); - return -EINVAL; - } - - /* - * If the header is not on the resume_swap_dev_t, get the resume device - * first. - */ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.", - toi_sig_data->header_dev_t); - if (toi_sig_data->have_uuid) { - struct fs_info seek; - dev_t device; - - strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16); - seek.dev_t = toi_sig_data->header_dev_t; - seek.last_mount_size = 0; - device = blk_lookup_fs_info(&seek); - if (device) { - printk("Using dev_t %s, returned by blk_lookup_fs_info.\n", - format_dev_t(buf, device)); - toi_sig_data->header_dev_t = device; - } - } - if (toi_sig_data->header_dev_t != resume_dev_t) { - header_block_device = toi_open_bdev(NULL, - toi_sig_data->header_dev_t, 1); - - if (IS_ERR(header_block_device)) - return PTR_ERR(header_block_device); - } else - header_block_device = resume_block_device; - - if (!toi_writer_buffer) - toi_writer_buffer = (char *) toi_get_zeroed_page(11, - TOI_ATOMIC_GFP); - more_readahead = 1; - - /* - * Read toi_swap configuration. - * Headerblock size taken into account already. - */ - result = toi_bio_ops.bdev_page_io(READ, header_block_device, - toi_sig_data->first_header_block, - virt_to_page((unsigned long) toi_writer_buffer)); - if (result) - return result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains."); - result = toi_load_extent_chains(); - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page."); - toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - if (!toi_orig_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the current" - " image signature.\n"); - return -ENOMEM; - } - - return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops, - (char *) toi_orig_sig_page, - PAGE_SIZE); -} - -static int toi_bio_read_header_cleanup(void) -{ - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup."); - return toi_rw_cleanup(READ); -} - -/* Works only for digits and letters, but small and fast */ -#define TOLOWER(x) ((x) | 0x20) - -/* - * UUID must be 32 chars long. It may have dashes, but nothing - * else. 
- */ -char *uuid_from_commandline(char *commandline) -{ - int low = 0; - char *result = NULL, *output, *ptr; - - if (strncmp(commandline, "UUID=", 5)) - return NULL; - - result = kzalloc(17, GFP_KERNEL); - if (!result) { - printk("Failed to kzalloc UUID text memory.\n"); - return NULL; - } - - ptr = commandline + 5; - output = result; - - while (*ptr && (output - result) < 16) { - if (isxdigit(*ptr)) { - int value = isdigit(*ptr) ? *ptr - '0' : - TOLOWER(*ptr) - 'a' + 10; - if (low) { - *output += value; - output++; - } else { - *output = value << 4; - } - low = !low; - } else if (*ptr != '-') - break; - ptr++; - } - - if ((output - result) < 16 || *ptr) { - printk(KERN_DEBUG "Found resume=UUID=, but the value looks " - "invalid.\n"); - kfree(result); - result = NULL; - } - - return result; -} - -#define retry_if_fails(command) \ -do { \ - command; \ - if (!resume_dev_t && !waited_for_device_probe) { \ - wait_for_device_probe(); \ - command; \ - waited_for_device_probe = 1; \ - } \ -} while(0) - -/** - * try_to_open_resume_device: Try to parse and open resume= - * - * Any "swap:" has been stripped away and we just have the path to deal with. - * We attempt to do name_to_dev_t, open and stat the file. Having opened the - * file, get the struct block_device * to match. - */ -static int try_to_open_resume_device(char *commandline, int quiet) -{ - struct kstat stat; - int error = 0; - char *uuid = uuid_from_commandline(commandline); - int waited_for_device_probe = 0; - - resume_dev_t = MKDEV(0, 0); - - if (!strlen(commandline)) - retry_if_fails(toi_bio_scan_for_image(quiet)); - - if (uuid) { - struct fs_info seek; - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = resume_dev_t; - seek.last_mount_size = 0; - retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek)); - kfree(uuid); - } - - if (!resume_dev_t) - retry_if_fails(resume_dev_t = name_to_dev_t(commandline)); - - if (!resume_dev_t) { - struct file *file = filp_open(commandline, - O_RDONLY|O_LARGEFILE, 0); - - if (!IS_ERR(file) && file) { - vfs_getattr(&file->f_path, &stat); - filp_close(file, NULL); - } else - error = vfs_stat(commandline, &stat); - if (!error) - resume_dev_t = stat.rdev; - } - - if (!resume_dev_t) { - if (quiet) - return 1; - - if (test_toi_state(TOI_TRYING_TO_RESUME)) - toi_early_boot_message(1, toi_translate_err_default, - "Failed to translate \"%s\" into a device id.\n", - commandline); - else - printk("TuxOnIce: Can't translate \"%s\" into a device " - "id yet.\n", commandline); - return 1; - } - - return open_resume_dev_t(1, quiet); -} - -/* - * Parse Image Location - * - * Attempt to parse a resume= parameter. - * Swap Writer accepts: - * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] - * - * Where: - * DEVNAME is convertable to a dev_t by name_to_dev_t - * FIRSTBLOCK is the location of the first block in the swap file - * (specifying for a swap partition is nonsensical but not prohibited). - * Data is validated by attempting to read a swap header from the - * location given. Failure will result in toi_swap refusing to - * save an image, and a reboot with correct parameters will be - * necessary. 
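For reference, uuid_from_commandline() above packs the 32 hex digits of a UUID= argument into 16 raw bytes, two nibbles per byte, skipping dashes; anything else makes it return NULL. A worked example (the UUID value is made up):

/*
 * in:  "UUID=6f5902ac-0a5b-4b92-8c3d-00d1a9b0e1f2"
 * out: { 0x6f, 0x59, 0x02, 0xac, 0x0a, 0x5b, 0x4b, 0x92,
 *        0x8c, 0x3d, 0x00, 0xd1, 0xa9, 0xb0, 0xe1, 0xf2 }
 *
 * "UUID=6f59" (too short) and "UUID=6f59zz..." (a character that is
 * neither hex nor a dash) both fail the final checks and yield NULL.
 */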
- */ -static int toi_bio_parse_sig_location(char *commandline, - int only_allocator, int quiet) -{ - char *thischar, *devstart, *colon = NULL; - int signature_found, result = -EINVAL, temp_result = 0; - - if (strncmp(commandline, "swap:", 5) && - strncmp(commandline, "file:", 5)) { - /* - * Failing swap:, we'll take a simple resume=/dev/hda2, or a - * blank value (scan) but fall through to other allocators - * if /dev/ or UUID= isn't matched. - */ - if (strncmp(commandline, "/dev/", 5) && - strncmp(commandline, "UUID=", 5) && - strlen(commandline)) - return 1; - } else - commandline += 5; - - devstart = commandline; - thischar = commandline; - while ((*thischar != ':') && (*thischar != '@') && - ((thischar - commandline) < 250) && (*thischar)) - thischar++; - - if (*thischar == ':') { - colon = thischar; - *colon = 0; - thischar++; - } - - while ((thischar - commandline) < 250 && *thischar) - thischar++; - - if (colon) { - unsigned long block; - temp_result = kstrtoul(colon + 1, 0, &block); - if (!temp_result) - resume_firstblock = (int) block; - } else - resume_firstblock = 0; - - clear_toi_state(TOI_CAN_HIBERNATE); - clear_toi_state(TOI_CAN_RESUME); - - if (!temp_result) - temp_result = try_to_open_resume_device(devstart, quiet); - - if (colon) - *colon = ':'; - - /* No error if we only scanned */ - if (temp_result) - return strlen(commandline) ? -EINVAL : 1; - - signature_found = toi_bio_image_exists(quiet); - - if (signature_found != -1) { - result = 0; - /* - * TODO: If only file storage, CAN_HIBERNATE should only be - * set if file allocator's target is valid. - */ - set_toi_state(TOI_CAN_HIBERNATE); - set_toi_state(TOI_CAN_RESUME); - } else - if (!quiet) - printk(KERN_ERR "TuxOnIce: Block I/O: No " - "signature found at %s.\n", devstart); - - return result; -} - -static void toi_bio_release_storage(void) -{ - header_pages_reserved = 0; - raw_pages_allocd = 0; - - free_all_bdev_info(); -} - -/* toi_swap_remove_image - * - */ -static int toi_bio_remove_image(void) -{ - int result; - - toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image."); - - result = toi_bio_restore_original_signature(); - - /* - * We don't do a sanity check here: we want to restore the swap - * whatever version of kernel made the hibernate image. - * - * We need to write swap, but swap may not be enabled so - * we write the device directly - * - * If we don't have an current_signature_page, we didn't - * read an image header, so don't change anything. 
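Putting the grammar above together, these are representative command-line forms the parser accepts (device names, block numbers and the UUID are illustrative):

resume=swap:/dev/sda2            a swap partition by device node
resume=swap:/dev/sda2:0x2000     a swap file: device plus first block
resume=file:/dev/loop0:0x4000    same shape via the file allocator
resume=UUID=6f5902ac-0a5b-4b92-8c3d-00d1a9b0e1f2   device found by UUID
resume=/dev/sda2                 bare device node, no allocator prefix
resume=                          blank: scan swap devices for a signature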
- */ - - toi_bio_release_storage(); - - return result; -} - -struct toi_bio_ops toi_bio_ops = { - .bdev_page_io = toi_bdev_page_io, - .register_storage = toi_register_storage_chain, - .free_storage = toi_bio_release_storage, -}; - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io, - 0, 16384, 0, NULL), -}; - -struct toi_module_ops toi_blockwriter_ops = { - .type = WRITER_MODULE, - .name = "block i/o", - .directory = "block_io", - .module = THIS_MODULE, - .memory_needed = toi_bio_memory_needed, - .print_debug_info = toi_bio_print_debug_stats, - .storage_needed = toi_bio_storage_needed, - .save_config_info = toi_bio_save_config_info, - .load_config_info = toi_bio_load_config_info, - .initialise = toi_bio_initialise, - .cleanup = toi_bio_cleanup, - .post_atomic_restore = toi_bio_chains_post_atomic, - - .rw_init = toi_rw_init, - .rw_cleanup = toi_rw_cleanup, - .read_page = toi_bio_read_page, - .write_page = toi_bio_write_page, - .rw_header_chunk = toi_rw_header_chunk, - .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead, - .io_flusher = bio_io_flusher, - .update_throughput_throttle = update_throughput_throttle, - .finish_all_io = toi_finish_all_io, - - .noresume_reset = toi_bio_noresume_reset, - .storage_available = toi_bio_storage_available, - .storage_allocated = toi_bio_storage_allocated, - .reserve_header_space = toi_bio_reserve_header_space, - .allocate_storage = toi_bio_allocate_storage, - .free_unused_storage = toi_bio_free_unused_storage, - .image_exists = toi_bio_image_exists, - .mark_resume_attempted = toi_bio_mark_resume_attempted, - .write_header_init = toi_bio_write_header_init, - .write_header_cleanup = toi_bio_write_header_cleanup, - .read_header_init = toi_bio_read_header_init, - .read_header_cleanup = toi_bio_read_header_cleanup, - .get_header_version = toi_bio_get_header_version, - .remove_image = toi_bio_remove_image, - .parse_sig_location = toi_bio_parse_sig_location, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/** - * toi_block_io_load - load time routine for block I/O module - * - * Register block i/o ops and sysfs entries. - **/ -static __init int toi_block_io_load(void) -{ - return toi_register_module(&toi_blockwriter_ops); -} - -late_initcall(toi_block_io_load); diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h deleted file mode 100644 index 5e1964a61..000000000 --- a/kernel/power/tuxonice_bio_internal.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * kernel/power/tuxonice_bio_internal.h - * - * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file contains declarations for functions exported from - * tuxonice_bio.c, which contains low level io functions. 
- */ - -/* Extent chains */ -void toi_extent_state_goto_start(void); -void toi_extent_state_save(int slot); -int go_next_page(int writing, int section_barrier); -void toi_extent_state_restore(int slot); -void free_all_bdev_info(void); -int devices_of_same_priority(struct toi_bdev_info *this); -int toi_register_storage_chain(struct toi_bdev_info *new); -int toi_serialise_extent_chains(void); -int toi_load_extent_chains(void); -int toi_bio_rw_page(int writing, struct page *page, int is_readahead, - int free_group); -int toi_bio_restore_original_signature(void); -int toi_bio_devinfo_storage_needed(void); -unsigned long get_headerblock(void); -dev_t get_header_dev_t(void); -struct block_device *get_header_bdev(void); -int toi_bio_allocate_storage(unsigned long request); -void toi_bio_free_unused_storage(void); - -/* Signature functions */ -#define HaveImage "HaveImage" -#define NoImage "TuxOnIce" -#define sig_size (sizeof(HaveImage)) - -struct sig_data { - char sig[sig_size]; - int have_image; - int resumed_before; - - char have_uuid; - char header_uuid[17]; - dev_t header_dev_t; - unsigned long first_header_block; - - /* Repeat the signature to be sure we have a header version */ - char sig2[sig_size]; - int header_version; -}; - -void forget_signature_page(void); -int toi_check_for_signature(void); -int toi_bio_image_exists(int quiet); -int get_signature_page(void); -int toi_bio_mark_resume_attempted(int); -extern char *toi_cur_sig_page; -extern char *toi_orig_sig_page; -int toi_bio_mark_have_image(void); -extern struct sig_data *toi_sig_data; -extern dev_t resume_dev_t; -extern struct block_device *resume_block_device; -extern struct block_device *header_block_device; -extern unsigned long resume_firstblock; - -struct block_device *open_bdev(dev_t device, int display_errs); -extern int current_stream; -extern int more_readahead; -int toi_do_io(int writing, struct block_device *bdev, long block0, - struct page *page, int is_readahead, int syncio, int free_group); -int get_main_pool_phys_params(void); - -void toi_close_bdev(struct block_device *bdev); -struct block_device *toi_open_bdev(char *uuid, dev_t default_device, - int display_errs); - -extern struct toi_module_ops toi_blockwriter_ops; -void dump_block_chains(void); -void debug_broken_header(void); -extern unsigned long raw_pages_allocd, header_pages_reserved; -int toi_bio_chains_debug_info(char *buffer, int size); -void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd); -int toi_bio_scan_for_image(int quiet); -int toi_bio_get_header_version(void); - -void close_resume_dev_t(int force); -int open_resume_dev_t(int force, int quiet); - -struct toi_incremental_image_pointer_saved_data { - unsigned long block; - int chain; -}; - -struct toi_incremental_image_pointer { - struct toi_incremental_image_pointer_saved_data save; - struct block_device *bdev; - unsigned long block; -}; - -void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr); -void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr); diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c deleted file mode 100644 index f5418f092..000000000 --- a/kernel/power/tuxonice_bio_signature.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * kernel/power/tuxonice_bio_signature.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. 
- * - */ - -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_bio.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" -#include "tuxonice_builtin.h" -#include "tuxonice_bio_internal.h" - -struct sig_data *toi_sig_data; - -/* Struct of swap header pages */ - -struct old_sig_data { - dev_t device; - unsigned long sector; - int resume_attempted; - int orig_sig_type; -}; - -union diskpage { - union swap_header swh; /* swh.magic is the only member used */ - struct sig_data sig_data; - struct old_sig_data old_sig_data; -}; - -union p_diskpage { - union diskpage *pointer; - char *ptr; - unsigned long address; -}; - -char *toi_cur_sig_page; -char *toi_orig_sig_page; -int have_image; -int have_old_image; - -int get_signature_page(void) -{ - if (!toi_cur_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Allocating current signature page."); - toi_cur_sig_page = (char *) toi_get_zeroed_page(38, - TOI_ATOMIC_GFP); - if (!toi_cur_sig_page) { - printk(KERN_ERR "Failed to allocate memory for the " - "current image signature.\n"); - return -ENOMEM; - } - - toi_sig_data = (struct sig_data *) toi_cur_sig_page; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx," - " sector %d.", - resume_block_device->bd_dev, resume_firstblock); - - return toi_bio_ops.bdev_page_io(READ, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -void forget_signature_page(void) -{ - if (toi_cur_sig_page) { - toi_sig_data = NULL; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page" - " (%p).", toi_cur_sig_page); - toi_free_page(38, (unsigned long) toi_cur_sig_page); - toi_cur_sig_page = NULL; - } - - if (toi_orig_sig_page) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page" - " (%p).", toi_orig_sig_page); - toi_free_page(38, (unsigned long) toi_orig_sig_page); - toi_orig_sig_page = NULL; - } -} - -/* - * We need to ensure we use the signature page that's currently on disk, - * so as to not remove the image header. Post-atomic-restore, the orig sig - * page will be empty, so we can use that as our method of knowing that we - * need to load the on-disk signature and not use the non-image sig in - * memory. (We're going to powerdown after writing the change, so it's safe. 
- */ -int toi_bio_mark_resume_attempted(int flag) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.", - flag); - if (!toi_orig_sig_page) { - forget_signature_page(); - get_signature_page(); - } - toi_sig_data->resumed_before = flag; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int toi_bio_mark_have_image(void) -{ - int result = 0; - char buf[32]; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists."); - memcpy(toi_sig_data->sig, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->have_image = 1; - toi_sig_data->resumed_before = 0; - toi_sig_data->header_dev_t = get_header_dev_t(); - toi_sig_data->have_uuid = 0; - - fs_info = fs_info_from_block_dev(get_header_bdev()); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!result) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - toi_sig_data->have_uuid = 1; - } else - toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for " - "dev_t %s.", - format_dev_t(buf, get_header_dev_t())); - - toi_sig_data->first_header_block = get_headerblock(); - have_image = 1; - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block " - "is %d.", toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - - memcpy(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature)); - toi_sig_data->header_version = TOI_HEADER_VERSION; - - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(toi_cur_sig_page)); -} - -int remove_old_signature(void) -{ - union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page; - char *orig_sig; - char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); - int result; - struct block_device *header_bdev; - struct old_sig_data *old_sig_data = - &swap_header_page.pointer->old_sig_data; - - header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1); - result = toi_bio_ops.bdev_page_io(READ, header_bdev, - old_sig_data->sector, virt_to_page(header_start)); - - if (result) - goto out; - - /* - * TODO: Get the original contents of the first bytes of the swap - * header page. - */ - if (!old_sig_data->orig_sig_type) - orig_sig = "SWAP-SPACE"; - else - orig_sig = "SWAPSPACE2"; - - memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10); - memcpy(swap_header_page.ptr, header_start, 10); - - result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(swap_header_page.ptr)); - -out: - toi_close_bdev(header_bdev); - have_old_image = 0; - toi_free_page(38, (unsigned long) header_start); - return result; -} - -/* - * toi_bio_restore_original_signature - restore the original signature - * - * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used. - * It will have the original signature page contents, stored in the image - * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain - * the contents that were loaded when we started the cycle. - */ -int toi_bio_restore_original_signature(void) -{ - char *use = toi_orig_sig_page ? 
toi_orig_sig_page : toi_cur_sig_page; - - if (have_old_image) - return remove_old_signature(); - - if (!use) { - printk("toi_bio_restore_original_signature: No signature " - "page loaded.\n"); - return 0; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists."); - have_image = 0; - toi_sig_data->have_image = 0; - return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, - resume_firstblock, virt_to_page(use)); -} - -/* - * check_for_signature - See whether we have an image. - * - * Returns 0 if no image, 1 if there is one, -1 if indeterminate. - */ -int toi_check_for_signature(void) -{ - union p_diskpage swap_header_page; - int type; - const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" }; - const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" }; - char *swap_header; - - if (!toi_cur_sig_page) { - int result = get_signature_page(); - - if (result) - return result; - } - - /* - * Start by looking for the binary header. - */ - if (!memcmp(tuxonice_signature, toi_cur_sig_page, - sizeof(tuxonice_signature))) { - have_image = toi_sig_data->have_image; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. " - "Have image is %d.", have_image); - if (have_image) - toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is " - "%x. First block is %d.", - toi_sig_data->header_dev_t, - toi_sig_data->first_header_block); - return toi_sig_data->have_image; - } - - /* - * Failing that, try old file allocator headers. - */ - - if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) { - have_image = 1; - return 1; - } - - have_image = 0; - - if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage))) - return 0; - - /* - * Nope? How about swap? - */ - swap_header_page = (union p_diskpage) toi_cur_sig_page; - swap_header = swap_header_page.pointer->swh.magic.magic; - - /* Normal swapspace? */ - for (type = 0; type < 2; type++) - if (!memcmp(normal_sigs[type], swap_header, - strlen(normal_sigs[type]))) - return 0; - - /* Swsusp or uswsusp? */ - for (type = 0; type < 3; type++) - if (!memcmp(swsusp_sigs[type], swap_header, - strlen(swsusp_sigs[type]))) - return 2; - - /* Old TuxOnIce version? */ - if (!memcmp(tuxonice_signature, swap_header, - sizeof(tuxonice_signature) - 1)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce " - "signature."); - have_old_image = 1; - return 3; - } - - return -1; -} - -/* - * Image_exists - * - * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). - */ -int toi_bio_image_exists(int quiet) -{ - int result; - char *msg = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists."); - - if (!resume_dev_t) { - if (!quiet) - printk(KERN_INFO "Not even trying to read header " - "because resume_dev_t is not set.\n"); - return -1; - } - - if (open_resume_dev_t(0, quiet)) - return -1; - - result = toi_check_for_signature(); - - clear_toi_state(TOI_RESUMED_BEFORE); - if (toi_sig_data->resumed_before) - set_toi_state(TOI_RESUMED_BEFORE); - - if (quiet || result == -ENOMEM) - return result; - - if (result == -1) - msg = "TuxOnIce: Unable to find a signature." 
- " Could you have moved a swap file?\n"; - else if (!result) - msg = "TuxOnIce: No image found.\n"; - else if (result == 1) - msg = "TuxOnIce: Image found.\n"; - else if (result == 2) - msg = "TuxOnIce: uswsusp or swsusp image found.\n"; - else if (result == 3) - msg = "TuxOnIce: Old implementation's signature found.\n"; - - printk(KERN_INFO "%s", msg); - - return result; -} - -int toi_bio_scan_for_image(int quiet) -{ - struct block_device *bdev; - char default_name[255] = ""; - - if (!quiet) - printk(KERN_DEBUG "Scanning swap devices for TuxOnIce " - "signature...\n"); - for (bdev = next_bdev_of_type(NULL, "swap"); bdev; - bdev = next_bdev_of_type(bdev, "swap")) { - int result; - char name[255] = ""; - sprintf(name, "%u:%u", MAJOR(bdev->bd_dev), - MINOR(bdev->bd_dev)); - if (!quiet) - printk(KERN_DEBUG "- Trying %s.\n", name); - resume_block_device = bdev; - resume_dev_t = bdev->bd_dev; - - result = toi_check_for_signature(); - - resume_block_device = NULL; - resume_dev_t = MKDEV(0, 0); - - if (!default_name[0]) - strcpy(default_name, name); - - if (result == 1) { - /* Got one! */ - strcpy(resume_file, name); - next_bdev_of_type(bdev, NULL); - if (!quiet) - printk(KERN_DEBUG " ==> Image found on %s.\n", - resume_file); - return 1; - } - forget_signature_page(); - } - - if (!quiet) - printk(KERN_DEBUG "TuxOnIce scan: No image found.\n"); - strcpy(resume_file, default_name); - return 0; -} - -int toi_bio_get_header_version(void) -{ - return (memcmp(toi_sig_data->sig2, tuxonice_signature, - sizeof(tuxonice_signature))) ? - 0 : toi_sig_data->header_version; - -} diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c deleted file mode 100644 index 22bf07a43..000000000 --- a/kernel/power/tuxonice_builtin.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include <linux/kernel.h> -#include <linux/swap.h> -#include <linux/syscalls.h> -#include <linux/bio.h> -#include <linux/root_dev.h> -#include <linux/freezer.h> -#include <linux/reboot.h> -#include <linux/writeback.h> -#include <linux/tty.h> -#include <linux/crypto.h> -#include <linux/cpu.h> -#include <linux/ctype.h> -#include <linux/kthread.h> -#include "tuxonice_io.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_netlink.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_ui.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_modules.h" -#include "tuxonice_builtin.h" -#include "tuxonice_power_off.h" -#include "tuxonice_alloc.h" - -unsigned long toi_bootflags_mask; - -/* - * Highmem related functions (x86 only). - */ - -#ifdef CONFIG_HIGHMEM - -/** - * copyback_high: Restore highmem pages. - * - * Highmem data and pbe lists are/can be stored in highmem. - * The format is slightly different to the lowmem pbe lists - * used for the assembly code: the last pbe in each page is - * a struct page * instead of struct pbe *, pointing to the - * next page where pbes are stored (or NULL if happens to be - * the end of the list). Since we don't want to generate - * unnecessary deltas against swsusp code, we use a cast - * instead of a union. 
- **/ - -static void copyback_high(void) -{ - struct page *pbe_page = (struct page *) restore_highmem_pblist; - struct pbe *this_pbe, *first_pbe; - unsigned long *origpage, *copypage; - int pbe_index = 1; - - if (!pbe_page) - return; - - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - - while (this_pbe) { - int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; - - origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address)); - copypage = kmap_atomic((struct page *) this_pbe->address); - - while (loop >= 0) { - *(origpage + loop) = *(copypage + loop); - loop--; - } - - kunmap_atomic(origpage); - kunmap_atomic(copypage); - - if (!this_pbe->next) - break; - - if (pbe_index < PBES_PER_PAGE) { - this_pbe++; - pbe_index++; - } else { - pbe_page = (struct page *) this_pbe->next; - kunmap_atomic(first_pbe); - if (!pbe_page) - return; - this_pbe = (struct pbe *) kmap_atomic(pbe_page); - first_pbe = this_pbe; - pbe_index = 1; - } - } - kunmap_atomic(first_pbe); -} - -#else /* CONFIG_HIGHMEM */ -static void copyback_high(void) { } -#endif - -char toi_wait_for_keypress_dev_console(int timeout) -{ - int fd, this_timeout = 255, orig_kthread = 0; - char key = '\0'; - struct termios t, t_backup; - - /* We should be guaranteed /dev/console exists after populate_rootfs() - * in init/main.c. - */ - fd = sys_open("/dev/console", O_RDONLY, 0); - if (fd < 0) { - printk(KERN_INFO "Couldn't open /dev/console.\n"); - return key; - } - - if (sys_ioctl(fd, TCGETS, (long)&t) < 0) - goto out_close; - - memcpy(&t_backup, &t, sizeof(t)); - - t.c_lflag &= ~(ISIG|ICANON|ECHO); - t.c_cc[VMIN] = 0; - -new_timeout: - if (timeout > 0) { - this_timeout = timeout < 26 ? timeout : 25; - timeout -= this_timeout; - this_timeout *= 10; - } - - t.c_cc[VTIME] = this_timeout; - - if (sys_ioctl(fd, TCSETS, (long)&t) < 0) - goto out_restore; - - if (current->flags & PF_KTHREAD) { - orig_kthread = (current->flags & PF_KTHREAD); - current->flags &= ~PF_KTHREAD; - } - - while (1) { - if (sys_read(fd, &key, 1) <= 0) { - if (timeout) - goto new_timeout; - key = '\0'; - break; - } - key = tolower(key); - if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) { - if (key == 'c') { - set_toi_state(TOI_CONTINUE_REQ); - break; - } else if (key == ' ') - break; - } else - break; - } - if (orig_kthread) { - current->flags |= PF_KTHREAD; - } - -out_restore: - sys_ioctl(fd, TCSETS, (long)&t_backup); -out_close: - sys_close(fd); - - return key; -} - -struct toi_boot_kernel_data toi_bkd __nosavedata - __attribute__((aligned(PAGE_SIZE))) = { - MY_BOOT_KERNEL_DATA_VERSION, - 0, -#ifdef CONFIG_TOI_REPLACE_SWSUSP - (1 << TOI_REPLACE_SWSUSP) | -#endif - (1 << TOI_NO_FLUSHER_THREAD) | - (1 << TOI_PAGESET2_FULL), -}; - -struct block_device *toi_open_by_devnum(dev_t dev) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); - return err ? ERR_PTR(err) : bdev; -} - -/** - * toi_close_bdev: Close a swap bdev. - * - * int: The swap entry number to close. 
- */ -void toi_close_bdev(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); -} - -int toi_wait = CONFIG_TOI_DEFAULT_WAIT; -struct toi_core_fns *toi_core_fns; -unsigned long toi_result; -struct pagedir pagedir1 = {1}; -struct toi_cbw **toi_first_cbw; -int toi_next_cbw; - -unsigned long toi_get_nonconflicting_page(void) -{ - return toi_core_fns->get_nonconflicting_page(); -} - -int toi_post_context_save(void) -{ - return toi_core_fns->post_context_save(); -} - -int try_tuxonice_hibernate(void) -{ - if (!toi_core_fns) - return -ENODEV; - - return toi_core_fns->try_hibernate(); -} - -static int num_resume_calls; -#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL -static int ignore_late_initcall = 1; -#else -static int ignore_late_initcall; -#endif - -int toi_translate_err_default = TOI_CONTINUE_REQ; - -void try_tuxonice_resume(void) -{ - if (!hibernation_available()) - return; - - /* Don't let it wrap around eventually */ - if (num_resume_calls < 2) - num_resume_calls++; - - if (num_resume_calls == 1 && ignore_late_initcall) { - printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n"); - return; - } - - if (toi_core_fns) - toi_core_fns->try_resume(); - else - printk(KERN_INFO "TuxOnIce core not loaded yet.\n"); -} - -int toi_lowlevel_builtin(void) -{ - int error = 0; - - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "Error %d hibernating\n", error); - - /* Restore control flow appears here */ - if (!toi_in_hibernate) { - copyback_high(); - set_toi_state(TOI_NOW_RESUMING); - } - - restore_processor_state(); - return error; -} - -unsigned long toi_compress_bytes_in; -unsigned long toi_compress_bytes_out; - -int toi_in_suspend(void) -{ - return in_suspend; -} - -unsigned long toi_state = ((1 << TOI_BOOT_TIME) | - (1 << TOI_IGNORE_LOGLEVEL) | - (1 << TOI_IO_STOPPED)); - -/* The number of hibernates we have started (some may have been cancelled) */ -unsigned int nr_hibernates; -int toi_running; -__nosavedata int toi_in_hibernate; -__nosavedata struct pbe *restore_highmem_pblist; - -int toi_trace_allocs; - -void toi_read_lock_tasklist(void) -{ - read_lock(&tasklist_lock); -} - -void toi_read_unlock_tasklist(void) -{ - read_unlock(&tasklist_lock); -} - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -int (*toi_flag_zram_disks) (void); - -int toi_do_flag_zram_disks(void) -{ - return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0; -} - -#endif - -/* toi_generate_free_page_map - * - * Description: This routine generates a bitmap of free pages from the - * lists used by the memory manager. We then use the bitmap - * to quickly calculate which pages to save and in which - * pagesets. 
- */ -void toi_generate_free_page_map(void) -{ - int order, cpu, t; - unsigned long flags, i; - struct zone *zone; - struct list_head *curr; - unsigned long pfn; - struct page *page; - - for_each_populated_zone(zone) { - - if (!zone->spanned_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - - for (i = 0; i < zone->spanned_pages; i++) { - pfn = zone->zone_start_pfn + i; - - if (!pfn_valid(pfn)) - continue; - - page = pfn_to_page(pfn); - - ClearPageNosaveFree(page); - } - - for_each_migratetype_order(order, t) { - list_for_each(curr, - &zone->free_area[order].free_list[t]) { - unsigned long j; - - pfn = page_to_pfn(list_entry(curr, struct page, - lru)); - for (j = 0; j < (1UL << order); j++) - SetPageNosaveFree(pfn_to_page(pfn + j)); - } - } - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - struct page *page; - int t; - - for (t = 0; t < MIGRATE_PCPTYPES; t++) - list_for_each_entry(page, &pcp->lists[t], lru) - SetPageNosaveFree(page); - } - - spin_unlock_irqrestore(&zone->lock, flags); - } -} - -/* toi_size_of_free_region - * - * Description: Return the number of pages that are free, beginning with and - * including this one. - */ -int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn) -{ - unsigned long this_pfn = start_pfn, - end_pfn = zone_end_pfn(zone); - - while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn))) - this_pfn++; - - return this_pfn - start_pfn; -} - -static int __init toi_wait_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) { - if (value < -1 || value > 255) - printk(KERN_INFO "TuxOnIce_wait outside range -1 to " - "255.\n"); - else - toi_wait = value; - } - - return 1; -} -__setup("toi_wait", toi_wait_setup); - -static int __init toi_translate_retry_setup(char *str) -{ - toi_translate_err_default = 0; - return 1; -} -__setup("toi_translate_retry", toi_translate_retry_setup); - -static int __init toi_debug_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_LOGALL); - toi_bootflags_mask |= (1 << TOI_LOGALL); - toi_bkd.toi_debug_state = 255; - toi_bkd.toi_default_console_level = 7; - return 1; -} -__setup("toi_debug_setup", toi_debug_setup); - -static int __init toi_pause_setup(char *str) -{ - toi_bkd.toi_action |= (1 << TOI_PAUSE); - toi_bootflags_mask |= (1 << TOI_PAUSE); - return 1; -} -__setup("toi_pause", toi_pause_setup); - -#ifdef CONFIG_PM_DEBUG -static int __init toi_trace_allocs_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - toi_trace_allocs = value; - - return 1; -} -__setup("toi_trace_allocs", toi_trace_allocs_setup); -#endif - -static int __init toi_ignore_late_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value)) - ignore_late_initcall = value; - - return 1; -} -__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup); - -static int __init toi_force_no_multithreaded_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO); - toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO); - - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO); - - return 1; -} -__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup); - -#ifdef CONFIG_KGDB -static int __init toi_post_resume_breakpoint_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT); - toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT); - if 
(sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT); - - return 1; -} -__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup); -#endif - -static int __init toi_disable_readahead_setup(char *str) -{ - int value; - - toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD); - toi_bootflags_mask |= (1 << TOI_NO_READAHEAD); - if (sscanf(str, "=%d", &value) && value) - toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD); - - return 1; -} -__setup("toi_no_readahead", toi_disable_readahead_setup); diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h deleted file mode 100644 index 9539818e0..000000000 --- a/kernel/power/tuxonice_builtin.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ -#include <asm/setup.h> - -extern struct toi_core_fns *toi_core_fns; -extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out; -extern unsigned int nr_hibernates; -extern int toi_in_hibernate; - -extern __nosavedata struct pbe *restore_highmem_pblist; - -int toi_lowlevel_builtin(void); - -#ifdef CONFIG_HIGHMEM -extern __nosavedata struct zone_data *toi_nosave_zone_list; -extern __nosavedata unsigned long toi_nosave_max_pfn; -#endif - -extern unsigned long toi_get_nonconflicting_page(void); -extern int toi_post_context_save(void); - -extern char toi_wait_for_keypress_dev_console(int timeout); -extern struct block_device *toi_open_by_devnum(dev_t dev); -extern void toi_close_bdev(struct block_device *bdev); -extern int toi_wait; -extern int toi_translate_err_default; -extern int toi_force_no_multithreaded; -extern void toi_read_lock_tasklist(void); -extern void toi_read_unlock_tasklist(void); -extern int toi_in_suspend(void); -extern void toi_generate_free_page_map(void); -extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn); - -#ifdef CONFIG_TOI_ZRAM_SUPPORT -extern int toi_do_flag_zram_disks(void); -#else -#define toi_do_flag_zram_disks() (0) -#endif diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c deleted file mode 100644 index 1c4e10c72..000000000 --- a/kernel/power/tuxonice_checksum.c +++ /dev/null @@ -1,392 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
- */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> -#include <linux/scatterlist.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_checksum.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" - -static struct toi_module_ops toi_checksum_ops; - -/* Constant at the mo, but I might allow tuning later */ -static char toi_checksum_name[32] = "md4"; -/* Bytes per checksum */ -#define CHECKSUM_SIZE (16) - -#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE) - -struct cpu_context { - struct crypto_hash *transform; - struct hash_desc desc; - struct scatterlist sg[2]; - char *buf; -}; - -static DEFINE_PER_CPU(struct cpu_context, contexts); -static int pages_allocated; -static unsigned long page_list; - -static int toi_num_resaved; - -static unsigned long this_checksum, next_page; -static int checksum_count; - -static inline int checksum_pages_needed(void) -{ - return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE); -} - -/* ---- Local buffer management ---- */ - -/* - * toi_checksum_cleanup - * - * Frees memory allocated for our labours. - */ -static void toi_checksum_cleanup(int ending_cycle) -{ - int cpu; - - if (ending_cycle) { - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_hash(this->transform); - this->transform = NULL; - this->desc.tfm = NULL; - } - - if (this->buf) { - toi_free_page(27, (unsigned long) this->buf); - this->buf = NULL; - } - } - } -} - -/* - * toi_crypto_initialise - * - * Prepare to do some work by allocating buffers and transforms. - * Returns: Int: Zero. Even if we can't set up checksum, we still - * seek to hibernate. - */ -static int toi_checksum_initialise(int starting_cycle) -{ - int cpu; - - if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled) - return 0; - - if (!*toi_checksum_name) { - printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - struct page *page; - - this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s checksum algorithm: %ld.\n", - toi_checksum_name, (long) this->transform); - this->transform = NULL; - return 1; - } - - this->desc.tfm = this->transform; - this->desc.flags = 0; - - page = toi_alloc_page(27, GFP_KERNEL); - if (!page) - return 1; - this->buf = page_address(page); - sg_init_one(&this->sg[0], this->buf, PAGE_SIZE); - } - return 0; -} - -/* - * toi_checksum_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_checksum_print_debug_stats(char *buffer, int size) -{ - int len; - - if (!toi_checksum_ops.enabled) - return scnprintf(buffer, size, - "- Checksumming disabled.\n"); - - len = scnprintf(buffer, size, "- Checksum method is '%s'.\n", - toi_checksum_name); - len += scnprintf(buffer + len, size - len, - " %d pages resaved in atomic copy.\n", toi_num_resaved); - return len; -} - -static int toi_checksum_memory_needed(void) -{ - return toi_checksum_ops.enabled ? 
- checksum_pages_needed() << PAGE_SHIFT : 0; -} - -static int toi_checksum_storage_needed(void) -{ - if (toi_checksum_ops.enabled) - return strlen(toi_checksum_name) + sizeof(int) + 1; - else - return 0; -} - -/* - * toi_checksum_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_checksum_save_config_info(char *buffer) -{ - int namelen = strlen(toi_checksum_name) + 1; - int total_len; - - *((unsigned int *) buffer) = namelen; - strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen); - total_len = sizeof(unsigned int) + namelen; - return total_len; -} - -/* toi_checksum_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for dechecksuming the image at - * resume time. - */ -static void toi_checksum_load_config_info(char *buffer, int size) -{ - int namelen; - - namelen = *((unsigned int *) (buffer)); - strncpy(toi_checksum_name, buffer + sizeof(unsigned int), - namelen); - return; -} - -/* - * Free Checksum Memory - */ - -void free_checksum_pages(void) -{ - while (pages_allocated) { - unsigned long next = *((unsigned long *) page_list); - ClearPageNosave(virt_to_page(page_list)); - toi_free_page(15, (unsigned long) page_list); - page_list = next; - pages_allocated--; - } -} - -/* - * Allocate Checksum Memory - */ - -int allocate_checksum_pages(void) -{ - int pages_needed = checksum_pages_needed(); - - if (!toi_checksum_ops.enabled) - return 0; - - while (pages_allocated < pages_needed) { - unsigned long *new_page = - (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP); - if (!new_page) { - printk(KERN_ERR "Unable to allocate checksum pages.\n"); - return -ENOMEM; - } - SetPageNosave(virt_to_page(new_page)); - (*new_page) = page_list; - page_list = (unsigned long) new_page; - pages_allocated++; - } - - next_page = (unsigned long) page_list; - checksum_count = 0; - - return 0; -} - -char *tuxonice_get_next_checksum(void) -{ - if (!toi_checksum_ops.enabled) - return NULL; - - if (checksum_count % CHECKSUMS_PER_PAGE) - this_checksum += CHECKSUM_SIZE; - else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - checksum_count++; - return (char *) this_checksum; -} - -int tuxonice_calc_checksum(struct page *page, char *checksum_locn) -{ - char *pa; - int result, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!toi_checksum_ops.enabled) - return 0; - - pa = kmap(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap(page); - result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - checksum_locn); - if (result) - printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest " - "returned %d.\n", result); - return result; -} -/* - * Calculate checksums - */ - -void check_checksums(void) -{ - int index = 0, cpu = smp_processor_id(); - char current_checksum[CHECKSUM_SIZE]; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - unsigned long pfn; - - if (!toi_checksum_ops.enabled) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled."); - return; - } - - next_page = (unsigned long) page_list; - - toi_num_resaved = 0; - this_checksum = 0; - - toi_trace_index++; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums."); - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != 
BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) { - int ret, resave_needed = false; - char *pa; - struct page *page = pfn_to_page(pfn); - - if (index < checksum_count) { - if (index % CHECKSUMS_PER_PAGE) { - this_checksum += CHECKSUM_SIZE; - } else { - this_checksum = next_page + sizeof(void *); - next_page = *((unsigned long *) next_page); - } - - /* Done when IRQs disabled so must be atomic */ - pa = kmap_atomic(page); - memcpy(ctx->buf, pa, PAGE_SIZE); - kunmap_atomic(pa); - ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, - current_checksum); - - if (ret) { - printk(KERN_INFO "Digest failed. Returned %d.\n", ret); - return; - } - - resave_needed = memcmp(current_checksum, (char *) this_checksum, - CHECKSUM_SIZE); - } else { - resave_needed = true; - } - - if (resave_needed) { - TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed); - SetPageResave(pfn_to_page(pfn)); - toi_num_resaved++; - if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED)) - set_abort_result(TOI_RESAVE_NEEDED); - } - - index++; - } - toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete."); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0, - NULL), - SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action, - TOI_ABORT_ON_RESAVE_NEEDED, 0) -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_checksum_ops = { - .type = MISC_MODULE, - .name = "checksumming", - .directory = "checksum", - .module = THIS_MODULE, - .initialise = toi_checksum_initialise, - .cleanup = toi_checksum_cleanup, - .print_debug_info = toi_checksum_print_debug_stats, - .save_config_info = toi_checksum_save_config_info, - .load_config_info = toi_checksum_load_config_info, - .memory_needed = toi_checksum_memory_needed, - .storage_needed = toi_checksum_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -int toi_checksum_init(void) -{ - int result = toi_register_module(&toi_checksum_ops); - return result; -} - -void toi_checksum_exit(void) -{ - toi_unregister_module(&toi_checksum_ops); -} diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h deleted file mode 100644 index c8196fbb0..000000000 --- a/kernel/power/tuxonice_checksum.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * kernel/power/tuxonice_checksum.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains data checksum routines for TuxOnIce, - * using cryptoapi. They are used to locate any modifications - * made to pageset 2 while we're saving it. 
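- *
- * (Call order, inferred from the .c file rather than stated there:
- * allocate_checksum_pages() while the image is being prepared, then
- * tuxonice_get_next_checksum() + tuxonice_calc_checksum() for each
- * pageset 2 page as it is saved, check_checksums() afterwards to mark
- * pages needing a resave, and free_checksum_pages() at cleanup.)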
- */ - -#if defined(CONFIG_TOI_CHECKSUM) -extern int toi_checksum_init(void); -extern void toi_checksum_exit(void); -void check_checksums(void); -int allocate_checksum_pages(void); -void free_checksum_pages(void); -char *tuxonice_get_next_checksum(void); -int tuxonice_calc_checksum(struct page *page, char *checksum_locn); -#else -static inline int toi_checksum_init(void) { return 0; } -static inline void toi_checksum_exit(void) { } -static inline void check_checksums(void) { }; -static inline int allocate_checksum_pages(void) { return 0; }; -static inline void free_checksum_pages(void) { }; -static inline char *tuxonice_get_next_checksum(void) { return NULL; }; -static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn) - { return 0; } -#endif - diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c deleted file mode 100644 index 2873f93c6..000000000 --- a/kernel/power/tuxonice_cluster.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines for cluster hibernation support. - * - * Based on ip autoconfiguration code in net/ipv4/ipconfig.c. - * - * How does it work? - * - * There is no 'master' node that tells everyone else what to do. All nodes - * send messages to the broadcast address/port, maintain a list of peers - * and figure out when to progress to the next step in hibernating or resuming. - * This makes us more fault tolerant when it comes to nodes coming and going - * (which may be more of an issue if we're hibernating when power supplies - * are being unreliable). - * - * At boot time, we start a ktuxonice thread that handles communication with - * other nodes. This node maintains a state machine that controls our progress - * through hibernating and resuming, keeping us in step with other nodes. Nodes - * are identified by their hw address. - * - * On startup, the node sends CLUSTER_PING on the configured interface's - * broadcast address, port $toi_cluster_port (see below) and begins to listen - * for other broadcast messages. CLUSTER_PING messages are repeated at - * intervals of 5 minutes, with a random offset to spread traffic out. - * - * A hibernation cycle is initiated from any node via - * - * echo > /sys/power/tuxonice/do_hibernate - * - * and (possibily) the hibernate script. At each step of the process, the node - * completes its work, and waits for all other nodes to signal completion of - * their work (or timeout) before progressing to the next step. - * - * Request/state Action before reply Possible reply Next state - * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP - * HIBERNATE|NACK INIT_0 - * - * PREP prepare_image PREP|ACK IMAGE_WRITE - * PREP|NACK INIT_0 - * ABORT RUNNING - * - * IO write image IO|ACK power off - * ABORT POST_RESUME - * - * (Boot time) check for image IMAGE|ACK RESUME_PREP - * (Note 1) - * IMAGE|NACK (Note 2) - * - * PREP prepare read image PREP|ACK IMAGE_READ - * PREP|NACK (As NACK_IMAGE) - * - * IO read image IO|ACK POST_RESUME - * - * POST_RESUME thaw, post-script RUNNING - * - * INIT_0 init 0 - * - * Other messages: - * - * - PING: Request for all other live nodes to send a PONG. Used at startup to - * announce presence, when a node is suspected dead and periodically, in case - * segments of the network are [un]plugged. - * - * - PONG: Response to a PING. - * - * - ABORT: Request to cancel writing an image. 
- * - * - BYE: Notification that this node is shutting down. - * - * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that - * nodes which are slower to start up can get state synchronised. If a node - * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send - * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it - * must invalidate its image (if any) and boot normally. - * - * Note 2: May occur when one node lost power or powered off while others - * hibernated. This node waits for others to complete resuming (ACK_READ) - * before completing its boot, so that it appears as a fail node restarting. - * - * If any node has an image, then it also has a list of nodes that hibernated - * in synchronisation with it. The node will wait for other nodes to appear - * or timeout before beginning its restoration. - * - * If a node has no image, it needs to wait, in case other nodes which do have - * an image are going to resume, but are taking longer to announce their - * presence. For this reason, the user can specify a timeout value and a number - * of nodes detected before we just continue. (We might want to assume in a - * cluster of, say, 15 nodes, if 8 others have booted without finding an image, - * the remaining nodes will too. This might help in situations where some nodes - * are much slower to boot, or more subject to hardware failures or such like). - */ - -#include <linux/suspend.h> -#include <linux/if.h> -#include <linux/rtnetlink.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/in.h> -#include <linux/if_arp.h> -#include <linux/kthread.h> -#include <linux/wait.h> -#include <linux/netdevice.h> -#include <net/ip.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_io.h" - -#if 1 -#define PRINTK(a, b...) do { printk(a, ##b); } while (0) -#else -#define PRINTK(a, b...) do { } while (0) -#endif - -static int loopback_mode; -static int num_local_nodes = 1; -#define MAX_LOCAL_NODES 8 -#define SADDR (loopback_mode ? 
b->sid : h->saddr) - -#define MYNAME "TuxOnIce Clustering" - -enum cluster_message { - MSG_ACK = 1, - MSG_NACK = 2, - MSG_PING = 4, - MSG_ABORT = 8, - MSG_BYE = 16, - MSG_HIBERNATE = 32, - MSG_IMAGE = 64, - MSG_IO = 128, - MSG_RUNNING = 256 -}; - -static char *str_message(int message) -{ - switch (message) { - case 4: - return "Ping"; - case 8: - return "Abort"; - case 9: - return "Abort acked"; - case 10: - return "Abort nacked"; - case 16: - return "Bye"; - case 17: - return "Bye acked"; - case 18: - return "Bye nacked"; - case 32: - return "Hibernate request"; - case 33: - return "Hibernate ack"; - case 34: - return "Hibernate nack"; - case 64: - return "Image exists?"; - case 65: - return "Image does exist"; - case 66: - return "No image here"; - case 128: - return "I/O"; - case 129: - return "I/O okay"; - case 130: - return "I/O failed"; - case 256: - return "Running"; - default: - printk(KERN_ERR "Unrecognised message %d.\n", message); - return "Unrecognised message (see dmesg)"; - } -} - -#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) -#define MSG_STATE_MASK (~MSG_ACK_MASK) - -struct node_info { - struct list_head member_list; - wait_queue_head_t member_events; - spinlock_t member_list_lock; - spinlock_t receive_lock; - int peer_count, ignored_peer_count; - struct toi_sysfs_data sysfs_data; - enum cluster_message current_message; -}; - -struct node_info node_array[MAX_LOCAL_NODES]; - -struct cluster_member { - __be32 addr; - enum cluster_message message; - struct list_head list; - int ignore; -}; - -#define toi_cluster_port_send 3501 -#define toi_cluster_port_recv 3502 - -static struct net_device *net_dev; -static struct toi_module_ops toi_cluster_ops; - -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); - -static struct packet_type toi_cluster_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = toi_recv, -}; - -struct toi_pkt { /* BOOTP packet format */ - struct iphdr iph; /* IP header */ - struct udphdr udph; /* UDP header */ - u8 htype; /* HW address type */ - u8 hlen; /* HW address length */ - __be32 xid; /* Transaction ID */ - __be16 secs; /* Seconds since we started */ - __be16 flags; /* Just what it says */ - u8 hw_addr[16]; /* Sender's HW address */ - u16 message; /* Message */ - unsigned long sid; /* Source ID for loopback testing */ -}; - -static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; - -static int added_pack; - -static int others_have_image; - -/* Key used to allow multiple clusters on the same lan */ -static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY; -static char pre_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE; -static char post_hibernate_script[255] = - CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE; - -/* List of cluster members */ -static unsigned long continue_delay = 5 * HZ; -static unsigned long cluster_message_timeout = 3 * HZ; - -/* === Membership list === */ - -static void print_member_info(int index) -{ - struct cluster_member *this; - - printk(KERN_INFO "==> Dumping node %d.\n", index); - - list_for_each_entry(this, &node_array[index].member_list, list) - printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n", - NIPQUAD(this->addr), - str_message(this->message), - this->ignore ? 
"(Ignored)" : ""); - printk(KERN_INFO "== Done ==\n"); -} - -static struct cluster_member *__find_member(int index, __be32 addr) -{ - struct cluster_member *this; - - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->addr != addr) - continue; - - return this; - } - - return NULL; -} - -static void set_ignore(int index, __be32 addr, struct cluster_member *this) -{ - if (this->ignore) { - PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", - index, NIPQUAD(addr)); - return; - } - - PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", - index, NIPQUAD(addr)); - this->ignore = 1; - node_array[index].ignored_peer_count++; -} - -static int __add_update_member(int index, __be32 addr, int message) -{ - struct cluster_member *this; - - this = __find_member(index, addr); - if (this) { - if (this->message != message) { - this->message = message; - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - PRINTK("Node %d sees node %d.%d.%d.%d now sending " - "%s.\n", index, NIPQUAD(addr), - str_message(message)); - wake_up(&node_array[index].member_events); - } - return 0; - } - - this = (struct cluster_member *) toi_kzalloc(36, - sizeof(struct cluster_member), GFP_KERNEL); - - if (!this) - return -1; - - this->addr = addr; - this->message = message; - this->ignore = 0; - INIT_LIST_HEAD(&this->list); - - node_array[index].peer_count++; - - PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index, - NIPQUAD(addr), str_message(message)); - - if ((message & MSG_NACK) && - (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) - set_ignore(index, addr, this); - list_add_tail(&this->list, &node_array[index].member_list); - return 1; -} - -static int add_update_member(int index, __be32 addr, int message) -{ - int result; - unsigned long flags; - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - result = __add_update_member(index, addr, message); - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - - print_member_info(index); - - wake_up(&node_array[index].member_events); - - return result; -} - -static void del_member(int index, __be32 addr) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - this = __find_member(index, addr); - - if (this) { - list_del_init(&this->list); - toi_kfree(36, this, sizeof(*this)); - node_array[index].peer_count--; - } - - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -/* === Message transmission === */ - -static void toi_send_if(int message, unsigned long my_id); - -/* - * Process received TOI packet. - */ -static int toi_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct toi_pkt *b; - struct iphdr *h; - int len, result, index; - unsigned long addr, message, ack; - - /* Perform verifications before taking the lock. 
*/ - if (skb->pkt_type == PACKET_OTHERHOST) - goto drop; - - if (dev != net_dev) - goto drop; - - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - return NET_RX_DROP; - - if (!pskb_may_pull(skb, - sizeof(struct iphdr) + - sizeof(struct udphdr))) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) - goto drop; - - /* Fragments are not supported */ - if (h->frag_off & htons(IP_OFFSET | IP_MF)) { - if (net_ratelimit()) - printk(KERN_ERR "TuxOnIce: Ignoring fragmented " - "cluster message.\n"); - goto drop; - } - - if (skb->len < ntohs(h->tot_len)) - goto drop; - - if (ip_fast_csum((char *) h, h->ihl)) - goto drop; - - if (b->udph.source != htons(toi_cluster_port_send) || - b->udph.dest != htons(toi_cluster_port_recv)) - goto drop; - - if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) - goto drop; - - len = ntohs(b->udph.len) - sizeof(struct udphdr); - - /* Ok the front looks good, make sure we can get at the rest. */ - if (!pskb_may_pull(skb, skb->len)) - goto drop; - - b = (struct toi_pkt *)skb_network_header(skb); - h = &b->iph; - - addr = SADDR; - PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n", - str_message(b->message), NIPQUAD(addr)); - - message = b->message & MSG_STATE_MASK; - ack = b->message & MSG_ACK_MASK; - - for (index = 0; index < num_local_nodes; index++) { - int new_message = node_array[index].current_message, - old_message = new_message; - - if (index == SADDR || !old_message) { - PRINTK("Ignoring node %d (offline or self).\n", index); - continue; - } - - /* One message at a time, please. */ - spin_lock(&node_array[index].receive_lock); - - result = add_update_member(index, SADDR, b->message); - if (result == -1) { - printk(KERN_INFO "Failed to add new cluster member " - NIPQUAD_FMT ".\n", - NIPQUAD(addr)); - goto drop_unlock; - } - - switch (b->message & MSG_STATE_MASK) { - case MSG_PING: - break; - case MSG_ABORT: - break; - case MSG_BYE: - break; - case MSG_HIBERNATE: - /* Can I hibernate? */ - new_message = MSG_HIBERNATE | - ((index & 1) ? MSG_NACK : MSG_ACK); - break; - case MSG_IMAGE: - /* Can I resume? */ - new_message = MSG_IMAGE | - ((index & 1) ? MSG_NACK : MSG_ACK); - if (new_message != old_message) - printk(KERN_ERR "Setting whether I can resume " - "to %d.\n", new_message); - break; - case MSG_IO: - new_message = MSG_IO | MSG_ACK; - break; - case MSG_RUNNING: - break; - default: - if (net_ratelimit()) - printk(KERN_ERR "Unrecognised TuxOnIce cluster" - " message %d from " NIPQUAD_FMT ".\n", - b->message, NIPQUAD(addr)); - }; - - if (old_message != new_message) { - node_array[index].current_message = new_message; - printk(KERN_INFO ">>> Sending new message for node " - "%d.\n", index); - toi_send_if(new_message, index); - } else if (!ack) { - printk(KERN_INFO ">>> Resending message for node %d.\n", - index); - toi_send_if(new_message, index); - } -drop_unlock: - spin_unlock(&node_array[index].receive_lock); - }; - -drop: - /* Throw the packet out. */ - kfree_skb(skb); - - return 0; -} - -/* - * Send cluster message to single interface. 
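- *
- * (Sketch of the wire format, not in the original: the packet built
- * below is the BOOTP-like struct toi_pkt -- an IPv4 header addressed
- * to INADDR_BROADCAST, a UDP header from toi_cluster_port_send to
- * toi_cluster_port_recv with no UDP checksum (as BOOTP permits), then
- * the sender's hardware address, the message word and the loopback
- * source id.)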
- */ -static void toi_send_if(int message, unsigned long my_id) -{ - struct sk_buff *skb; - struct toi_pkt *b; - int hh_len = LL_RESERVED_SPACE(net_dev); - struct iphdr *h; - - /* Allocate packet */ - skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL); - if (!skb) - return; - skb_reserve(skb, hh_len); - b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt)); - memset(b, 0, sizeof(struct toi_pkt)); - - /* Construct IP header */ - skb_reset_network_header(skb); - h = ip_hdr(skb); - h->version = 4; - h->ihl = 5; - h->tot_len = htons(sizeof(struct toi_pkt)); - h->frag_off = htons(IP_DF); - h->ttl = 64; - h->protocol = IPPROTO_UDP; - h->daddr = htonl(INADDR_BROADCAST); - h->check = ip_fast_csum((unsigned char *) h, h->ihl); - - /* Construct UDP header */ - b->udph.source = htons(toi_cluster_port_send); - b->udph.dest = htons(toi_cluster_port_recv); - b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr)); - /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ - - /* Construct message */ - b->message = message; - b->sid = my_id; - b->htype = net_dev->type; /* can cause undefined behavior */ - b->hlen = net_dev->addr_len; - memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len); - b->secs = htons(3); /* 3 seconds */ - - /* Chain packet down the line... */ - skb->dev = net_dev; - skb->protocol = htons(ETH_P_IP); - if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol), - net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) || - dev_queue_xmit(skb) < 0) - printk(KERN_INFO "E"); -} - -/* ========================================= */ - -/* kTOICluster */ - -static atomic_t num_cluster_threads; -static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); - -static int kTOICluster(void *data) -{ - unsigned long my_id; - - my_id = atomic_add_return(1, &num_cluster_threads) - 1; - node_array[my_id].current_message = (unsigned long) data; - - PRINTK("kTOICluster daemon %lu starting.\n", my_id); - - current->flags |= PF_NOFREEZE; - - while (node_array[my_id].current_message) { - toi_send_if(node_array[my_id].current_message, my_id); - sleep_on_timeout(&clusterd_events, - cluster_message_timeout); - PRINTK("Link state %lu is %d.\n", my_id, - node_array[my_id].current_message); - } - - toi_send_if(MSG_BYE, my_id); - atomic_dec(&num_cluster_threads); - wake_up(&clusterd_events); - - PRINTK("kTOICluster daemon %lu exiting.\n", my_id); - __set_current_state(TASK_RUNNING); - return 0; -} - -static void kill_clusterd(void) -{ - int i; - - for (i = 0; i < num_local_nodes; i++) { - if (node_array[i].current_message) { - PRINTK("Seeking to kill clusterd %d.\n", i); - node_array[i].current_message = 0; - } - } - wait_event(clusterd_events, - !atomic_read(&num_cluster_threads)); - PRINTK("All cluster daemons have exited.\n"); -} - -static int peers_not_in_message(int index, int message, int precise) -{ - struct cluster_member *this; - unsigned long flags; - int result = 0; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) { - if (this->ignore) - continue; - - PRINTK("Peer %d.%d.%d.%d sending %s. " - "Seeking %s.\n", - NIPQUAD(this->addr), - str_message(this->message), str_message(message)); - if ((precise ? 
this->message : - this->message & MSG_STATE_MASK) != - message) - result++; - } - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); - PRINTK("%d peers in sought message.\n", result); - return result; -} - -static void reset_ignored(int index) -{ - struct cluster_member *this; - unsigned long flags; - - spin_lock_irqsave(&node_array[index].member_list_lock, flags); - list_for_each_entry(this, &node_array[index].member_list, list) - this->ignore = 0; - node_array[index].ignored_peer_count = 0; - spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); -} - -static int peers_in_message(int index, int message, int precise) -{ - return node_array[index].peer_count - - node_array[index].ignored_peer_count - - peers_not_in_message(index, message, precise); -} - -static int time_to_continue(int index, unsigned long start, int message) -{ - int first = peers_not_in_message(index, message, 0); - int second = peers_in_message(index, message, 1); - - PRINTK("First part returns %d, second returns %d.\n", first, second); - - if (!first && !second) { - PRINTK("All peers answered message %d.\n", - message); - return 1; - } - - if (time_after(jiffies, start + continue_delay)) { - PRINTK("Timeout reached.\n"); - return 1; - } - - PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, - start + continue_delay); - return 0; -} - -void toi_initiate_cluster_hibernate(void) -{ - int result; - unsigned long start; - - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - if (result) - return; - - toi_send_if(MSG_HIBERNATE, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_HIBERNATE)); - - if (test_action_state(TOI_FREEZER_TEST)) { - toi_send_if(MSG_ABORT, 0); - - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_RUNNING)); - - do_toi_step(STEP_QUIET_CLEANUP); - return; - } - - toi_send_if(MSG_IO, 0); - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - if (result) - return; - - /* This code runs at resume time too! */ - if (toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); -} - -/* toi_cluster_print_debug_stats - * - * Description: Print information to be recorded for debugging purposes into a - * buffer. - * Arguments: buffer: Pointer to a buffer into which the debug info will be - * printed. - * size: Size of the buffer. - * Returns: Number of characters written to the buffer. - */ -static int toi_cluster_print_debug_stats(char *buffer, int size) -{ - int len; - - if (strlen(toi_cluster_iface)) - len = scnprintf(buffer, size, - "- Cluster interface is '%s'.\n", - toi_cluster_iface); - else - len = scnprintf(buffer, size, - "- Cluster support is disabled.\n"); - return len; -} - -/* cluster_memory_needed - * - * Description: Tell the caller how much memory we need to operate during - * hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_cluster_memory_needed(void) -{ - return 0; -} - -static int toi_cluster_storage_needed(void) -{ - return 1 + strlen(toi_cluster_iface); -} - -/* toi_cluster_save_config_info - * - * Description: Save informaton needed when reloading the image at resume time. - * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE. - * Returns: Number of bytes used for saving our data. 
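- *
- * (Descriptive note, not in the original: the body below strcpy()s the
- * NUL-terminated interface name but returns
- * strlen(toi_cluster_iface + 1) -- the length of the string starting
- * at its second character -- where storage_needed() budgets
- * strlen() + 1, so strlen(...) + 1 appears to have been intended.)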
- */ -static int toi_cluster_save_config_info(char *buffer) -{ - strcpy(buffer, toi_cluster_iface); - return strlen(toi_cluster_iface + 1); -} - -/* toi_cluster_load_config_info - * - * Description: Reload information needed for declustering the image at - * resume time. - * Arguments: Buffer: Pointer to the start of the data. - * Size: Number of bytes that were saved. - */ -static void toi_cluster_load_config_info(char *buffer, int size) -{ - strncpy(toi_cluster_iface, buffer, size); - return; -} - -static void cluster_startup(void) -{ - int have_image = do_check_can_resume(), i; - unsigned long start = jiffies, initial_message; - struct task_struct *p; - - initial_message = MSG_IMAGE; - - have_image = 1; - - for (i = 0; i < num_local_nodes; i++) { - PRINTK("Starting ktoiclusterd %d.\n", i); - p = kthread_create(kTOICluster, (void *) initial_message, - "ktoiclusterd/%d", i); - if (IS_ERR(p)) { - printk(KERN_ERR "Failed to start ktoiclusterd.\n"); - return; - } - - wake_up_process(p); - } - - /* Wait for delay or someone else sending first message */ - wait_event(node_array[0].member_events, time_to_continue(0, start, - MSG_IMAGE)); - - others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1); - - printk(KERN_INFO "Continuing. I %shave an image. Peers with image:" - " %d.\n", have_image ? "" : "don't ", others_have_image); - - if (have_image) { - int result; - - /* Start to resume */ - printk(KERN_INFO " === Starting to resume === \n"); - node_array[0].current_message = MSG_IO; - toi_send_if(MSG_IO, 0); - - /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */ - result = 0; - - if (!result) { - /* - * Atomic restore - we'll come back in the hibernation - * path. - */ - - /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */ - result = 0; - - /* do_toi_step(STEP_QUIET_CLEANUP); */ - } - - node_array[0].current_message |= MSG_NACK; - - /* For debugging - disable for real life? */ - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_IO)); - } - - if (others_have_image) { - /* Wait for them to resume */ - printk(KERN_INFO "Waiting for other nodes to resume.\n"); - start = jiffies; - wait_event(node_array[0].member_events, - time_to_continue(0, start, MSG_RUNNING)); - if (peers_not_in_message(0, MSG_RUNNING, 0)) - printk(KERN_INFO "Timed out while waiting for other " - "nodes to resume.\n"); - } - - /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE - * as appropriate. - * - * If we don't have an image: - * - Wait until someone else says they have one, or conditions are met - * for continuing to boot (n machines or t seconds). - * - If anyone has an image, wait for them to resume before continuing - * to boot. - * - * If we have an image: - * - Wait until conditions are met before continuing to resume (n - * machines or t seconds). Send RESUME_PREP and freeze processes. - * NACK_PREP if freezing fails (shouldn't) and follow logic for - * us having no image above. On success, wait for [N]ACK_PREP from - * other machines. Read image (including atomic restore) until done. - * Wait for ACK_READ from others (should never fail). Thaw processes - * and do post-resume. (The section after the atomic restore is done - * via the code for hibernating). - */ - - node_array[0].current_message = MSG_RUNNING; -} - -/* toi_cluster_open_iface - * - * Description: Prepare to use an interface. 
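- *
- * (Summary of the steps below, not in the original: walk init_net's
- * device list under rtnl_lock() looking for toi_cluster_iface,
- * register toi_cluster_packet_type with dev_add_pack(), switch on
- * loopback mode -- eight simulated local nodes -- when the match is
- * the loopback device, then run cluster_startup().)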
- */ - -static int toi_cluster_open_iface(void) -{ - struct net_device *dev; - - rtnl_lock(); - - for_each_netdev(&init_net, dev) { - if (/* dev == &init_net.loopback_dev || */ - strcmp(dev->name, toi_cluster_iface)) - continue; - - net_dev = dev; - break; - } - - rtnl_unlock(); - - if (!net_dev) { - printk(KERN_ERR MYNAME ": Device %s not found.\n", - toi_cluster_iface); - return -ENODEV; - } - - dev_add_pack(&toi_cluster_packet_type); - added_pack = 1; - - loopback_mode = (net_dev == init_net.loopback_dev); - num_local_nodes = loopback_mode ? 8 : 1; - - PRINTK("Loopback mode is %s. Number of local nodes is %d.\n", - loopback_mode ? "on" : "off", num_local_nodes); - - cluster_startup(); - return 0; -} - -/* toi_cluster_close_iface - * - * Description: Stop using an interface. - */ - -static int toi_cluster_close_iface(void) -{ - kill_clusterd(); - if (added_pack) { - dev_remove_pack(&toi_cluster_packet_type); - added_pack = 0; - } - return 0; -} - -static void write_side_effect(void) -{ - if (toi_cluster_ops.enabled) { - toi_cluster_open_iface(); - set_toi_state(TOI_CLUSTER_MODE); - } else { - toi_cluster_close_iface(); - clear_toi_state(TOI_CLUSTER_MODE); - } -} - -static void node_write_side_effect(void) -{ -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0, - NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0, - write_side_effect), - SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL), - SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script, - 256, 0, NULL), - SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script, - 256, 0, STRING), - SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ, - 0) -}; - -/* - * Ops structure. 
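- *
- * (Not in the original: registering this FILTER_MODULE wires the
- * cluster code into the TuxOnIce module pipeline; writes to its
- * 'enabled' sysfs entry toggle TOI_CLUSTER_MODE via
- * write_side_effect() above.)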
- */ - -static struct toi_module_ops toi_cluster_ops = { - .type = FILTER_MODULE, - .name = "Cluster", - .directory = "cluster", - .module = THIS_MODULE, - .memory_needed = toi_cluster_memory_needed, - .print_debug_info = toi_cluster_print_debug_stats, - .save_config_info = toi_cluster_save_config_info, - .load_config_info = toi_cluster_load_config_info, - .storage_needed = toi_cluster_storage_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -#ifdef MODULE -#define INIT static __init -#define EXIT static __exit -#else -#define INIT -#define EXIT -#endif - -INIT int toi_cluster_init(void) -{ - int temp = toi_register_module(&toi_cluster_ops), i; - struct kobject *kobj = toi_cluster_ops.dir_kobj; - - for (i = 0; i < MAX_LOCAL_NODES; i++) { - node_array[i].current_message = 0; - INIT_LIST_HEAD(&node_array[i].member_list); - init_waitqueue_head(&node_array[i].member_events); - spin_lock_init(&node_array[i].member_list_lock); - spin_lock_init(&node_array[i].receive_lock); - - /* Set up sysfs entry */ - node_array[i].sysfs_data.attr.name = toi_kzalloc(8, - sizeof(node_array[i].sysfs_data.attr.name), - GFP_KERNEL); - sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d", - i); - node_array[i].sysfs_data.attr.mode = SYSFS_RW; - node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER; - node_array[i].sysfs_data.flags = 0; - node_array[i].sysfs_data.data.integer.variable = - (int *) &node_array[i].current_message; - node_array[i].sysfs_data.data.integer.minimum = 0; - node_array[i].sysfs_data.data.integer.maximum = INT_MAX; - node_array[i].sysfs_data.write_side_effect = - node_write_side_effect; - toi_register_sysfs_file(kobj, &node_array[i].sysfs_data); - } - - toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0); - - if (toi_cluster_ops.enabled) - toi_cluster_open_iface(); - - return temp; -} - -EXIT void toi_cluster_exit(void) -{ - int i; - toi_cluster_close_iface(); - - for (i = 0; i < MAX_LOCAL_NODES; i++) - toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj, - &node_array[i].sysfs_data); - toi_unregister_module(&toi_cluster_ops); -} - -static int __init toi_cluster_iface_setup(char *iface) -{ - toi_cluster_ops.enabled = (*iface && - strcmp(iface, "off")); - - if (toi_cluster_ops.enabled) - strncpy(toi_cluster_iface, iface, strlen(iface)); -} - -__setup("toi_cluster=", toi_cluster_iface_setup); diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h deleted file mode 100644 index 84356b304..000000000 --- a/kernel/power/tuxonice_cluster.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_cluster.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#ifdef CONFIG_TOI_CLUSTER -extern int toi_cluster_init(void); -extern void toi_cluster_exit(void); -extern void toi_initiate_cluster_hibernate(void); -#else -static inline int toi_cluster_init(void) { return 0; } -static inline void toi_cluster_exit(void) { } -static inline void toi_initiate_cluster_hibernate(void) { } -#endif - diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c deleted file mode 100644 index 84b85226d..000000000 --- a/kernel/power/tuxonice_compress.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * kernel/power/compression.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
- * - * This file contains data compression routines for TuxOnIce, - * using cryptoapi. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -static int toi_expected_compression; - -static struct toi_module_ops toi_compression_ops; -static struct toi_module_ops *next_driver; - -static char toi_compressor_name[32] = "lzo"; - -static DEFINE_MUTEX(stats_lock); - -struct cpu_context { - u8 *page_buffer; - struct crypto_comp *transform; - unsigned int len; - u8 *buffer_start; - u8 *output_buffer; -}; - -#define OUT_BUF_SIZE (2 * PAGE_SIZE) - -static DEFINE_PER_CPU(struct cpu_context, contexts); - -/* - * toi_crypto_prepare - * - * Prepare to do some work by allocating buffers and transforms. - */ -static int toi_compress_crypto_prepare(void) -{ - int cpu; - - if (!*toi_compressor_name) { - printk(KERN_INFO "TuxOnIce: Compression enabled but no " - "compressor name set.\n"); - return 1; - } - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0); - if (IS_ERR(this->transform)) { - printk(KERN_INFO "TuxOnIce: Failed to initialise the " - "%s compression transform.\n", - toi_compressor_name); - this->transform = NULL; - return 1; - } - - this->page_buffer = - (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP); - - if (!this->page_buffer) { - printk(KERN_ERR - "Failed to allocate a page buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - - this->output_buffer = - (char *) vmalloc_32(OUT_BUF_SIZE); - - if (!this->output_buffer) { - printk(KERN_ERR - "Failed to allocate a output buffer for TuxOnIce " - "compression driver.\n"); - return -ENOMEM; - } - } - - return 0; -} - -static int toi_compress_rw_cleanup(int writing) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct cpu_context *this = &per_cpu(contexts, cpu); - if (this->transform) { - crypto_free_comp(this->transform); - this->transform = NULL; - } - - if (this->page_buffer) - toi_free_page(16, (unsigned long) this->page_buffer); - - this->page_buffer = NULL; - - if (this->output_buffer) - vfree(this->output_buffer); - - this->output_buffer = NULL; - } - - return 0; -} - -/* - * toi_compress_init - */ - -static int toi_compress_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - toi_compress_bytes_in = 0; - toi_compress_bytes_out = 0; - - next_driver = toi_get_next_filter(&toi_compression_ops); - - return next_driver ? 0 : -ECHILD; -} - -/* - * toi_compress_rw_init() - */ - -static int toi_compress_rw_init(int rw, int stream_number) -{ - if (toi_compress_crypto_prepare()) { - printk(KERN_ERR "Failed to initialise compression " - "algorithm.\n"); - if (rw == READ) { - printk(KERN_INFO "Unable to read the image.\n"); - return -ENODEV; - } else { - printk(KERN_INFO "Continuing without " - "compressing the image.\n"); - toi_compression_ops.enabled = 0; - } - } - - return 0; -} - -/* - * toi_compress_write_page() - * - * Compress a page of data, buffering output and passing on filled - * pages to the next module in the pipeline. - * - * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing - * data to be compressed. - * - * Returns: 0 on success. 
Otherwise the error is that returned by later - * modules, -ECHILD if we have a broken pipeline or -EIO if - * zlib errs. - */ -static int toi_compress_write_page(unsigned long index, int buf_type, - void *buffer_page, unsigned int buf_size) -{ - int ret = 0, cpu = smp_processor_id(); - struct cpu_context *ctx = &per_cpu(contexts, cpu); - u8* output_buffer = buffer_page; - int output_len = buf_size; - int out_buf_type = buf_type; - - if (ctx->transform) { - - ctx->buffer_start = TOI_MAP(buf_type, buffer_page); - ctx->len = OUT_BUF_SIZE; - - ret = crypto_comp_compress(ctx->transform, - ctx->buffer_start, buf_size, - ctx->output_buffer, &ctx->len); - - TOI_UNMAP(buf_type, buffer_page); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d bytes", - cpu, index, ctx->len); - - if (!ret && ctx->len < buf_size) { /* some compression */ - output_buffer = ctx->output_buffer; - output_len = ctx->len; - out_buf_type = TOI_VIRT; - } - - } - - mutex_lock(&stats_lock); - - toi_compress_bytes_in += buf_size; - toi_compress_bytes_out += output_len; - - mutex_unlock(&stats_lock); - - if (!ret) - ret = next_driver->write_page(index, out_buf_type, - output_buffer, output_len); - - return ret; -} - -/* - * toi_compress_read_page() - * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. - * - * Retrieve data from later modules and decompress it until the input buffer - * is filled. - * Zero if successful. Error condition from me or from downstream on failure. - */ -static int toi_compress_read_page(unsigned long *index, int buf_type, - void *buffer_page, unsigned int *buf_size) -{ - int ret, cpu = smp_processor_id(); - unsigned int len; - unsigned int outlen = PAGE_SIZE; - char *buffer_start; - struct cpu_context *ctx = &per_cpu(contexts, cpu); - - if (!ctx->transform) - return next_driver->read_page(index, TOI_PAGE, buffer_page, - buf_size); - - /* - * All our reads must be synchronous - we can't decompress - * data that hasn't been read yet. - */ - - ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len); - - buffer_start = kmap(buffer_page); - - /* Error or uncompressed data */ - if (ret || len == PAGE_SIZE) { - memcpy(buffer_start, ctx->page_buffer, len); - goto out; - } - - ret = crypto_comp_decompress( - ctx->transform, - ctx->page_buffer, - len, buffer_start, &outlen); - - toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, - "CPU %d, index %lu: %d=>%d (%d).", - cpu, *index, len, outlen, ret); - - if (ret) - abort_hibernate(TOI_FAILED_IO, - "Compress_read returned %d.\n", ret); - else if (outlen != PAGE_SIZE) { - abort_hibernate(TOI_FAILED_IO, - "Decompression yielded %d bytes instead of %ld.\n", - outlen, PAGE_SIZE); - printk(KERN_ERR "Decompression yielded %d bytes instead of " - "%ld.\n", outlen, PAGE_SIZE); - ret = -EIO; - *buf_size = outlen; - } -out: - TOI_UNMAP(buf_type, buffer_page); - return ret; -} - -/* - * toi_compress_print_debug_stats - * @buffer: Pointer to a buffer into which the debug info will be printed. - * @size: Size of the buffer. - * - * Print information to be recorded for debugging purposes into a buffer. - * Returns: Number of characters written to the buffer. - */ - -static int toi_compress_print_debug_stats(char *buffer, int size) -{ - unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT, - pages_out = toi_compress_bytes_out >> PAGE_SHIFT; - int len; - - /* Output the compression ratio achieved. 
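- * (Worked example, not in the original: with 1000 pages in and 400
- * pages out, the line below prints (1000 - 400) * 100 / 1000 = 60
- * percent compression.  The byte counters are reduced to pages with
- * >> PAGE_SHIFT first, so the printed ratio is approximate.)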
*/ - if (*toi_compressor_name) - len = scnprintf(buffer, size, "- Compressor is '%s'.\n", - toi_compressor_name); - else - len = scnprintf(buffer, size, "- Compressor is not set.\n"); - - if (pages_in) - len += scnprintf(buffer+len, size - len, " Compressed " - "%lu bytes into %lu (%ld percent compression).\n", - toi_compress_bytes_in, - toi_compress_bytes_out, - (pages_in - pages_out) * 100 / pages_in); - return len; -} - -/* - * toi_compress_compression_memory_needed - * - * Tell the caller how much memory we need to operate during hibernate/resume. - * Returns: Unsigned long. Maximum number of bytes of memory required for - * operation. - */ -static int toi_compress_memory_needed(void) -{ - return 2 * PAGE_SIZE; -} - -static int toi_compress_storage_needed(void) -{ - return 2 * sizeof(unsigned long) + 2 * sizeof(int) + - strlen(toi_compressor_name) + 1; -} - -/* - * toi_compress_save_config_info - * @buffer: Pointer to a buffer of size PAGE_SIZE. - * - * Save informaton needed when reloading the image at resume time. - * Returns: Number of bytes used for saving our data. - */ -static int toi_compress_save_config_info(char *buffer) -{ - int len = strlen(toi_compressor_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_compress_bytes_in; - offset += sizeof(unsigned long); - *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = toi_expected_compression; - offset += sizeof(int); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_compressor_name, len); - return offset + len; -} - -/* toi_compress_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for decompressing the image at - * resume time. - */ -static void toi_compress_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_compress_bytes_in = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - toi_compress_bytes_out = *((unsigned long *) (buffer + offset)); - offset += sizeof(unsigned long); - toi_expected_compression = *((int *) (buffer + offset)); - offset += sizeof(int); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_compressor_name, buffer + offset, len); -} - -static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->compress_bytes_in = toi_compress_bytes_in; - bkd->compress_bytes_out = toi_compress_bytes_out; -} - -static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_compress_bytes_in = bkd->compress_bytes_in; - toi_compress_bytes_out = bkd->compress_bytes_out; -} - -/* - * toi_expected_compression_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 if the module is disabled. Otherwise the value set by the - * user via our sysfs entry. - */ - -static int toi_compress_expected_ratio(void) -{ - if (!toi_compression_ops.enabled) - return 100; - else - return 100 - toi_expected_compression; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression, - 0, 99, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL), -}; - -/* - * Ops structure. 
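- *
- * (Not in the original: as a FILTER_MODULE, write_page above
- * compresses and hands pages on to next_driver in the pipeline, while
- * read_page must stay synchronous -- data cannot be decompressed
- * before it has been read.)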
- */ -static struct toi_module_ops toi_compression_ops = { - .type = FILTER_MODULE, - .name = "compression", - .directory = "compression", - .module = THIS_MODULE, - .initialise = toi_compress_init, - .memory_needed = toi_compress_memory_needed, - .print_debug_info = toi_compress_print_debug_stats, - .save_config_info = toi_compress_save_config_info, - .load_config_info = toi_compress_load_config_info, - .storage_needed = toi_compress_storage_needed, - .expected_compression = toi_compress_expected_ratio, - - .pre_atomic_restore = toi_compress_pre_atomic_restore, - .post_atomic_restore = toi_compress_post_atomic_restore, - - .rw_init = toi_compress_rw_init, - .rw_cleanup = toi_compress_rw_cleanup, - - .write_page = toi_compress_write_page, - .read_page = toi_compress_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_compress_load(void) -{ - return toi_register_module(&toi_compression_ops); -} - -late_initcall(toi_compress_load); diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c deleted file mode 100644 index eb627915e..000000000 --- a/kernel/power/tuxonice_copy_before_write.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * kernel/power/tuxonice_copy_before_write.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines (apart from the fault handling code) to deal with allocating memory - * for copying pages before they are modified, restoring the contents and getting - * the contents written to disk. - */ - -#include <linux/percpu-defs.h> -#include <linux/sched.h> -#include <linux/tuxonice.h> -#include "tuxonice_alloc.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice.h" - -DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states); -#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw)) -#define toi_cbw_pool_size 100 - -static void _toi_free_cbw_data(struct toi_cbw_state *state) -{ - struct toi_cbw *page_ptr, *ptr, *next; - - page_ptr = ptr = state->first; - - while(ptr) { - next = ptr->next; - - if (ptr->virt) { - toi__free_page(40, virt_to_page(ptr->virt)); - } - if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) { - /* Must be on a new page - free the previous one. 
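- * (Not in the original: toi_cbw structs are packed CBWS_PER_PAGE to a
- * page-aligned page, so page_ptr tracks the first entry of the page
- * being walked; when ptr's PAGE_MASKed address no longer matches,
- * the finished page is freed and page_ptr advances to the new one.)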
*/ - toi__free_page(40, virt_to_page(page_ptr)); - page_ptr = ptr; - } - ptr = next; - } - - if (page_ptr) { - toi__free_page(40, virt_to_page(page_ptr)); - } - - state->first = state->next = state->last = NULL; - state->size = 0; -} - -void toi_free_cbw_data(void) -{ - int i; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - if (!state->first) - continue; - - state->enabled = 0; - - while (state->active) { - schedule(); - } - - _toi_free_cbw_data(state); - } -} - -static int _toi_allocate_cbw_data(struct toi_cbw_state *state) -{ - while(state->size < toi_cbw_pool_size) { - int i; - struct toi_cbw *ptr; - - ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL); - - if (!ptr) { - return -ENOMEM; - } - - if (!state->first) { - state->first = state->next = state->last = ptr; - } - - for (i = 0; i < CBWS_PER_PAGE; i++) { - struct toi_cbw *cbw = &ptr[i]; - - cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL); - if (!cbw->virt) { - state->size += i; - printk("Out of memory allocating CBW pages.\n"); - return -ENOMEM; - } - - if (cbw == state->first) - continue; - - state->last->next = cbw; - state->last = cbw; - } - - state->size += CBWS_PER_PAGE; - } - - state->enabled = 1; - - return 0; -} - - -int toi_allocate_cbw_data(void) -{ - int i, result; - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - - result = _toi_allocate_cbw_data(state); - - if (result) - return result; - } - - return 0; -} - -void toi_cbw_restore(void) -{ - if (!toi_keeping_image) - return; - -} - -void toi_cbw_write(void) -{ - if (!toi_keeping_image) - return; - -} - -/** - * toi_cbw_test_read - Test copy before write on one page - * - * Allocate copy before write buffers, then make one page only copy-before-write - * and attempt to write to it. We should then be able to retrieve the original - * version from the cbw buffer and the modified version from the page itself. - */ -static int toi_cbw_test_read(const char *buffer, int count) -{ - unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL); - char *original = "Original contents"; - char *modified = "Modified material"; - struct page *page = virt_to_page(virt); - int i, len = 0, found = 0, pfn = page_to_pfn(page); - - if (!page) { - printk("toi_cbw_test_read: Unable to allocate a page for testing.\n"); - return -ENOMEM; - } - - memcpy((char *) virt, original, strlen(original)); - - if (toi_allocate_cbw_data()) { - printk("toi_cbw_test_read: Unable to allocate cbw data.\n"); - return -ENOMEM; - } - - toi_reset_dirtiness_one(pfn, 0); - - SetPageTOI_CBW(page); - - memcpy((char *) virt, modified, strlen(modified)); - - if (strncmp((char *) virt, modified, strlen(modified))) { - len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n"); - } - - for_each_online_cpu(i) { - struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); - struct toi_cbw *ptr = state->first, *last_ptr = ptr; - - if (!found) { - while (ptr) { - if (ptr->pfn == pfn) { - found = 1; - if (strncmp(ptr->virt, original, strlen(original))) { - len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n"); - } else { - len += sprintf((char *) buffer + len, "Test passed. 
Buffer changed and original contents preserved.\n"); - } - break; - } - - last_ptr = ptr; - ptr = ptr->next; - } - } - - if (!last_ptr) - len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i); - } - - if (!found) - len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n"); - - toi_free_cbw_data(); - - return len; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read, - NULL, SYSFS_NEEDS_SM_FOR_READ, NULL), -}; - -static struct toi_module_ops toi_cbw_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "copy_before_write debugging", - .directory = "cbw", - .module = THIS_MODULE, - .early = 1, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_cbw_init(void) -{ - int result = toi_register_module(&toi_cbw_ops); - return result; -} diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c deleted file mode 100644 index 522c836ad..000000000 --- a/kernel/power/tuxonice_extent.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * kernel/power/tuxonice_extent.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * These functions encapsulate the manipulation of storage metadata. - */ - -#include <linux/suspend.h> -#include "tuxonice_modules.h" -#include "tuxonice_extent.h" -#include "tuxonice_alloc.h" -#include "tuxonice_ui.h" -#include "tuxonice.h" - -/** - * toi_get_extent - return a free extent - * - * May fail, returning NULL instead. - **/ -static struct hibernate_extent *toi_get_extent(void) -{ - return (struct hibernate_extent *) toi_kzalloc(2, - sizeof(struct hibernate_extent), TOI_ATOMIC_GFP); -} - -/** - * toi_put_extent_chain - free a chain of extents starting from value 'from' - * @chain: Chain to free. - * - * Note that 'from' is an extent value, and may be part way through an extent. - * In this case, the extent should be truncated (if necessary) and following - * extents freed. - **/ -void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from) -{ - struct hibernate_extent *this; - - this = chain->first; - - while (this) { - struct hibernate_extent *next = this->next; - - // Delete the whole extent? - if (this->start >= from) { - chain->size -= (this->end - this->start + 1); - if (chain->first == this) - chain->first = next; - if (chain->last_touched == this) - chain->last_touched = NULL; - if (chain->current_extent == this) - chain->current_extent = NULL; - toi_kfree(2, this, sizeof(*this)); - chain->num_extents--; - } else if (this->end >= from) { - // Delete part of the extent - chain->size -= (this->end - from + 1); - this->start = from; - } - this = next; - } -} - -/** - * toi_put_extent_chain - free a whole chain of extents - * @chain: Chain to free. - **/ -void toi_put_extent_chain(struct hibernate_extent_chain *chain) -{ - toi_put_extent_chain_from(chain, 0); -} - -/** - * toi_add_to_extent_chain - add an extent to an existing chain - * @chain: Chain to which the extend should be added - * @start: Start of the extent (first physical block) - * @end: End of the extent (last physical block) - * - * The chain information is updated if the insertion is successful. 
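- *
- * (Illustrative, not in the original: adding 200-299 to a chain whose
- * last extent is 100-199 takes the cur_ext->end == start - 1 path,
- * extending it to 100-299; if an extent 300-399 follows, the merge
- * below collapses the pair into 100-399, freeing the redundant
- * hibernate_extent and decrementing num_extents.)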
- **/ -int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, - unsigned long start, unsigned long end) -{ - struct hibernate_extent *new_ext = NULL, *cur_ext = NULL; - - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Adding extent %lu-%lu to chain %p.\n", start, end, chain); - - /* Find the right place in the chain */ - if (chain->last_touched && chain->last_touched->start < start) - cur_ext = chain->last_touched; - else if (chain->first && chain->first->start < start) - cur_ext = chain->first; - - if (cur_ext) { - while (cur_ext->next && cur_ext->next->start < start) - cur_ext = cur_ext->next; - - if (cur_ext->end == (start - 1)) { - struct hibernate_extent *next_ext = cur_ext->next; - cur_ext->end = end; - - /* Merge with the following one? */ - if (next_ext && cur_ext->end + 1 == next_ext->start) { - cur_ext->end = next_ext->end; - cur_ext->next = next_ext->next; - toi_kfree(2, next_ext, sizeof(*next_ext)); - chain->num_extents--; - } - - chain->last_touched = cur_ext; - chain->size += (end - start + 1); - - return 0; - } - } - - new_ext = toi_get_extent(); - if (!new_ext) { - printk(KERN_INFO "Error unable to append a new extent to the " - "chain.\n"); - return -ENOMEM; - } - - chain->num_extents++; - chain->size += (end - start + 1); - new_ext->start = start; - new_ext->end = end; - - chain->last_touched = new_ext; - - if (cur_ext) { - new_ext->next = cur_ext->next; - cur_ext->next = new_ext; - } else { - if (chain->first) - new_ext->next = chain->first; - chain->first = new_ext; - } - - return 0; -} diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h deleted file mode 100644 index aeccf1f5e..000000000 --- a/kernel/power/tuxonice_extent.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_extent.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations related to extents. Extents are - * TuxOnIce's method of storing some of the metadata for the image. - * See tuxonice_extent.c for more info. - * - */ - -#include "tuxonice_modules.h" - -#ifndef EXTENT_H -#define EXTENT_H - -struct hibernate_extent { - unsigned long start, end; - struct hibernate_extent *next; -}; - -struct hibernate_extent_chain { - unsigned long size; /* size of the chain ie sum (max-min+1) */ - int num_extents; - struct hibernate_extent *first, *last_touched; - struct hibernate_extent *current_extent; - unsigned long current_offset; -}; - -/* Simplify iterating through all the values in an extent chain */ -#define toi_extent_for_each(extent_chain, extentpointer, value) \ -if ((extent_chain)->first) \ - for ((extentpointer) = (extent_chain)->first, (value) = \ - (extentpointer)->start; \ - ((extentpointer) && ((extentpointer)->next || (value) <= \ - (extentpointer)->end)); \ - (((value) == (extentpointer)->end) ? \ - ((extentpointer) = (extentpointer)->next, (value) = \ - ((extentpointer) ? (extentpointer)->start : 0)) : \ - (value)++)) - -extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from); -#endif diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c deleted file mode 100644 index baf191211..000000000 --- a/kernel/power/tuxonice_file.c +++ /dev/null @@ -1,484 +0,0 @@ -/* - * kernel/power/tuxonice_file.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of a simple file as a - * backing store. 
It is based upon the swapallocator, and shares the - * same basic working. Here, though, we have nothing to do with - * swapspace, and only one device to worry about. - * - * The user can just - * - * echo TuxOnIce > /path/to/my_file - * - * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file - * - * and - * - * echo /path/to/my_file > /sys/power/tuxonice/file/target - * - * then put what they find in /sys/power/tuxonice/resume - * as their resume= parameter in lilo.conf (and rerun lilo if using it). - * - * Having done this, they're ready to hibernate and resume. - * - * TODO: - * - File resizing. - */ - -#include <linux/blkdev.h> -#include <linux/mount.h> -#include <linux/fs.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" -#include "tuxonice_io.h" - -#define target_is_normal_file() (S_ISREG(target_inode->i_mode)) - -static struct toi_module_ops toi_fileops; - -static struct file *target_file; -static struct block_device *toi_file_target_bdev; -static unsigned long pages_available, pages_allocated; -static char toi_file_target[256]; -static struct inode *target_inode; -static int file_target_priority; -static int used_devt; -static int target_claim; -static dev_t toi_file_dev_t; -static int sig_page_index; - -/* For test_toi_file_target */ -static struct toi_bdev_info *file_chain; - -static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num) -{ - int j; - sector_t last = 0; - - for (j = 0; j < dev_info->blocks_per_page; j++) { - sector_t this = bmap(target_inode, - page_num * dev_info->blocks_per_page + j); - - if (!this || (last && (last + 1) != this)) - break; - - last = this; - } - - return j == dev_info->blocks_per_page; -} - -static unsigned long get_usable_pages(struct toi_bdev_info *dev_info) -{ - unsigned long result = 0; - struct block_device *bdev = dev_info->bdev; - int i; - - switch (target_inode->i_mode & S_IFMT) { - case S_IFSOCK: - case S_IFCHR: - case S_IFIFO: /* Socket, Char, Fifo */ - return -1; - case S_IFREG: /* Regular file: current size - holes + free - space on part */ - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) { - if (has_contiguous_blocks(dev_info, i)) - result++; - } - break; - case S_IFBLK: /* Block device */ - if (!bdev->bd_disk) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "bdev->bd_disk null."); - return 0; - } - - result = (bdev->bd_part ? 
- bdev->bd_part->nr_sects : - get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9); - } - - - return result; -} - -static int toi_file_register_storage(void) -{ - struct toi_bdev_info *devinfo; - int result = 0; - struct fs_info *fs_info; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage."); - if (!strlen(toi_file_target)) { - toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: " - "No target filename set."); - return 0; - } - - target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0); - toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.", - toi_file_target, target_file); - - if (IS_ERR(target_file) || !target_file) { - target_file = NULL; - toi_file_dev_t = name_to_dev_t(toi_file_target); - if (!toi_file_dev_t) { - struct kstat stat; - int error = vfs_stat(toi_file_target, &stat); - printk(KERN_INFO "Open file %s returned %p and " - "name_to_devt failed.\n", - toi_file_target, target_file); - if (error) { - printk(KERN_INFO "Stating the file also failed." - " Nothing more we can do.\n"); - return 0; - } else - toi_file_dev_t = stat.rdev; - } - - toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t); - if (IS_ERR(toi_file_target_bdev)) { - printk(KERN_INFO "Got a dev_num (%lx) but failed to " - "open it.\n", - (unsigned long) toi_file_dev_t); - toi_file_target_bdev = NULL; - return 0; - } - used_devt = 1; - target_inode = toi_file_target_bdev->bd_inode; - } else - target_inode = target_file->f_mapping->host; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target."); - if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) || - S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) { - printk(KERN_INFO "File support works with regular files," - " character files and block devices.\n"); - /* Cleanup routine will undo the above */ - return 0; - } - - if (!used_devt) { - if (S_ISBLK(target_inode->i_mode)) { - toi_file_target_bdev = I_BDEV(target_inode); - if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE | - FMODE_READ, NULL)) - target_claim = 1; - } else - toi_file_target_bdev = target_inode->i_sb->s_bdev; - if (!toi_file_target_bdev) { - printk(KERN_INFO "%s is not a valid file allocator " - "target.\n", toi_file_target); - return 0; - } - toi_file_dev_t = toi_file_target_bdev->bd_dev; - } - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n"); - return -ENOMEM; - } - - devinfo->bdev = toi_file_target_bdev; - devinfo->allocator = &toi_fileops; - devinfo->allocator_index = 0; - - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - /* Unlike swap code, only complain if fs_info_from_block_dev returned - * -ENOMEM. The 'file' might be a full partition, so might validly not - * have an identifiable type, UUID etc. - */ - if (result) - printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n", - result); - devinfo->dev_t = toi_file_dev_t; - devinfo->prio = file_target_priority; - devinfo->bmap_shift = target_inode->i_blkbits - 9; - devinfo->blocks_per_page = - (1 << (PAGE_SHIFT - target_inode->i_blkbits)); - sprintf(devinfo->name, "file %s", toi_file_target); - file_chain = devinfo; - toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap " - "shift is %d. 
Blocks per page %d.", - devinfo->dev_t, devinfo->prio, devinfo->bmap_shift, - devinfo->blocks_per_page); - - /* Keep one aside for the signature */ - pages_available = get_usable_pages(devinfo) - 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu " - "pages.", pages_available); - - toi_bio_ops.register_storage(devinfo); - return 0; -} - -static unsigned long toi_file_storage_available(void) -{ - return pages_available; -} - -static int toi_file_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long available = pages_available - pages_allocated; - unsigned long to_add = min(available, request); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated " - "is %lu. Allocating %lu pages from file.", - pages_available, pages_allocated, to_add); - pages_allocated += to_add; - - return to_add; -} - -/** - * __populate_block_list - add an extent to the chain - * @min: Start of the extent (first physical block = sector) - * @max: End of the extent (last physical block = sector) - * - * If TOI_TEST_BIO is set, print a debug message, outputting the min and max - * fs block numbers. - **/ -static int __populate_block_list(struct toi_bdev_info *chain, int min, int max) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.", - min << chain->bmap_shift, - ((max + 1) << chain->bmap_shift) - 1); - - return toi_add_to_extent_chain(&chain->blocks, min, max); -} - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0; - unsigned long pages_mapped = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks."); - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - if (!target_is_normal_file()) { - result = (pages_available > 0) ? - __populate_block_list(chain, chain->blocks_per_page, - (pages_allocated + 1) * - chain->blocks_per_page - 1) : 0; - return result; - } - - /* - * FIXME: We are assuming the first page is contiguous. Is that - * assumption always right? - */ - - for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) { - sector_t new_sector; - - if (!has_contiguous_blocks(chain, i)) - continue; - - if (!have_sig_page) { - have_sig_page = 1; - sig_page_index = i; - continue; - } - - pages_mapped++; - - /* Ignore first page - it has the header */ - if (pages_mapped == 1) - continue; - - new_sector = bmap(target_inode, (i * chain->blocks_per_page)); - - /* - * I'd love to be able to fill in holes and resize - * files, but not yet... 
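What the loop above is doing is run-length encoding the file's logical-to-physical block map into extents. A self-contained model of the same accumulation, with a plain array standing in for bmap() (all values illustrative):

#include <stdio.h>

/* Model: physical sector for each logical page, 0 = hole (as with bmap). */
static long pmap[] = { 100, 108, 116, 300, 308, 0, 500 };

int main(void)
{
	long extent_min = -1, extent_max = -1;
	const int blocks_per_page = 8;	/* 4 KiB page / 512 B sector */

	for (unsigned i = 0; i < sizeof(pmap) / sizeof(pmap[0]); i++) {
		long s = pmap[i];
		if (!s)
			continue;		/* skip holes */
		if (s == extent_max + 1) {	/* contiguous with current run */
			extent_max += blocks_per_page;
		} else {
			if (extent_min > -1)
				printf("extent %ld-%ld\n", extent_min, extent_max);
			extent_min = s;
			extent_max = s + blocks_per_page - 1;
		}
	}
	if (extent_min > -1)
		printf("extent %ld-%ld\n", extent_min, extent_max);
	return 0;
}

With the map above it prints extents 100-123, 300-315 and 500-507; holes (a zero from bmap) simply end up outside every extent.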
- */ - - if (new_sector == extent_max + 1) - extent_max += chain->blocks_per_page; - else { - if (extent_min > -1) { - result = __populate_block_list(chain, - extent_min, extent_max); - if (result) - return result; - } - - extent_min = new_sector; - extent_max = extent_min + - chain->blocks_per_page - 1; - } - - if (pages_mapped == pages_allocated) - break; - } - - if (extent_min > -1) { - result = __populate_block_list(chain, extent_min, extent_max); - if (result) - return result; - } - - return 0; -} - -static void toi_file_free_storage(struct toi_bdev_info *chain) -{ - pages_allocated = 0; - file_chain = NULL; -} - -/** - * toi_file_print_debug_stats - print debug info - * @buffer: Buffer to populate - * @size: Size of the buffer - **/ -static int toi_file_print_debug_stats(char *buffer, int size) -{ - int len = scnprintf(buffer, size, "- File Allocator active.\n"); - - len += scnprintf(buffer+len, size-len, " Storage available for " - "image: %lu pages.\n", pages_available); - - return len; -} - -static void toi_file_cleanup(int finishing_cycle) -{ - if (toi_file_target_bdev) { - if (target_claim) { - blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ); - target_claim = 0; - } - - if (used_devt) { - blkdev_put(toi_file_target_bdev, - FMODE_READ | FMODE_NDELAY); - used_devt = 0; - } - toi_file_target_bdev = NULL; - target_inode = NULL; - } - - if (target_file) { - filp_close(target_file, NULL); - target_file = NULL; - } - - pages_available = 0; -} - -/** - * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target - * - * Test whether the target file is valid for hibernating. - **/ -static void test_toi_file_target(void) -{ - int result = toi_file_register_storage(); - sector_t sector; - char buf[50]; - struct fs_info *fs_info; - - if (result || !file_chain) - return; - - /* This doesn't mean we're in business. Is any storage available? 
*/ - if (!pages_available) - goto out; - - toi_file_allocate_storage(file_chain, 1); - result = get_main_pool_phys_params(file_chain); - if (result) - goto out; - - - sector = bmap(target_inode, sig_page_index * - file_chain->blocks_per_page) << file_chain->bmap_shift; - - /* Use the uuid, or the dev_t if that fails */ - fs_info = fs_info_from_block_dev(toi_file_target_bdev); - if (!fs_info || IS_ERR(fs_info)) { - bdevname(toi_file_target_bdev, buf); - sprintf(resume_file, "/dev/%s:%llu", buf, - (unsigned long long) sector); - } else { - int i; - hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0); - - /* Remove the spaces */ - for (i = 1; i < 16; i++) { - buf[2 * i] = buf[3 * i]; - buf[2 * i + 1] = buf[3 * i + 1]; - } - buf[32] = 0; - sprintf(resume_file, "UUID=%s:0x%llx", buf, - (unsigned long long) sector); - free_fs_info(fs_info); - } - - toi_attempt_to_parse_resume_device(0); -out: - toi_file_free_storage(file_chain); - toi_bio_ops.free_storage(); -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256, - SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target), - SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL), - SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095, - 4096, 0, NULL), -}; - -static struct toi_bio_allocator_ops toi_bio_fileops = { - .register_storage = toi_file_register_storage, - .storage_available = toi_file_storage_available, - .allocate_storage = toi_file_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_file_free_storage, -}; - -static struct toi_module_ops toi_fileops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "file storage", - .directory = "file", - .module = THIS_MODULE, - .print_debug_info = toi_file_print_debug_stats, - .cleanup = toi_file_cleanup, - .bio_allocator_ops = &toi_bio_fileops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_file_load(void) -{ - return toi_register_module(&toi_fileops); -} - -late_initcall(toi_file_load); diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c deleted file mode 100644 index 16cf14cbc..000000000 --- a/kernel/power/tuxonice_highlevel.c +++ /dev/null @@ -1,1413 +0,0 @@ -/* - * kernel/power/tuxonice_highlevel.c - */ -/** \mainpage TuxOnIce. - * - * TuxOnIce provides support for saving and restoring an image of - * system memory to an arbitrary storage device, either on the local computer, - * or across some network. The support is entirely OS based, so TuxOnIce - * works without requiring BIOS, APM or ACPI support. The vast majority of the - * code is also architecture independant, so it should be very easy to port - * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem - * and preemption. Initramfses and initrds are also supported. - * - * TuxOnIce uses a modular design, in which the method of storing the image is - * completely abstracted from the core code, as are transformations on the data - * such as compression and/or encryption (multiple 'modules' can be used to - * provide arbitrary combinations of functionality). The user interface is also - * modular, so that arbitrarily simple or complex interfaces can be used to - * provide anything from debugging information through to eye candy. - * - * \section Copyright - * - * TuxOnIce is released under the GPLv2. 
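Stepping back to test_toi_file_target above: the space-stripping loop there depends on hex_dump_to_buffer having produced one "xx " triple per byte. The compaction can be checked in isolation; a self-contained sketch (the spaced form is built by hand here rather than by the kernel helper):

#include <stdio.h>

int main(void)
{
	/* 16 bytes of UUID as 32 hex digits (illustrative input). */
	char hex[] = "00112233445566778899aabbccddeeff";
	char spaced[64];
	int i;

	/* Build the "xx xx xx ..." spaced form first... */
	for (i = 0; i < 16; i++)
		sprintf(spaced + 3 * i, "%.2s ", hex + 2 * i);

	/* ...then compact it exactly as the loop above does: each pair
	 * moves from offset 3*i to offset 2*i. */
	for (i = 1; i < 16; i++) {
		spaced[2 * i] = spaced[3 * i];
		spaced[2 * i + 1] = spaced[3 * i + 1];
	}
	spaced[32] = 0;
	printf("UUID=%s\n", spaced);	/* 32 hex digits, no spaces */
	return 0;
}

The result is the 32-digit string used to build the UUID=...:offset form of the resume parameter.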
- * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)<BR> - * - * \section Credits - * - * Nigel would like to thank the following people for their work: - * - * Bernard Blackham <bernard@blackham.com.au><BR> - * Web page & Wiki administration, some coding. A person without whom - * TuxOnIce would not be where it is. - * - * Michael Frank <mhf@linuxmail.org><BR> - * Extensive testing and help with improving stability. I was constantly - * amazed by the quality and quantity of Michael's help. - * - * Pavel Machek <pavel@ucw.cz><BR> - * Modifications, defectiveness pointing, being with Gabor at the very - * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and - * 2.5.17. Even though Pavel and I disagree on the direction suspend to - * disk should take, I appreciate the valuable work he did in helping Gabor - * get the concept working. - * - * ..and of course the myriads of TuxOnIce users who have helped diagnose - * and fix bugs, made suggestions on how to improve the code, proofread - * documentation, and donated time and money. - * - * Thanks also to corporate sponsors: - * - * <B>Redhat.</B>Sometime employer from May 2006 (my fault, not Redhat's!). - * - * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who - * allowed him to work on TuxOnIce and PM related issues on company time. - * - * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct - * 2003 to Jan 2004. - * - * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing - * maintenance of SMP and Highmem support. - * - * <B>OSDL.</B> Provided access to various hardware configurations, make - * occasional small donations to the project. - */ - -#include <linux/suspend.h> -#include <linux/module.h> -#include <linux/freezer.h> -#include <generated/utsrelease.h> -#include <linux/cpu.h> -#include <linux/console.h> -#include <linux/writeback.h> -#include <linux/uaccess.h> /* for get/set_fs & KERNEL_DS on i386 */ -#include <linux/bio.h> -#include <linux/kgdb.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_storage.h" -#include "tuxonice_checksum.h" -#include "tuxonice_builtin.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_alloc.h" -#include "tuxonice_cluster.h" - -/*! Pageset metadata. 
*/ -struct pagedir pagedir2 = {2}; - -static mm_segment_t oldfs; -static DEFINE_MUTEX(tuxonice_in_use); -static int block_dump_save; - -int toi_trace_index; - -/* Binary signature if an image is present */ -char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c"; - -unsigned long boot_kernel_data_buffer; - -static char *result_strings[] = { - "Hibernation was aborted", - "The user requested that we cancel the hibernation", - "No storage was available", - "Insufficient storage was available", - "Freezing filesystems and/or tasks failed", - "A pre-existing image was used", - "We would free memory, but image size limit doesn't allow this", - "Unable to free enough memory to hibernate", - "Unable to obtain the Power Management Semaphore", - "A device suspend/resume returned an error", - "A system device suspend/resume returned an error", - "The extra pages allowance is too small", - "We were unable to successfully prepare an image", - "TuxOnIce module initialisation failed", - "TuxOnIce module cleanup failed", - "I/O errors were encountered", - "Ran out of memory", - "An error was encountered while reading the image", - "Platform preparation failed", - "CPU Hotplugging failed", - "Architecture specific preparation failed", - "Pages needed resaving, but we were told to abort if this happens", - "We can't hibernate at the moment (invalid resume= or filewriter " - "target?)", - "A hibernation preparation notifier chain member cancelled the " - "hibernation", - "Pre-snapshot preparation failed", - "Pre-restore preparation failed", - "Failed to disable usermode helpers", - "Can't resume from alternate image", - "Header reservation too small", - "Device Power Management Preparation failed", -}; - -/** - * toi_finish_anything - cleanup after doing anything - * @hibernate_or_resume: Whether finishing a cycle or attempt at - * resuming. - * - * This is our basic clean-up routine, matching start_anything below. We - * call cleanup routines, drop module references and restore process fs and - * cpus allowed masks, together with the global block_dump variable's value. - **/ -void toi_finish_anything(int hibernate_or_resume) -{ - toi_running = 0; - toi_cleanup_modules(hibernate_or_resume); - toi_put_modules(); - if (hibernate_or_resume) { - block_dump = block_dump_save; - set_cpus_allowed_ptr(current, cpu_all_mask); - toi_alloc_print_debug_stats(); - atomic_inc(&snapshot_device_available); - unlock_system_sleep(); - } - - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); -} - -/** - * toi_start_anything - basic initialisation for TuxOnIce - * @toi_or_resume: Whether starting a cycle or attempt at resuming. - * - * Our basic initialisation routine. Take references on modules, use the - * kernel segment, recheck resume= if no active allocator is set, initialise - * modules, save and reset block_dump and ensure we're running on CPU0. 
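toi_start_anything and toi_finish_anything below form a strict bracket: everything acquired in the first is released by the second (or by the error labels) in reverse order. A minimal userspace model of that goto-based unwinding discipline, with illustrative names:

#include <stdio.h>

static int take_a(void) { puts("take a"); return 0; }
static int take_b(void) { puts("take b"); return -1; }	/* fails here */
static void drop_a(void) { puts("drop a"); }

/* Acquire in order, release in reverse on failure: the same shape
 * as the early_init_err/prehibernate_err labels below. */
static int start_anything(void)
{
	if (take_a())
		return -1;
	if (take_b())
		goto err_b;
	return 0;
err_b:
	drop_a();
	return -1;
}

int main(void)
{
	printf("start_anything() = %d\n", start_anything());
	return 0;
}

The real function has more stages, but each error label undoes exactly the stages acquired before it, in reverse order.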
- **/ -int toi_start_anything(int hibernate_or_resume) -{ - mutex_lock(&tuxonice_in_use); - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - toi_trace_index = 0; - - if (hibernate_or_resume) { - lock_system_sleep(); - - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - goto snapshotdevice_unavailable; - } - - if (hibernate_or_resume == SYSFS_HIBERNATE) - toi_print_modules(); - - if (toi_get_modules()) { - printk(KERN_INFO "TuxOnIce: Get modules failed!\n"); - goto prehibernate_err; - } - - if (hibernate_or_resume) { - block_dump_save = block_dump; - block_dump = 0; - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - } - - if (toi_initialise_modules_early(hibernate_or_resume)) - goto early_init_err; - - if (!toiActiveAllocator) - toi_attempt_to_parse_resume_device(!hibernate_or_resume); - - if (!toi_initialise_modules_late(hibernate_or_resume)) { - toi_running = 1; /* For the swsusp code we use :< */ - return 0; - } - - toi_cleanup_modules(hibernate_or_resume); -early_init_err: - if (hibernate_or_resume) { - block_dump_save = block_dump; - set_cpus_allowed_ptr(current, cpu_all_mask); - } - toi_put_modules(); -prehibernate_err: - if (hibernate_or_resume) - atomic_inc(&snapshot_device_available); -snapshotdevice_unavailable: - if (hibernate_or_resume) - mutex_unlock(&pm_mutex); - set_fs(oldfs); - mutex_unlock(&tuxonice_in_use); - return -EBUSY; -} - -/* - * Nosave page tracking. - * - * Here rather than in prepare_image because we want to do it once only at the - * start of a cycle. - */ - -/** - * mark_nosave_pages - set up our Nosave bitmap - * - * Build a bitmap of Nosave pages from the list. The bitmap allows faster - * use when preparing the image. - **/ -static void mark_nosave_pages(void) -{ - struct nosave_region *region; - - list_for_each_entry(region, &nosave_regions, list) { - unsigned long pfn; - - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - SetPageNosave(pfn_to_page(pfn)); - } - } -} - -/** - * allocate_bitmaps - allocate bitmaps used to record page states - * - * Allocate the bitmaps we use to record the various TuxOnIce related - * page states. - **/ -static int allocate_bitmaps(void) -{ - if (toi_alloc_bitmap(&pageset1_map) || - toi_alloc_bitmap(&pageset1_copy_map) || - toi_alloc_bitmap(&pageset2_map) || - toi_alloc_bitmap(&io_map) || - toi_alloc_bitmap(&nosave_map) || - toi_alloc_bitmap(&free_map) || - toi_alloc_bitmap(&compare_map) || - toi_alloc_bitmap(&page_resave_map)) - return 1; - - return 0; -} - -/** - * free_bitmaps - free the bitmaps used to record page states - * - * Free the bitmaps allocated above. It is not an error to call - * memory_bm_free on a bitmap that isn't currently allocated. - **/ -static void free_bitmaps(void) -{ - toi_free_bitmap(&pageset1_map); - toi_free_bitmap(&pageset1_copy_map); - toi_free_bitmap(&pageset2_map); - toi_free_bitmap(&io_map); - toi_free_bitmap(&nosave_map); - toi_free_bitmap(&free_map); - toi_free_bitmap(&compare_map); - toi_free_bitmap(&page_resave_map); -} - -/** - * io_MB_per_second - return the number of MB/s read or written - * @write: Whether to return the speed at which we wrote. - * - * Calculate the number of megabytes per second that were read or written. - **/ -static int io_MB_per_second(int write) -{ - return (toi_bkd.toi_io_time[write][1]) ? - MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ / - toi_bkd.toi_io_time[write][1] : 0; -} - -#define SNPRINTF(a...) 
do { len += scnprintf(((char *) buffer) + len, \ - count - len - 1, ## a); } while (0) - -/** - * get_debug_info - fill a buffer with debugging information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_debug_info(const char *buffer, int count) -{ - int len = 0, i, first_result = 1; - - SNPRINTF("TuxOnIce debugging info:\n"); - SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n"); - SNPRINTF("- Kernel Version : " UTS_RELEASE "\n"); - SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__); - SNPRINTF("- Attempt number : %d\n", nr_hibernates); - SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n", - toi_result, - toi_bkd.toi_action, - toi_bkd.toi_debug_state, - toi_bkd.toi_default_console_level, - image_size_limit, - toi_poweroff_method); - SNPRINTF("- Overall expected compression percentage: %d.\n", - 100 - toi_expected_compression_ratio()); - len += toi_print_module_debug_info(((char *) buffer) + len, - count - len - 1); - if (toi_bkd.toi_io_time[0][1]) { - if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { - SNPRINTF("- I/O speed: Write %ld KB/s", - (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld KB/s", - (KB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } else { - SNPRINTF("- I/O speed: Write %ld MB/s", - (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / - toi_bkd.toi_io_time[0][1])); - if (toi_bkd.toi_io_time[1][1]) - SNPRINTF(", Read %ld MB/s", - (MB((unsigned long) - toi_bkd.toi_io_time[1][0]) * HZ / - toi_bkd.toi_io_time[1][1])); - } - SNPRINTF(".\n"); - } else - SNPRINTF("- No I/O speed stats available.\n"); - SNPRINTF("- Extra pages : %lu used/%lu.\n", - extra_pd1_pages_used, extra_pd1_pages_allowance); - - for (i = 0; i < TOI_NUM_RESULT_STATES; i++) - if (test_result_state(i)) { - SNPRINTF("%s: %s.\n", first_result ? - "- Result " : - " ", - result_strings[i]); - first_result = 0; - } - if (first_result) - SNPRINTF("- Result : %s.\n", nr_hibernates ? - "Succeeded" : - "No hibernation attempts so far"); - return len; -} - -#ifdef CONFIG_TOI_INCREMENTAL -/** - * get_toi_page_state - fill a buffer with page state information - * @buffer: The buffer to be filled. - * @count: The size of the buffer, in bytes. - * - * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will - * either printk or return via sysfs. - **/ -static int get_toi_page_state(const char *buffer, int count) -{ - int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0; - int len = 0; - struct zone *zone; - int allocated_bitmaps = 0; - - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_first(cpu_online_mask))); - - if (!free_map) { - BUG_ON(toi_alloc_bitmap(&free_map)); - allocated_bitmaps = 1; - } - - toi_generate_free_page_map(); - - for_each_populated_zone(zone) { - unsigned long loop; - - total += zone->spanned_pages; - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - /* - * If the page gets allocated, it will be need - * saving in an image. - * Don't bother with explicitly removing any - * RO protection applied below. 
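As an aside on io_MB_per_second above: it turns an (amount, jiffies) pair into a rate, and the guard against a zero elapsed time is what makes the debug read safe before any I/O has happened. In miniature (HZ fixed at 250 and 4 KiB pages assumed purely for illustration):

#include <stdio.h>

#define HZ 250			/* illustrative; the kernel value is config-dependent */
#define MB(x) ((x) >> 8)	/* 4 KiB pages to MiB: 256 pages per MiB */

/* io_time[0] = amount in pages, io_time[1] = elapsed jiffies. */
static long io_time[2] = { 51200, 500 };	/* 200 MiB in 2 seconds */

static long mb_per_second(void)
{
	return io_time[1] ? MB(io_time[0]) * HZ / io_time[1] : 0;
}

int main(void)
{
	printf("%ld MB/s\n", mb_per_second());	/* 200 / 2 = 100 MB/s */
	return 0;
}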
- * We'll SetPageTOI_Dirty(page) if/when it - * gets allocated. - */ - free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageTOI_Untracked(page)) { - untracked++; - } else if (PageTOI_RO(page)) { - ro++; - } else if (PageTOI_Dirty(page)) { - dirty++; - } else { - printk("Page %ld state 'other'.\n", pfn); - other++; - } - } - } - - if (allocated_bitmaps) { - toi_free_bitmap(&free_map); - } - - set_cpus_allowed_ptr(current, cpu_all_mask); - - SNPRINTF("TuxOnIce page breakdown:\n"); - SNPRINTF("- Free : %d\n", free); - SNPRINTF("- Untracked : %d\n", untracked); - SNPRINTF("- Read only : %d\n", ro); - SNPRINTF("- Dirty : %d\n", dirty); - SNPRINTF("- Other : %d\n", other); - SNPRINTF("- Invalid : %d\n", invalid); - SNPRINTF("- Total : %d\n", total); - return len; -} -#endif - -/** - * do_cleanup - cleanup after attempting to hibernate or resume - * @get_debug_info: Whether to allocate and return debugging info. - * - * Cleanup after attempting to hibernate or resume, possibly getting - * debugging info as we do so. - **/ -static void do_cleanup(int get_debug_info, int restarting) -{ - int i = 0; - char *buffer = NULL; - - trap_non_toi_io = 0; - - if (get_debug_info) - toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); - - free_checksum_pages(); - - toi_cbw_restore(); - toi_free_cbw_data(); - - if (get_debug_info) - buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP); - - if (buffer) - i = get_toi_debug_info(buffer, PAGE_SIZE); - - toi_free_extra_pagedir_memory(); - - pagedir1.size = 0; - pagedir2.size = 0; - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - if (boot_kernel_data_buffer) { - if (!test_toi_state(TOI_BOOT_KERNEL)) - toi_free_page(37, boot_kernel_data_buffer); - boot_kernel_data_buffer = 0; - } - - if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) { - unlock_device_hotplug(); - clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - } - - clear_toi_state(TOI_BOOT_KERNEL); - if (current->flags & PF_SUSPEND_TASK) - thaw_processes(); - - if (!restarting) - toi_stop_other_threads(); - - if (toi_keeping_image && - !test_result_state(TOI_ABORTED)) { - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - "TuxOnIce: Not invalidating the image due " - "to Keep Image or Incremental Image being enabled."); - set_result_state(TOI_KEPT_IMAGE); - - /* - * For an incremental image, free unused storage so - * swap (if any) can be used for normal system operation, - * if so desired. - */ - - toiActiveAllocator->free_unused_storage(); - } else - if (toiActiveAllocator) - toiActiveAllocator->remove_image(); - - free_bitmaps(); - usermodehelper_enable(); - - if (test_toi_state(TOI_NOTIFIERS_PREPARE)) { - pm_notifier_call_chain(PM_POST_HIBERNATION); - clear_toi_state(TOI_NOTIFIERS_PREPARE); - } - - if (buffer && i) { - /* Printk can only handle 1023 bytes, including - * its level mangling. */ - for (i = 0; i < 3; i++) - printk(KERN_ERR "%s", buffer + (1023 * i)); - toi_free_page(20, (unsigned long) buffer); - } - - if (!restarting) - toi_cleanup_console(); - - free_attention_list(); - - if (!restarting) - toi_deactivate_storage(0); - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * check_still_keeping_image - we kept an image; check whether to reuse it. - * - * We enter this routine when we have kept an image. If the user has said they - * want to still keep it, all we need to do is powerdown. 
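The zone walk just shown skips whole free regions in one step by bumping the loop counter, and the off-by-one is absorbed by the loop increment. A ten-page model of that skip arithmetic (the array stands in for toi_size_of_free_region):

#include <stdio.h>

/* free_len[pfn] = length of the free region starting at pfn, else 0. */
static int free_len[10] = { 0, 3, 0, 0, 0, 0, 2, 0, 0, 0 };

int main(void)
{
	int free = 0, used = 0;

	for (int pfn = 0; pfn < 10; pfn++) {
		int chunk = free_len[pfn];
		if (chunk) {
			free += chunk;
			pfn += chunk - 1;	/* -1: the loop increment adds 1 */
			continue;
		}
		used++;
	}
	printf("free %d, used %d\n", free, used);	/* free 5, used 5 */
	return 0;
}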
If powering down - * means hibernating to ram and the power doesn't run out, we'll return 1. - * If we do power off properly or the battery runs out, we'll resume via the - * normal paths. - * - * If the user has said they want to remove the previously kept image, we - * remove it, and return 0. We'll then store a new image. - **/ -static int check_still_keeping_image(void) -{ - if (toi_keeping_image) { - if (!test_action_state(TOI_INCREMENTAL_IMAGE)) { - printk(KERN_INFO "Image already stored: powering down " - "immediately."); - do_toi_step(STEP_HIBERNATE_POWERDOWN); - return 1; - } - /** - * Incremental image - need to write new part. - * We detect that we're writing an incremental image by looking - * at test_result_state(TOI_KEPT_IMAGE) - **/ - return 0; - } - - printk(KERN_INFO "Invalidating previous image.\n"); - toiActiveAllocator->remove_image(); - - return 0; -} - -/** - * toi_init - prepare to hibernate to disk - * - * Initialise variables & data structures, in preparation for - * hibernating to disk. - **/ -static int toi_init(int restarting) -{ - int result, i, j; - - toi_result = 0; - - printk(KERN_INFO "Initiating a hibernation cycle.\n"); - - nr_hibernates++; - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - toi_bkd.toi_io_time[i][j] = 0; - - if (!test_toi_state(TOI_CAN_HIBERNATE) || - allocate_bitmaps()) - return 1; - - mark_nosave_pages(); - - if (!restarting) - toi_prepare_console(); - - result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (result) { - set_result_state(TOI_NOTIFIERS_PREPARE_FAILED); - return 1; - } - set_toi_state(TOI_NOTIFIERS_PREPARE); - - if (!restarting) { - printk(KERN_ERR "Starting other threads."); - toi_start_other_threads(); - } - - result = usermodehelper_disable(); - if (result) { - printk(KERN_ERR "TuxOnIce: Failed to disable usermode " - "helpers\n"); - set_result_state(TOI_USERMODE_HELPERS_ERR); - return 1; - } - - boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP); - if (!boot_kernel_data_buffer) { - printk(KERN_ERR "TuxOnIce: Failed to allocate " - "boot_kernel_data_buffer.\n"); - set_result_state(TOI_OUT_OF_MEMORY); - return 1; - } - - toi_allocate_cbw_data(); - - return 0; -} - -/** - * can_hibernate - perform basic 'Can we hibernate?' tests - * - * Perform basic tests that must pass if we're going to be able to hibernate: - * Can we get the pm_mutex? Is resume= valid (we need to know where to write - * the image header). - **/ -static int can_hibernate(void) -{ - if (!test_toi_state(TOI_CAN_HIBERNATE)) - toi_attempt_to_parse_resume_device(0); - - if (!test_toi_state(TOI_CAN_HIBERNATE)) { - printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n" - "This may be because you haven't put something along " - "the lines of\n\nresume=swap:/dev/hda1\n\n" - "in lilo.conf or equivalent. (Where /dev/hda1 is your " - "swap partition).\n"); - set_abort_result(TOI_CANT_SUSPEND); - return 0; - } - - if (strlen(alt_resume_param)) { - attempt_to_parse_alt_resume_param(); - - if (!strlen(alt_resume_param)) { - printk(KERN_INFO "Alternate resume parameter now " - "invalid. Aborting.\n"); - set_abort_result(TOI_CANT_USE_ALT_RESUME); - return 0; - } - } - - return 1; -} - -/** - * do_post_image_write - having written an image, figure out what to do next - * - * After writing an image, we might load an alternate image or power down. - * Powering down might involve hibernating to ram, in which case we also - * need to handle reloading pageset2. 
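Throughout this file, status travels through bit flags (set_result_state, test_result_state) rather than return values alone, so toi_result can later be reported whole via the last_result sysfs file. A minimal model of that pattern (bit names illustrative):

#include <stdio.h>

enum { R_ABORTED, R_NOSTORAGE, R_OOM };	/* result bits */

static unsigned long result;

#define set_result_state(b)  (result |= 1UL << (b))
#define test_result_state(b) (result & (1UL << (b)))

int main(void)
{
	set_result_state(R_NOSTORAGE);
	if (test_result_state(R_NOSTORAGE))
		puts("no storage was available");
	printf("last_result = %lu\n", result);	/* whole word, all flags */
	return 0;
}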
- **/ -static int do_post_image_write(void) -{ - /* If switching images fails, do normal powerdown */ - if (alt_resume_param[0]) - do_toi_step(STEP_RESUME_ALT_IMAGE); - - toi_power_down(); - - barrier(); - mb(); - return 0; -} - -/** - * __save_image - do the hard work of saving the image - * - * High level routine for getting the image saved. The key assumptions made - * are that processes have been frozen and sufficient memory is available. - * - * We also exit through here at resume time, coming back from toi_hibernate - * after the atomic restore. This is the reason for the toi_in_hibernate - * test. - **/ -static int __save_image(void) -{ - int temp_result, did_copy = 0; - - toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, - " - Final values: %d and %d.", - pagedir1.size, pagedir2.size); - - toi_cond_pause(1, "About to write pagedir2."); - - temp_result = write_pageset(&pagedir2); - - if (temp_result == -1 || test_result_state(TOI_ABORTED)) - return 1; - - toi_cond_pause(1, "About to copy pageset 1."); - - if (test_result_state(TOI_ABORTED)) - return 1; - - toi_deactivate_storage(1); - - toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); - - toi_in_hibernate = 1; - - if (toi_go_atomic(PMSG_FREEZE, 1)) - goto Failed; - - temp_result = toi_hibernate(); - -#ifdef CONFIG_KGDB - if (test_action_state(TOI_POST_RESUME_BREAKPOINT)) - kgdb_breakpoint(); -#endif - - if (!temp_result) - did_copy = 1; - - /* We return here at resume time too! */ - toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result); - -Failed: - if (toi_activate_storage(1)) - panic("Failed to reactivate our storage."); - - /* Resume time? */ - if (!toi_in_hibernate) { - copyback_post(); - return 0; - } - - /* Nope. Hibernating. So, see if we can save the image... */ - - if (temp_result || test_result_state(TOI_ABORTED)) { - if (did_copy) - goto abort_reloading_pagedir_two; - else - return 1; - } - - toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size, - NULL); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write pageset1."); - - toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1"); - - temp_result = write_pageset(&pagedir1); - - /* We didn't overwrite any memory, so no reread needs to be done. */ - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - return 1; - - if (temp_result == 1 || test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - toi_cond_pause(1, "About to write header."); - - if (test_result_state(TOI_ABORTED)) - goto abort_reloading_pagedir_two; - - temp_result = write_image_header(); - - if (!temp_result && !test_result_state(TOI_ABORTED)) - return 0; - -abort_reloading_pagedir_two: - temp_result = read_pageset2(1); - - /* If that failed, we're sunk. Panic! */ - if (temp_result) - panic("Attempt to reload pagedir 2 while aborting " - "a hibernate failed."); - - return 1; -} - -static void map_ps2_pages(int enable) -{ - unsigned long pfn = 0; - - memory_bm_position_reset(pageset2_map); - pfn = memory_bm_next_pfn(pageset2_map, 0); - - while (pfn != BM_END_OF_MAP) { - struct page *page = pfn_to_page(pfn); - kernel_map_pages(page, 1, enable); - pfn = memory_bm_next_pfn(pageset2_map, 0); - } -} - -/** - * do_save_image - save the image and handle the result - * - * Save the prepared image. If we fail or we're in the path returning - * from the atomic restore, cleanup. 
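The control flow of __save_image above is the subtlest part of the file: after toi_hibernate(), execution reaches the same point twice, once while saving and once when the restored kernel resumes, and toi_in_hibernate is how the two passes are told apart. setjmp/longjmp gives a faithful userspace analogy (a sketch, not the kernel mechanism):

#include <stdio.h>
#include <setjmp.h>

static jmp_buf snapshot;
static int in_hibernate;

int main(void)
{
	in_hibernate = 1;
	if (setjmp(snapshot) == 0) {
		/* First return: the "saving" path. */
		puts("image saved; powering down");
		/* Simulate booting the saved image later: */
		in_hibernate = 0;
		longjmp(snapshot, 1);
	}
	/* Second return: the "resuming" path, as in __save_image. */
	if (!in_hibernate)
		puts("back from atomic restore");
	return 0;
}

toi_in_hibernate plays the role of the flag here: the saving pass sees it set, while the pass that returns from the restore sees it clear.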
- **/ -static int do_save_image(void) -{ - int result; - map_ps2_pages(0); - result = __save_image(); - map_ps2_pages(1); - return result; -} - -/** - * do_prepare_image - try to prepare an image - * - * Seek to initialise and prepare an image to be saved. On failure, - * cleanup. - **/ -static int do_prepare_image(void) -{ - int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); - - if (!restarting && toi_activate_storage(0)) - return 1; - - /* - * If kept image and still keeping image and hibernating to RAM, (non - * incremental image case) we will return 1 after hibernating and - * resuming (provided the power doesn't run out. In that case, we skip - * directly to cleaning up and exiting. - */ - - if (!can_hibernate() || - (test_result_state(TOI_KEPT_IMAGE) && - check_still_keeping_image())) - return 1; - - if (toi_init(restarting) || toi_prepare_image() || - test_result_state(TOI_ABORTED)) - return 1; - - trap_non_toi_io = 1; - - return 0; -} - -/** - * do_check_can_resume - find out whether an image has been stored - * - * Read whether an image exists. We use the same routine as the - * image_exists sysfs entry, and just look to see whether the - * first character in the resulting buffer is a '1'. - **/ -int do_check_can_resume(void) -{ - int result = -1; - - if (toi_activate_storage(0)) - return -1; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(1); - - if (toiActiveAllocator) - result = toiActiveAllocator->image_exists(1); - - toi_deactivate_storage(0); - return result; -} - -/** - * do_load_atomic_copy - load the first part of an image, if it exists - * - * Check whether we have an image. If one exists, do sanity checking - * (possibly invalidating the image or even rebooting if the user - * requests that) before loading it into memory in preparation for the - * atomic restore. - * - * If and only if we have an image loaded and ready to restore, we return 1. - **/ -static int do_load_atomic_copy(void) -{ - int read_image_result = 0; - - if (sizeof(swp_entry_t) != sizeof(long)) { - printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size" - " of long. Please report this!\n"); - return 1; - } - - if (!resume_file[0]) - printk(KERN_WARNING "TuxOnIce: " - "You need to use a resume= command line parameter to " - "tell TuxOnIce where to look for an image.\n"); - - toi_activate_storage(0); - - if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) && - !toi_attempt_to_parse_resume_device(0)) { - /* - * Without a usable storage device we can do nothing - - * even if noresume is given - */ - - if (!toiNumAllocators) - printk(KERN_ALERT "TuxOnIce: " - "No storage allocators have been registered.\n"); - else - printk(KERN_ALERT "TuxOnIce: " - "Missing or invalid storage location " - "(resume= parameter). Please correct and " - "rerun lilo (or equivalent) before " - "hibernating.\n"); - toi_deactivate_storage(0); - return 1; - } - - if (allocate_bitmaps()) - return 1; - - read_image_result = read_pageset1(); /* non fatal error ignored */ - - if (test_toi_state(TOI_NORESUME_SPECIFIED)) - clear_toi_state(TOI_NORESUME_SPECIFIED); - - toi_deactivate_storage(0); - - if (read_image_result) - return 1; - - return 0; -} - -/** - * prepare_restore_load_alt_image - save & restore alt image variables - * - * Save and restore the pageset1 maps, when loading an alternate image. 
- **/ -static void prepare_restore_load_alt_image(int prepare) -{ - static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save; - - if (prepare) { - pageset1_map_save = pageset1_map; - pageset1_map = NULL; - pageset1_copy_map_save = pageset1_copy_map; - pageset1_copy_map = NULL; - set_toi_state(TOI_LOADING_ALT_IMAGE); - toi_reset_alt_image_pageset2_pfn(); - } else { - toi_free_bitmap(&pageset1_map); - pageset1_map = pageset1_map_save; - toi_free_bitmap(&pageset1_copy_map); - pageset1_copy_map = pageset1_copy_map_save; - clear_toi_state(TOI_NOW_RESUMING); - clear_toi_state(TOI_LOADING_ALT_IMAGE); - } -} - -/** - * do_toi_step - perform a step in hibernating or resuming - * - * Perform a step in hibernating or resuming an image. This abstraction - * is in preparation for implementing cluster support, and perhaps replacing - * uswsusp too (haven't looked whether that's possible yet). - **/ -int do_toi_step(int step) -{ - switch (step) { - case STEP_HIBERNATE_PREPARE_IMAGE: - return do_prepare_image(); - case STEP_HIBERNATE_SAVE_IMAGE: - return do_save_image(); - case STEP_HIBERNATE_POWERDOWN: - return do_post_image_write(); - case STEP_RESUME_CAN_RESUME: - return do_check_can_resume(); - case STEP_RESUME_LOAD_PS1: - return do_load_atomic_copy(); - case STEP_RESUME_DO_RESTORE: - /* - * If we succeed, this doesn't return. - * Instead, we return from do_save_image() in the - * hibernated kernel. - */ - return toi_atomic_restore(); - case STEP_RESUME_ALT_IMAGE: - printk(KERN_INFO "Trying to resume alternate image.\n"); - toi_in_hibernate = 0; - save_restore_alt_param(SAVE, NOQUIET); - prepare_restore_load_alt_image(1); - if (!do_check_can_resume()) { - printk(KERN_INFO "Nothing to resume from.\n"); - goto out; - } - if (!do_load_atomic_copy()) - toi_atomic_restore(); - - printk(KERN_INFO "Failed to load image.\n"); -out: - prepare_restore_load_alt_image(0); - save_restore_alt_param(RESTORE, NOQUIET); - break; - case STEP_CLEANUP: - do_cleanup(1, 0); - break; - case STEP_QUIET_CLEANUP: - do_cleanup(0, 0); - break; - } - - return 0; -} - -/* -- Functions for kickstarting a hibernate or resume --- */ - -/** - * toi_try_resume - try to do the steps in resuming - * - * Check if we have an image and if so try to resume. Clear the status - * flags too. - **/ -void toi_try_resume(void) -{ - set_toi_state(TOI_TRYING_TO_RESUME); - resume_attempted = 1; - - current->flags |= PF_MEMALLOC; - toi_start_other_threads(); - - if (do_toi_step(STEP_RESUME_CAN_RESUME) && - !do_toi_step(STEP_RESUME_LOAD_PS1)) - do_toi_step(STEP_RESUME_DO_RESTORE); - - toi_stop_other_threads(); - do_cleanup(0, 0); - - current->flags &= ~PF_MEMALLOC; - - clear_toi_state(TOI_IGNORE_LOGLEVEL); - clear_toi_state(TOI_TRYING_TO_RESUME); - clear_toi_state(TOI_NOW_RESUMING); -} - -/** - * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume - * - * Wrapper for when __toi_try_resume is called from swsusp resume path, - * rather than from echo > /sys/power/tuxonice/do_resume. - **/ -static void toi_sys_power_disk_try_resume(void) -{ - resume_attempted = 1; - - /* - * There's a comment in kernel/power/disk.c that indicates - * we should be able to use mutex_lock_nested below. That - * doesn't seem to cut it, though, so let's just turn lockdep - * off for now. 
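Returning to do_toi_step above: exposing each phase behind one dispatcher is what lets hibernate, resume and the alternate-image path share their plumbing, with callers chaining steps and stopping on error. The shape, reduced to a sketch with illustrative step names:

#include <stdio.h>

enum step { STEP_PREPARE, STEP_SAVE, STEP_POWERDOWN };

static int do_prepare(void)   { puts("prepare");   return 0; }
static int do_save(void)      { puts("save");      return 0; }
static int do_powerdown(void) { puts("powerdown"); return 0; }

/* One entry point per phase; a zero return means the step succeeded. */
static int do_step(enum step s)
{
	switch (s) {
	case STEP_PREPARE:   return do_prepare();
	case STEP_SAVE:      return do_save();
	case STEP_POWERDOWN: return do_powerdown();
	}
	return -1;
}

int main(void)
{
	if (!do_step(STEP_PREPARE) && !do_step(STEP_SAVE))
		do_step(STEP_POWERDOWN);
	return 0;
}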
- */ - lockdep_off(); - - if (toi_start_anything(SYSFS_RESUMING)) - goto out; - - toi_try_resume(); - - /* - * For initramfs, we have to clear the boot time - * flag after trying to resume - */ - clear_toi_state(TOI_BOOT_TIME); - - toi_finish_anything(SYSFS_RESUMING); -out: - lockdep_on(); -} - -/** - * toi_try_hibernate - try to start a hibernation cycle - * - * Start a hibernation cycle, coming in from either - * echo > /sys/power/tuxonice/do_suspend - * - * or - * - * echo disk > /sys/power/state - * - * In the later case, we come in without pm_sem taken; in the - * former, it has been taken. - **/ -int toi_try_hibernate(void) -{ - int result = 0, sys_power_disk = 0, retries = 0; - - if (!mutex_is_locked(&tuxonice_in_use)) { - /* Came in via /sys/power/disk */ - if (toi_start_anything(SYSFS_HIBERNATING)) - return -EBUSY; - sys_power_disk = 1; - } - - current->flags |= PF_MEMALLOC; - - if (test_toi_state(TOI_CLUSTER_MODE)) { - toi_initiate_cluster_hibernate(); - goto out; - } - -prepare: - result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); - - if (result) - goto out; - - if (test_action_state(TOI_FREEZER_TEST)) - goto out_restore_gfp_mask; - - result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); - - if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) { - if (retries < 2) { - do_cleanup(0, 1); - retries++; - clear_result_state(TOI_ABORTED); - extra_pd1_pages_allowance = extra_pd1_pages_used + 500; - printk(KERN_INFO "Automatically adjusting the extra" - " pages allowance to %ld and restarting.\n", - extra_pd1_pages_allowance); - pm_restore_gfp_mask(); - goto prepare; - } - - printk(KERN_INFO "Adjusted extra pages allowance twice and " - "still couldn't hibernate successfully. Giving up."); - } - - /* This code runs at resume time too! */ - if (!result && toi_in_hibernate) - result = do_toi_step(STEP_HIBERNATE_POWERDOWN); - -out_restore_gfp_mask: - pm_restore_gfp_mask(); -out: - do_cleanup(1, 0); - current->flags &= ~PF_MEMALLOC; - - if (sys_power_disk) - toi_finish_anything(SYSFS_HIBERNATING); - - return result; -} - -/* - * channel_no: If !0, -c <channel_no> is added to args (userui). - */ -int toi_launch_userspace_program(char *command, int channel_no, - int wait, int debug) -{ - int retval; - static char *envp[] = { - "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; - static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL - }; - char *channel = NULL; - int arg = 0, size; - char test_read[255]; - char *orig_posn = command; - - if (!strlen(orig_posn)) - return 1; - - if (channel_no) { - channel = toi_kzalloc(4, 6, GFP_KERNEL); - if (!channel) { - printk(KERN_INFO "Failed to allocate memory in " - "preparing to launch userspace program.\n"); - return 1; - } - } - - /* Up to 6 args supported */ - while (arg < 6) { - sscanf(orig_posn, "%s", test_read); - size = strlen(test_read); - if (!(size)) - break; - argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP); - strcpy(argv[arg], test_read); - orig_posn += size + 1; - *test_read = 0; - arg++; - } - - if (channel_no) { - sprintf(channel, "-c%d", channel_no); - argv[arg] = channel; - } else - arg--; - - if (debug) { - argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP); - strcpy(argv[arg], "--debug"); - } - - retval = call_usermodehelper(argv[0], argv, envp, wait); - - /* - * If the program reports an error, retval = 256. Don't complain - * about that here. 
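The argument scanner in toi_launch_userspace_program above is fragile: sscanf with a bare %s has no width limit, and orig_posn += size + 1 assumes exactly one space between arguments. A bounded tokenizer along these lines avoids both problems (a sketch of an alternative, not the kernel's code; the command path is illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char command[] = "/sbin/userui  -c1";	/* note the double space */
	char *argv[8] = { 0 };
	int argc = 0;

	/* strtok-style splitting copes with runs of whitespace and
	 * never reads past the buffer. */
	for (char *tok = strtok(command, " \t");
	     tok && argc < 7;
	     tok = strtok(NULL, " \t"))
		argv[argc++] = tok;

	for (int i = 0; i < argc; i++)
		printf("argv[%d] = %s\n", i, argv[i]);
	return 0;
}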
- */ - if (retval && retval != 256) - printk(KERN_ERR "Failed to launch userspace program '%s': " - "Error %d\n", command, retval); - - { - int i; - for (i = 0; i < arg; i++) - if (argv[i] && argv[i] != channel) - toi_kfree(5, argv[i], sizeof(*argv[i])); - } - - toi_kfree(4, channel, sizeof(*channel)); - - return retval; -} - -/* - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_LONG("extra_pages_allowance", SYSFS_RW, - &extra_pd1_pages_allowance, 0, LONG_MAX, 0), - SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read, - image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL), - SYSFS_STRING("resume", SYSFS_RW, resume_file, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_resume_device2), - SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255, - SYSFS_NEEDS_SM_FOR_WRITE, - attempt_to_parse_alt_resume_param), - SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0, - NULL), - SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action, - TOI_IGNORE_ROOTFS, 0), - SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2, - INT_MAX, 0), - SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0), - SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_MULTITHREADED_IO, 0), - SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_FLUSHER_THREAD, 0), - SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAGESET2_FULL, 0), - SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0), - SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action, - TOI_REPLACE_SWSUSP, 0), - SYSFS_STRING("resume_commandline", SYSFS_RW, - toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0, - NULL), - SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL), - SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action, - TOI_FREEZER_TEST, 0), - SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0), - SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action, - TOI_TEST_FILTER_SPEED, 0), - SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PAGESET2, 0), - SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_PS2_IF_UNNEEDED, 0), - SYSFS_STRING("binary_signature", SYSFS_READONLY, - tuxonice_signature, 9, 0, NULL), - SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0, - NULL), -#ifdef CONFIG_KGDB - SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action, - TOI_POST_RESUME_BREAKPOINT, 0), -#endif - SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action, - TOI_NO_READAHEAD, 0), - SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action, - TOI_TRACE_DEBUG_ON, 0), -#ifdef CONFIG_TOI_KEEP_IMAGE - SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE, - 0), -#endif -#ifdef CONFIG_TOI_INCREMENTAL - SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0, - NULL), - SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action, - TOI_INCREMENTAL_IMAGE, 1), -#endif -}; - -static struct toi_core_fns my_fns = { - .get_nonconflicting_page = __toi_get_nonconflicting_page, - .post_context_save = __toi_post_context_save, - .try_hibernate = toi_try_hibernate, - .try_resume = toi_sys_power_disk_try_resume, -}; - -/** - * core_load - initialisation of TuxOnIce core - * - * Initialise the core, beginning with sysfs. 
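The sysfs_params table below is the registration idiom used throughout these modules: a static array of descriptors plus one loop, so adding a tunable is a one-line change. In miniature (descriptor layout and names illustrative):

#include <stdio.h>

struct param { const char *name; int *value; };

static int image_size_limit = -2;
static int max_workers;

/* One descriptor per sysfs file; the loop below stands in for the
 * per-entry toi_register_sysfs_file() calls. */
static struct param params[] = {
	{ "image_size_limit", &image_size_limit },
	{ "max_workers",      &max_workers },
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(params) / sizeof(params[0]); i++)
		printf("registered %s (=%d)\n", params[i].name, *params[i].value);
	return 0;
}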
Checksum and so on are part of - * the core, but have their own initialisation routines because they either - * aren't compiled in all the time or have their own subdirectories. - **/ -static __init int core_load(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION - " (http://tuxonice.net)\n"); - - if (!hibernation_available()) { - printk(KERN_INFO "TuxOnIce disabled due to request for hibernation" - " to be disabled in this kernel.\n"); - return 1; - } - - if (toi_sysfs_init()) - return 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - toi_core_fns = &my_fns; - - if (toi_alloc_init()) - return 1; - if (toi_checksum_init()) - return 1; - if (toi_usm_init()) - return 1; - if (toi_ui_init()) - return 1; - if (toi_poweroff_init()) - return 1; - if (toi_cluster_init()) - return 1; - if (toi_cbw_init()) - return 1; - - return 0; -} - -late_initcall(core_load); diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c deleted file mode 100644 index a8c5f3660..000000000 --- a/kernel/power/tuxonice_incremental.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - * kernel/power/tuxonice_incremental.c - * - * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains routines related to storing incremental images - that - * is, retaining an image after an initial cycle and then storing incremental - * changes on subsequent hibernations. - * - * Based in part on... - * - * Debug helper to dump the current kernel pagetables of the system - * so that we can see what the various memory ranges are set to. - * - * (C) Copyright 2008 Intel Corporation - * - * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include <linux/mm.h> -#include <linux/tuxonice.h> -#include <linux/sched.h> -#include <asm/pgtable.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/page.h> -#include "tuxonice_pageflags.h" -#include "tuxonice_builtin.h" -#include "power.h" - -int toi_do_incremental_initcall; - -extern void kdb_init(int level); -extern noinline void kgdb_breakpoint(void); - -#undef pr_debug -#if 0 -#define pr_debug(a, b...) do { printk(a, ##b); } while(0) -#else -#define pr_debug(a, b...) do { } while(0) -#endif - -/* Multipliers for offsets within the PTEs */ -#define PTE_LEVEL_MULT (PAGE_SIZE) -#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) -#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) - -/* - * This function gets called on a break in a continuous series - * of PTE entries; the next one is different so we need to - * print what we collected so far. 
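The walker that follows descends pgd to pud to pmd to pte, treating large or non-present entries as leaves and skipping "none" entries. Its shape is the classic recursive-descent table walk; schematically, with a two-level toy table in place of the x86 structures (all names here are stand-ins):

#include <stdio.h>

#define ENTRIES 4

/* Two-level toy table: top[i] points to a leaf table or is NULL,
 * mirroring the pgd_none()/pgd_present() checks in miniature. */
static int leaf_a[ENTRIES] = { 1, 2, 3, 4 };
static int leaf_b[ENTRIES] = { 5, 6, 7, 8 };
static int *top[ENTRIES] = { leaf_a, NULL, leaf_b, NULL };

static void note(int v) { printf("leaf %d\n", v); }

int main(void)
{
	for (int i = 0; i < ENTRIES; i++) {
		if (!top[i])
			continue;		/* "none" entry: nothing to descend into */
		for (int j = 0; j < ENTRIES; j++)
			note(top[i][j]);	/* visit every next-level entry */
	}
	return 0;
}

The real walker adds two more levels and the large/not-present leaf checks, but the descend-or-skip decision at each entry is the same.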
- */ -static void note_page(void *addr) -{ - static struct page *lastpage; - struct page *page; - - page = virt_to_page(addr); - - if (page != lastpage) { - unsigned int level; - pte_t *pte = lookup_address((unsigned long) addr, &level); - struct page *pt_page2 = pte_page(*pte); - //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2)); - SetPageTOI_Untracked(pt_page2); - lastpage = page; - } -} - -static void walk_pte_level(pmd_t addr) -{ - int i; - pte_t *start; - - start = (pte_t *) pmd_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PTE; i++) { - note_page(start); - start++; - } -} - -#if PTRS_PER_PMD > 1 - -static void walk_pmd_level(pud_t addr) -{ - int i; - pmd_t *start; - - start = (pmd_t *) pud_page_vaddr(addr); - for (i = 0; i < PTRS_PER_PMD; i++) { - if (!pmd_none(*start)) { - if (pmd_large(*start) || !pmd_present(*start)) - note_page(start); - else - walk_pte_level(*start); - } else - note_page(start); - start++; - } -} - -#else -#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a))) -#define pud_large(a) pmd_large(__pmd(pud_val(a))) -#define pud_none(a) pmd_none(__pmd(pud_val(a))) -#endif - -#if PTRS_PER_PUD > 1 - -static void walk_pud_level(pgd_t addr) -{ - int i; - pud_t *start; - - start = (pud_t *) pgd_page_vaddr(addr); - - for (i = 0; i < PTRS_PER_PUD; i++) { - if (!pud_none(*start)) { - if (pud_large(*start) || !pud_present(*start)) - note_page(start); - else - walk_pmd_level(*start); - } else - note_page(start); - - start++; - } -} - -#else -#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a))) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) -#endif - -/* - * Not static in the original at the time of writing, so needs renaming here. - */ -static void toi_ptdump_walk_pgd_level(pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_level4_pgt; -#else - pgd_t *start = swapper_pg_dir; -#endif - int i; - if (pgd) { - start = pgd; - } - - for (i = 0; i < PTRS_PER_PGD; i++) { - if (!pgd_none(*start)) { - if (pgd_large(*start) || !pgd_present(*start)) - note_page(start); - else - walk_pud_level(*start); - } else - note_page(start); - - start++; - } - - /* Flush out the last page */ - note_page(start); -} - -#ifdef CONFIG_PARAVIRT -extern struct pv_info pv_info; - -static void toi_set_paravirt_ops_untracked(void) { - int i; - - unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)), - pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end)); - //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end); - for (i = pvpfn; i <= pvpfn_end; i++) { - SetPageTOI_Untracked(pfn_to_page(i)); - } -} -#else -#define toi_set_paravirt_ops_untracked() { do { } while(0) } -#endif - -extern void toi_mark_per_cpus_pages_untracked(void); - -void toi_untrack_stack(unsigned long *stack) -{ - int i; - struct page *stack_page = virt_to_page(stack); - - for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) { - pr_debug("Untrack stack page %p.\n", page_address(stack_page + i)); - SetPageTOI_Untracked(stack_page + i); - } -} -void toi_untrack_process(struct task_struct *p) -{ - SetPageTOI_Untracked(virt_to_page(p)); - pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p))); - - toi_untrack_stack(p->stack); -} - -void toi_generate_untracked_map(void) -{ - struct task_struct *p, *t; - struct page *page; - pte_t *pte; - int i; - unsigned int level; - static int been_here = 0; - - if (been_here) - return; - - been_here = 1; - - /* Pagetable 
pages */
-	toi_ptdump_walk_pgd_level(NULL);
-
-	/* Printk buffer - not normally needed but can be helpful for debugging. */
-	//toi_set_logbuf_untracked();
-
-	/* Paravirt ops */
-	toi_set_paravirt_ops_untracked();
-
-	/* Task structs and stacks */
-	for_each_process_thread(p, t) {
-	toi_untrack_process(p);
-	//toi_untrack_stack((unsigned long *) t->thread.sp);
-	}
-
-	for (i = 0; i < NR_CPUS; i++) {
-	struct task_struct *idle = idle_task(i);
-
-	if (idle) {
-	pr_debug("Untrack idle process for CPU %d.\n", i);
-	toi_untrack_process(idle);
-	}
-
-	/* IRQ stack */
-	pr_debug("Untrack IRQ stack for CPU %d.\n", i);
-	toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i));
-	}
-
-	/* Per CPU data */
-	//pr_debug("Untracking per CPU variable pages.\n");
-	toi_mark_per_cpus_pages_untracked();
-
-	/* Init stack - for bringing up secondary CPUs */
-	page = virt_to_page(init_stack);
-	for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) {
-	SetPageTOI_Untracked(page + i);
-	}
-
-	pte = lookup_address((unsigned long) &mmu_cr4_features, &level);
-	SetPageTOI_Untracked(pte_page(*pte));
-	SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features));
-}
-
-/**
- * toi_reset_dirtiness_one
- */
-
-void toi_reset_dirtiness_one(unsigned long pfn, int verbose)
-{
-	struct page *page = pfn_to_page(pfn);
-
-	/**
-	 * Don't worry about whether the Dirty flag is
-	 * already set. If this is our first call, it
-	 * won't be.
-	 */
-
-	preempt_disable();
-
-	ClearPageTOI_Dirty(page);
-	SetPageTOI_RO(page);
-	if (verbose)
-	printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page));
-
-	set_memory_ro((unsigned long) page_address(page), 1);
-
-	preempt_enable();
-}
-
-/**
- * TuxOnIce's incremental image support works by marking all memory apart from
- * the page tables read-only, then, in the page faults that result, enabling
- * writing where appropriate and flagging the page as dirty. Free pages are also
- * marked as dirty and not protected so that if allocated, they will be included
- * in the image without further processing.
- *
- * toi_reset_dirtiness is called when an image exists and incremental images are
- * enabled, and each time we resume thereafter. It is not invoked on a fresh boot.
- *
- * This routine should be called from a single-cpu-running context to avoid races in setting
- * page dirty/read only flags.
- *
- * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it!
- *
- * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c.
- **/
-
-int toi_reset_dirtiness(int verbose)
-{
-	struct zone *zone;
-	unsigned long loop;
-	int allocated_map = 0;
-
-	toi_generate_untracked_map();
-
-	if (!free_map) {
-	if (!toi_alloc_bitmap(&free_map))
-	return -ENOMEM;
-	allocated_map = 1;
-	}
-
-	toi_generate_free_page_map();
-
-	pr_debug(KERN_EMERG "Reset dirtiness.\n");
-	for_each_populated_zone(zone) {
-	// 64 bit only. No need to worry about highmem.
-	for (loop = 0; loop < zone->spanned_pages; loop++) {
-	unsigned long pfn = zone->zone_start_pfn + loop;
-	struct page *page;
-	int chunk_size;
-
-	if (!pfn_valid(pfn)) {
-	continue;
-	}
-
-	chunk_size = toi_size_of_free_region(zone, pfn);
-	if (chunk_size) {
-	loop += chunk_size - 1;
-	continue;
-	}
-
-	page = pfn_to_page(pfn);
-
-	if (PageNosave(page) || !saveable_page(zone, pfn)) {
-	continue;
-	}
-
-	if (PageTOI_Untracked(page)) {
-	continue;
-	}
-
-	/**
-	 * Do we need to (re)protect the page?
- * If it is already protected (PageTOI_RO), there is - * nothing to do - skip the following. - * If it is marked as dirty (PageTOI_Dirty), it was - * either free and has been allocated or has been - * written to and marked dirty. Reset the dirty flag - * and (re)apply the protection. - */ - if (!PageTOI_RO(page)) { - toi_reset_dirtiness_one(pfn, verbose); - } - } - } - - pr_debug(KERN_EMERG "Done resetting dirtiness.\n"); - - if (allocated_map) { - toi_free_bitmap(&free_map); - } - return 0; -} - -static int toi_reset_dirtiness_initcall(void) -{ - if (toi_do_incremental_initcall) { - pr_info("TuxOnIce: Enabling dirty page tracking.\n"); - toi_reset_dirtiness(0); - } - return 1; -} -extern void toi_generate_untracked_map(void); - -// Leave early_initcall for pages to register untracked sections. -early_initcall(toi_reset_dirtiness_initcall); - -static int __init toi_incremental_initcall_setup(char *str) -{ - int value; - - if (sscanf(str, "=%d", &value) && value) - toi_do_incremental_initcall = value; - - return 1; -} -__setup("toi_incremental_initcall", toi_incremental_initcall_setup); diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c deleted file mode 100644 index 3c62c2682..000000000 --- a/kernel/power/tuxonice_io.c +++ /dev/null @@ -1,1932 +0,0 @@ -/* - * kernel/power/tuxonice_io.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. - * - */ - -#include <linux/suspend.h> -#include <linux/version.h> -#include <linux/utsname.h> -#include <linux/mount.h> -#include <linux/highmem.h> -#include <linux/kthread.h> -#include <linux/cpu.h> -#include <linux/fs_struct.h> -#include <linux/bio.h> -#include <linux/fs_uuid.h> -#include <linux/kmod.h> -#include <asm/tlbflush.h> - -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_pageflags.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_storage.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice_extent.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_builtin.h" -#include "tuxonice_checksum.h" -#include "tuxonice_alloc.h" -char alt_resume_param[256]; - -/* Version read from image header at resume */ -static int toi_image_header_version; - -#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \ - if (likely(toi_image_header_version >= VERS)) \ - if (toiActiveAllocator->rw_header_chunk(READ, NULL, \ - (char *) &VAR, sizeof(VAR))) { \ - abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \ - ERR_ACT; \ - } \ -} while(0) \ - -/* Variables shared between threads and updated under the mutex */ -static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result; -static int io_index, io_nextupdate, io_pc, io_pc_step; -static DEFINE_MUTEX(io_mutex); -static DEFINE_PER_CPU(struct page *, last_sought); -static DEFINE_PER_CPU(struct page *, last_high_page); -static DEFINE_PER_CPU(char *, checksum_locn); -static DEFINE_PER_CPU(struct pbe *, last_low_page); -static atomic_t io_count; -atomic_t toi_io_workers; - -static int using_flusher; - -DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher); - -int toi_bio_queue_flusher_should_finish; - -int toi_max_workers; - -static char *image_version_error = "The image header version is newer than " \ - "this kernel 
supports."; - -struct toi_module_ops *first_filter; - -static atomic_t toi_num_other_threads; -static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue); -enum toi_worker_commands { - TOI_IO_WORKER_STOP, - TOI_IO_WORKER_RUN, - TOI_IO_WORKER_EXIT -}; -static enum toi_worker_commands toi_worker_command; - -/** - * toi_attempt_to_parse_resume_device - determine if we can hibernate - * - * Can we hibernate, using the current resume= parameter? - **/ -int toi_attempt_to_parse_resume_device(int quiet) -{ - struct list_head *Allocator; - struct toi_module_ops *thisAllocator; - int result, returning = 0; - - if (toi_activate_storage(0)) - return 0; - - toiActiveAllocator = NULL; - clear_toi_state(TOI_RESUME_DEVICE_OK); - clear_toi_state(TOI_CAN_RESUME); - clear_result_state(TOI_ABORTED); - - if (!toiNumAllocators) { - if (!quiet) - printk(KERN_INFO "TuxOnIce: No storage allocators have " - "been registered. Hibernating will be " - "disabled.\n"); - goto cleanup; - } - - list_for_each(Allocator, &toiAllocators) { - thisAllocator = list_entry(Allocator, struct toi_module_ops, - type_list); - - /* - * Not sure why you'd want to disable an allocator, but - * we should honour the flag if we're providing it - */ - if (!thisAllocator->enabled) - continue; - - result = thisAllocator->parse_sig_location( - resume_file, (toiNumAllocators == 1), - quiet); - - switch (result) { - case -EINVAL: - /* For this allocator, but not a valid - * configuration. Error already printed. */ - goto cleanup; - - case 0: - /* For this allocator and valid. */ - toiActiveAllocator = thisAllocator; - - set_toi_state(TOI_RESUME_DEVICE_OK); - set_toi_state(TOI_CAN_RESUME); - returning = 1; - goto cleanup; - } - } - if (!quiet) - printk(KERN_INFO "TuxOnIce: No matching enabled allocator " - "found. Resuming disabled.\n"); -cleanup: - toi_deactivate_storage(0); - return returning; -} - -void attempt_to_parse_resume_device2(void) -{ - toi_prepare_usm(); - toi_attempt_to_parse_resume_device(0); - toi_cleanup_usm(); -} - -void save_restore_alt_param(int replace, int quiet) -{ - static char resume_param_save[255]; - static unsigned long toi_state_save; - - if (replace) { - toi_state_save = toi_state; - strcpy(resume_param_save, resume_file); - strcpy(resume_file, alt_resume_param); - } else { - strcpy(resume_file, resume_param_save); - toi_state = toi_state_save; - } - toi_attempt_to_parse_resume_device(quiet); -} - -void attempt_to_parse_alt_resume_param(void) -{ - int ok = 0; - - /* Temporarily set resume_param to the poweroff value */ - if (!strlen(alt_resume_param)) - return; - - printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n"); - save_restore_alt_param(SAVE, NOQUIET); - if (test_toi_state(TOI_CAN_RESUME)) - ok = 1; - - printk(KERN_INFO "=== Done ===\n"); - save_restore_alt_param(RESTORE, QUIET); - - /* If not ok, clear the string */ - if (ok) - return; - - printk(KERN_INFO "Can't resume from that location; clearing " - "alt_resume_param.\n"); - alt_resume_param[0] = '\0'; -} - -/** - * noresume_reset_modules - reset data structures in case of non resuming - * - * When we read the start of an image, modules (and especially the - * active allocator) might need to reset data structures if we - * decide to remove the image rather than resuming from it. 
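In miniature, the first-match scan in toi_attempt_to_parse_resume_device() looks like the sketch below. struct allocator and its fields are hypothetical simplifications of struct toi_module_ops, not the kernel's own types, and the list walk is flattened into an array.

#include <errno.h>
#include <stddef.h>

/* Hypothetical stand-in for struct toi_module_ops. */
struct allocator {
	const char *name;
	int enabled;
	/* 0: claims the device; -EINVAL: claims it but misconfigured;
	 * anything else: not this allocator's signature. */
	int (*parse_sig_location)(const char *resume_file);
};

/* First enabled allocator that recognises resume_file wins;
 * a recognised-but-invalid configuration aborts the scan. */
static struct allocator *select_allocator(struct allocator *a, size_t n,
					  const char *resume_file)
{
	for (size_t i = 0; i < n; i++) {
		if (!a[i].enabled)
			continue;
		switch (a[i].parse_sig_location(resume_file)) {
		case -EINVAL:
			return NULL;
		case 0:
			return &a[i];
		}
	}
	return NULL;
}

Returning NULL on -EINVAL mirrors the early goto cleanup above: the device was recognised as this allocator's, so trying further allocators would be wrong.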
- **/
-static void noresume_reset_modules(void)
-{
-	struct toi_module_ops *this_filter;
-
-	list_for_each_entry(this_filter, &toi_filters, type_list)
-	if (this_filter->noresume_reset)
-	this_filter->noresume_reset();
-
-	if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
-	toiActiveAllocator->noresume_reset();
-}
-
-/**
- * fill_toi_header - fill the hibernate header structure
- * @sh: Header data structure to be filled.
- **/
-static int fill_toi_header(struct toi_header *sh)
-{
-	int i, error;
-
-	error = init_header((struct swsusp_info *) sh);
-	if (error)
-	return error;
-
-	sh->pagedir = pagedir1;
-	sh->pageset_2_size = pagedir2.size;
-	sh->param0 = toi_result;
-	sh->param1 = toi_bkd.toi_action;
-	sh->param2 = toi_bkd.toi_debug_state;
-	sh->param3 = toi_bkd.toi_default_console_level;
-	sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
-	for (i = 0; i < 4; i++)
-	sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
-	sh->bkd = boot_kernel_data_buffer;
-	return 0;
-}
-
-/**
- * rw_init_modules - initialize modules
- * @rw: Whether we are reading or writing an image.
- * @which: Section of the image being processed.
- *
- * Iterate over modules, preparing the ones that will be used to read or write
- * data.
- **/
-static int rw_init_modules(int rw, int which)
-{
-	struct toi_module_ops *this_module;
-	/* Initialise page transformers */
-	list_for_each_entry(this_module, &toi_filters, type_list) {
-	if (!this_module->enabled)
-	continue;
-	if (this_module->rw_init && this_module->rw_init(rw, which)) {
-	abort_hibernate(TOI_FAILED_MODULE_INIT,
-	"Failed to initialize the %s filter.",
-	this_module->name);
-	return 1;
-	}
-	}
-
-	/* Initialise allocator */
-	if (toiActiveAllocator->rw_init(rw, which)) {
-	abort_hibernate(TOI_FAILED_MODULE_INIT,
-	"Failed to initialise the allocator.");
-	return 1;
-	}
-
-	/* Initialise other modules */
-	list_for_each_entry(this_module, &toi_modules, module_list) {
-	if (!this_module->enabled ||
-	this_module->type == FILTER_MODULE ||
-	this_module->type == WRITER_MODULE)
-	continue;
-	if (this_module->rw_init && this_module->rw_init(rw, which)) {
-	set_abort_result(TOI_FAILED_MODULE_INIT);
-	printk(KERN_INFO "Setting aborted flag due to module "
-	"init failure.\n");
-	return 1;
-	}
-	}
-
-	return 0;
-}
-
-/**
- * rw_cleanup_modules - cleanup modules
- * @rw: Whether we are reading or writing an image.
- *
- * Cleanup components after reading or writing a set of pages.
- * Only the allocator may fail.
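The init/cleanup pair documented here follows a fail-fast/clean-everything split: initialisation stops at the first failing module, while cleanup visits every module and ORs the failures together. A compact user-space sketch of that contract, with a hypothetical struct mod_ops standing in for the kernel's list-threaded struct toi_module_ops:

#include <stddef.h>

struct mod_ops {
	const char *name;
	int enabled;
	int (*rw_init)(int rw, int which);	/* optional */
	int (*rw_cleanup)(int rw);		/* optional */
};

/* Initialise every enabled module; stop at the first failure. */
static int init_all(struct mod_ops *mods, size_t n, int rw, int which)
{
	for (size_t i = 0; i < n; i++) {
		if (!mods[i].enabled || !mods[i].rw_init)
			continue;
		if (mods[i].rw_init(rw, which))
			return 1;	/* caller aborts the cycle */
	}
	return 0;
}

/* Cleanup never stops early: every module gets its rw_cleanup call
 * and the failures are OR-ed together, as in rw_cleanup_modules(). */
static int cleanup_all(struct mod_ops *mods, size_t n, int rw)
{
	int result = 0;

	for (size_t i = 0; i < n; i++)
		if (mods[i].enabled && mods[i].rw_cleanup)
			result |= mods[i].rw_cleanup(rw);
	return result;
}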
- **/ -static int rw_cleanup_modules(int rw) -{ - struct toi_module_ops *this_module; - int result = 0; - - /* Cleanup other modules */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - this_module->type == FILTER_MODULE || - this_module->type == WRITER_MODULE) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - /* Flush data and cleanup */ - list_for_each_entry(this_module, &toi_filters, type_list) { - if (!this_module->enabled) - continue; - if (this_module->rw_cleanup) - result |= this_module->rw_cleanup(rw); - } - - result |= toiActiveAllocator->rw_cleanup(rw); - - return result; -} - -static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high) -{ - int index, min, max; - struct page *high_page = NULL, - **my_last_high_page = raw_cpu_ptr(&last_high_page), - **my_last_sought = raw_cpu_ptr(&last_sought); - struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page); - void *compare; - - if (is_high) { - if (*my_last_sought && *my_last_high_page && - *my_last_sought < orig_page) - high_page = *my_last_high_page; - else - high_page = (struct page *) restore_highmem_pblist; - this = (struct pbe *) kmap(high_page); - compare = orig_page; - } else { - if (*my_last_sought && *my_last_low_page && - *my_last_sought < orig_page) - this = *my_last_low_page; - else - this = restore_pblist; - compare = page_address(orig_page); - } - - *my_last_sought = orig_page; - - /* Locate page containing pbe */ - while (this[PBES_PER_PAGE - 1].next && - this[PBES_PER_PAGE - 1].orig_address < compare) { - if (is_high) { - struct page *next_high_page = (struct page *) - this[PBES_PER_PAGE - 1].next; - kunmap(high_page); - this = kmap(next_high_page); - high_page = next_high_page; - } else - this = this[PBES_PER_PAGE - 1].next; - } - - /* Do a binary search within the page */ - min = 0; - max = PBES_PER_PAGE; - index = PBES_PER_PAGE / 2; - while (max - min) { - if (!this[index].orig_address || - this[index].orig_address > compare) - max = index; - else if (this[index].orig_address == compare) { - if (is_high) { - struct page *page = this[index].address; - *my_last_high_page = high_page; - kunmap(high_page); - return page; - } - *my_last_low_page = this; - return virt_to_page(this[index].address); - } else - min = index; - index = ((max + min) / 2); - }; - - if (is_high) - kunmap(high_page); - - abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for" - " orig page %p. This[min].orig_address=%p.\n", orig_page, - this[index].orig_address); - return NULL; -} - -/** - * write_next_page - write the next page in a pageset - * @data_pfn: The pfn where the next data to write is located. - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn number to write in the image (where the data belongs). - * - * Get the pfn of the next page to write, map the page if necessary and do the - * write. - **/ -static int write_next_page(unsigned long *data_pfn, int *my_io_index, - unsigned long *write_pfn) -{ - struct page *page; - char **my_checksum_locn = raw_cpu_ptr(&checksum_locn); - int result = 0, was_present; - - *data_pfn = memory_bm_next_pfn(io_map, 0); - - /* Another thread could have beaten us to it. 
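The lookup in copy_page_from_orig_page() walks the chain of pbe pages and then binary-searches within one page-sized array ordered by original address. Restated as a conventional lo/hi binary search over a hypothetical flattened entry type (the page chaining and kmap handling are elided):

#include <stdint.h>
#include <stddef.h>

/* Hypothetical flattened pbe: where a saved page's data should go. */
struct pbe_lite {
	uintptr_t orig;	/* original address, the sort key (0 = unused slot) */
	void *copy;	/* address of the restored copy */
};

static void *find_copy(const struct pbe_lite *p, int nr, uintptr_t orig)
{
	int lo = 0, hi = nr - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (!p[mid].orig || p[mid].orig > orig)
			hi = mid - 1;
		else if (p[mid].orig < orig)
			lo = mid + 1;
		else
			return p[mid].copy;
	}
	return NULL;	/* the caller treats this as a fatal error */
}

A zero key marks an unused slot, which the search above treats the same way the original's min/max loop does: as being past the last valid entry.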
*/ - if (*data_pfn == BM_END_OF_MAP) { - if (atomic_read(&io_count)) { - printk(KERN_INFO "Ran out of pfns but io_count is " - "still %d.\n", atomic_read(&io_count)); - BUG(); - } - mutex_unlock(&io_mutex); - return -ENODATA; - } - - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - memory_bm_clear_bit(io_map, 0, *data_pfn); - page = pfn_to_page(*data_pfn); - - was_present = kernel_page_present(page); - if (!was_present) - kernel_map_pages(page, 1, 1); - - if (io_pageset == 1) - *write_pfn = memory_bm_next_pfn(pageset1_map, 0); - else { - *write_pfn = *data_pfn; - *my_checksum_locn = tuxonice_get_next_checksum(); - } - - TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index); - - mutex_unlock(&io_mutex); - - if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn)) - return 1; - - result = first_filter->write_page(*write_pfn, TOI_PAGE, page, - PAGE_SIZE); - - if (!was_present) - kernel_map_pages(page, 1, 0); - - return result; -} - -/** - * read_next_page - read the next page in a pageset - * @my_io_index: The index of the page in the pageset. - * @write_pfn: The pfn in which the data belongs. - * - * Read a page of the image into our buffer. It can happen (here and in the - * write routine) that threads don't get run until after other CPUs have done - * all the work. This was the cause of the long standing issue with - * occasionally getting -ENODATA errors at the end of reading the image. We - * therefore need to check there's actually a page to read before trying to - * retrieve one. - **/ - -static int read_next_page(int *my_io_index, unsigned long *write_pfn, - struct page *buffer) -{ - unsigned int buf_size = PAGE_SIZE; - unsigned long left = atomic_read(&io_count); - - if (!left) - return -ENODATA; - - /* Start off assuming the page we read isn't resaved */ - *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); - - mutex_unlock(&io_mutex); - - /* - * Are we aborting? If so, don't submit any more I/O as - * resetting the resume_attempted flag (from ui.c) will - * clear the bdev flags, making this thread oops. - */ - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - atomic_dec(&toi_io_workers); - if (!atomic_read(&toi_io_workers)) { - /* - * So we can be sure we'll have memory for - * marking that we haven't resumed. - */ - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - - /* - * See toi_bio_read_page in tuxonice_bio.c: - * read the next page in the image. - */ - return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size); -} - -static void use_read_page(unsigned long write_pfn, struct page *buffer) -{ - struct page *final_page = pfn_to_page(write_pfn), - *copy_page = final_page; - char *virt, *buffer_virt; - int was_present, cpu = smp_processor_id(); - unsigned long idx = 0; - - if (io_pageset == 1 && (!pageset1_copy_map || - !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) { - int is_high = PageHighMem(final_page); - copy_page = copy_page_from_orig_page(is_high ? 
(void *) write_pfn : final_page, is_high); - } - - if (!memory_bm_test_bit(io_map, cpu, write_pfn)) { - int test = !memory_bm_test_bit(io_map, cpu, write_pfn); - toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test); - mutex_lock(&io_mutex); - idx = atomic_add_return(1, &io_count); - mutex_unlock(&io_mutex); - return; - } - - virt = kmap(copy_page); - buffer_virt = kmap(buffer); - was_present = kernel_page_present(copy_page); - if (!was_present) - kernel_map_pages(copy_page, 1, 1); - memcpy(virt, buffer_virt, PAGE_SIZE); - if (!was_present) - kernel_map_pages(copy_page, 1, 0); - kunmap(copy_page); - kunmap(buffer); - memory_bm_clear_bit(io_map, cpu, write_pfn); - TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset); -} - -static unsigned long status_update(int writing, unsigned long done, - unsigned long ticks) -{ - int cs_index = writing ? 0 : 1; - unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks; - unsigned long msec = jiffies_to_msecs(abs(ticks_so_far)); - unsigned long pgs_per_s, estimate = 0, pages_left; - - if (msec) { - pages_left = io_barmax - done; - pgs_per_s = 1000 * done / msec; - if (pgs_per_s) - estimate = DIV_ROUND_UP(pages_left, pgs_per_s); - } - - if (estimate && ticks > HZ / 2) - return toi_update_status(done, io_barmax, - " %d/%d MB (%lu sec left)", - MB(done+1), MB(io_barmax), estimate); - - return toi_update_status(done, io_barmax, " %d/%d MB", - MB(done+1), MB(io_barmax)); -} - -/** - * worker_rw_loop - main loop to read/write pages - * - * The main I/O loop for reading or writing pages. The io_map bitmap is used to - * track the pages to read/write. - * If we are reading, the pages are loaded to their final (mapped) pfn. - * Data is non zero iff this is a thread started via start_other_threads. - * In that case, we stay in here until told to quit. - **/ -static int worker_rw_loop(void *data) -{ - unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4, - jif_index = 1, start_time = jiffies, thread_num; - int result = 0, my_io_index = 0, last_worker; - struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP); - cpumask_var_t orig_mask; - - if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) { - printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data); - result = -ENOMEM; - goto out; - } - - cpumask_copy(orig_mask, tsk_cpus_allowed(current)); - - current->flags |= PF_NOFREEZE; - -top: - mutex_lock(&io_mutex); - thread_num = atomic_read(&toi_io_workers); - - cpumask_copy(tsk_cpus_allowed(current), orig_mask); - schedule(); - - atomic_inc(&toi_io_workers); - - while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) && - !(io_write && test_result_state(TOI_ABORTED)) && - toi_worker_command == TOI_IO_WORKER_RUN) { - if (!thread_num && jiffies > next_jiffies) { - next_jiffies += HZ / 4; - if (toiActiveAllocator->update_throughput_throttle) - toiActiveAllocator->update_throughput_throttle( - jif_index); - jif_index++; - } - - /* - * What page to use? If reading, don't know yet which page's - * data will be read, so always use the buffer. If writing, - * use the copy (Pageset1) or original page (Pageset2), but - * always write the pfn of the original page. - */ - if (io_write) - result = write_next_page(&data_pfn, &my_io_index, - &write_pfn); - else /* Reading */ - result = read_next_page(&my_io_index, &write_pfn, - buffer); - - if (result) { - mutex_lock(&io_mutex); - /* Nothing to do? 
*/ - if (result == -ENODATA) { - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Thread %d has no more work.", - smp_processor_id()); - break; - } - - io_result = result; - - if (io_write) { - printk(KERN_INFO "Write chunk returned %d.\n", - result); - abort_hibernate(TOI_FAILED_IO, - "Failed to write a chunk of the " - "image."); - break; - } - - if (io_pageset == 1) { - printk(KERN_ERR "\nBreaking out of I/O loop " - "because of result code %d.\n", result); - break; - } - panic("Read chunk returned (%d)", result); - } - - /* - * Discard reads of resaved pages while reading ps2 - * and unwanted pages while rereading ps2 when aborting. - */ - if (!io_write) { - if (!PageResave(pfn_to_page(write_pfn))) - use_read_page(write_pfn, buffer); - else { - mutex_lock(&io_mutex); - toi_message(TOI_IO, TOI_VERBOSE, 0, - "Resaved %ld.", write_pfn); - atomic_inc(&io_count); - mutex_unlock(&io_mutex); - } - } - - if (!thread_num) { - if(my_io_index + io_base > io_nextupdate) - io_nextupdate = status_update(io_write, - my_io_index + io_base, - jiffies - start_time); - - if (my_io_index > io_pc) { - printk(KERN_CONT "...%d%%", 20 * io_pc_step); - io_pc_step++; - io_pc = io_finish_at * io_pc_step / 5; - } - } - - toi_cond_pause(0, NULL); - - /* - * Subtle: If there's less I/O still to be done than threads - * running, quit. This stops us doing I/O beyond the end of - * the image when reading. - * - * Possible race condition. Two threads could do the test at - * the same time; one should exit and one should continue. - * Therefore we take the mutex before comparing and exiting. - */ - - mutex_lock(&io_mutex); - } - - last_worker = atomic_dec_and_test(&toi_io_workers); - toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers)); - mutex_unlock(&io_mutex); - - if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) { - /* Were we the last thread and we're using a flusher thread? */ - if (last_worker && using_flusher) { - toiActiveAllocator->finish_all_io(); - } - /* First, if we're doing I/O, wait for it to finish */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN); - /* Then wait to be told what to do next */ - wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP); - if (toi_worker_command == TOI_IO_WORKER_RUN) - goto top; - } - - if (thread_num) - atomic_dec(&toi_num_other_threads); - -out: - toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num); - toi__free_page(28, buffer); - free_cpumask_var(orig_mask); - - return result; -} - -int toi_start_other_threads(void) -{ - int cpu; - struct task_struct *p; - int to_start = (toi_max_workers ? 
toi_max_workers : num_online_cpus()) - 1; - unsigned long num_started = 0; - - if (test_action_state(TOI_NO_MULTITHREADED_IO)) - return 0; - - toi_worker_command = TOI_IO_WORKER_STOP; - - for_each_online_cpu(cpu) { - if (num_started == to_start) - break; - - if (cpu == smp_processor_id()) - continue; - - p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1, - cpu_to_node(cpu), "ktoi_io/%d", cpu); - if (IS_ERR(p)) { - printk(KERN_ERR "ktoi_io for %i failed\n", cpu); - continue; - } - kthread_bind(p, cpu); - p->flags |= PF_MEMALLOC; - wake_up_process(p); - num_started++; - atomic_inc(&toi_num_other_threads); - } - - toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started); - return num_started; -} - -void toi_stop_other_threads(void) -{ - toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads."); - toi_worker_command = TOI_IO_WORKER_EXIT; - wake_up(&toi_worker_wait_queue); -} - -/** - * do_rw_loop - main highlevel function for reading or writing pages - * - * Create the io_map bitmap and call worker_rw_loop to perform I/O operations. - **/ -static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags, - int base, int barmax, int pageset) -{ - int index = 0, cpu, result = 0, workers_started; - unsigned long pfn, next; - - first_filter = toi_get_next_filter(NULL); - - if (!finish_at) - return 0; - - io_write = write; - io_finish_at = finish_at; - io_base = base; - io_barmax = barmax; - io_pageset = pageset; - io_index = 0; - io_pc = io_finish_at / 5; - io_pc_step = 1; - io_result = 0; - io_nextupdate = base + 1; - toi_bio_queue_flusher_should_finish = 0; - - for_each_online_cpu(cpu) { - per_cpu(last_sought, cpu) = NULL; - per_cpu(last_low_page, cpu) = NULL; - per_cpu(last_high_page, cpu) = NULL; - } - - /* Ensure all bits clear */ - memory_bm_clear(io_map); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - - BUG_ON(next != BM_END_OF_MAP); - - /* Set the bits for the pages to write */ - memory_bm_position_reset(pageflags); - - pfn = memory_bm_next_pfn(pageflags, 0); - toi_trace_index++; - - while (pfn != BM_END_OF_MAP && index < finish_at) { - TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at); - memory_bm_set_bit(io_map, 0, pfn); - pfn = memory_bm_next_pfn(pageflags, 0); - index++; - } - - BUG_ON(next != BM_END_OF_MAP || index < finish_at); - - memory_bm_position_reset(io_map); - toi_trace_index++; - - atomic_set(&io_count, finish_at); - - memory_bm_position_reset(pageset1_map); - - mutex_lock(&io_mutex); - - clear_toi_state(TOI_IO_STOPPED); - - using_flusher = (atomic_read(&toi_num_other_threads) && - toiActiveAllocator->io_flusher && - !test_action_state(TOI_NO_FLUSHER_THREAD)); - - workers_started = atomic_read(&toi_num_other_threads); - - memory_bm_position_reset(io_map); - memory_bm_position_reset(pageset1_copy_map); - - toi_worker_command = TOI_IO_WORKER_RUN; - wake_up(&toi_worker_wait_queue); - - mutex_unlock(&io_mutex); - - if (using_flusher) - result = toiActiveAllocator->io_flusher(write); - else - worker_rw_loop(NULL); - - while (atomic_read(&toi_io_workers)) - schedule(); - - printk(KERN_CONT "\n"); - - toi_worker_command = TOI_IO_WORKER_STOP; - wake_up(&toi_worker_wait_queue); - - if (unlikely(test_toi_state(TOI_STOP_RESUME))) { - if (!atomic_read(&toi_io_workers)) { - rw_cleanup_modules(READ); - set_toi_state(TOI_IO_STOPPED); - } - while (1) - schedule(); - } - set_toi_state(TOI_IO_STOPPED); - - if (!io_result && !result && !test_result_state(TOI_ABORTED)) { - unsigned long 
next; - - toi_update_status(io_base + io_finish_at, io_barmax, - " %d/%d MB ", - MB(io_base + io_finish_at), MB(io_barmax)); - - memory_bm_position_reset(io_map); - next = memory_bm_next_pfn(io_map, 0); - if (next != BM_END_OF_MAP) { - printk(KERN_INFO "Finished I/O loop but still work to " - "do?\nFinish at = %d. io_count = %d.\n", - finish_at, atomic_read(&io_count)); - printk(KERN_INFO "I/O bitmap still records work to do." - "%ld.\n", next); - BUG(); - do { - cpu_relax(); - } while (0); - } - } - - return io_result ? io_result : result; -} - -/** - * write_pageset - write a pageset to disk. - * @pagedir: Which pagedir to write. - * - * Returns: - * Zero on success or -1 on failure. - **/ -int write_pageset(struct pagedir *pagedir) -{ - int finish_at, base = 0; - int barmax = pagedir1.size + pagedir2.size; - long error = 0; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - /* - * Even if there is nothing to read or write, the allocator - * may need the init/cleanup for it's housekeeping. (eg: - * Pageset1 may start where pageset2 ends when writing). - */ - finish_at = pagedir->size; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Writing kernel & process data..."); - base = pagedir2.size; - if (test_action_state(TOI_TEST_FILTER_SPEED) || - test_action_state(TOI_TEST_BIO)) - pageflags = pageset1_map; - else - pageflags = pageset1_copy_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Writing caches..."); - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(WRITE, pagedir->id)) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Failed to initialise modules for writing."); - error = 1; - } - - if (!error) - error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(WRITE) && !error) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after writing."); - error = 1; - } - - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[0][0] += finish_at, - toi_bkd.toi_io_time[0][1] += (end_time - start_time); - } - - return error; -} - -/** - * read_pageset - highlevel function to read a pageset from disk - * @pagedir: pageset to read - * @overwrittenpagesonly: Whether to read the whole pageset or - * only part of it. - * - * Returns: - * Zero on success or -1 on failure. 
- **/ -static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly) -{ - int result = 0, base = 0; - int finish_at = pagedir->size; - int barmax = pagedir1.size + pagedir2.size; - struct memory_bitmap *pageflags; - unsigned long start_time, end_time; - - if (pagedir->id == 1) { - toi_prepare_status(DONT_CLEAR_BAR, - "Reading kernel & process data..."); - pageflags = pageset1_map; - } else { - toi_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); - if (overwrittenpagesonly) { - barmax = min(pagedir1.size, pagedir2.size); - finish_at = min(pagedir1.size, pagedir2.size); - } else - base = pagedir1.size; - pageflags = pageset2_map; - } - - start_time = jiffies; - - if (rw_init_modules(READ, pagedir->id)) { - toiActiveAllocator->remove_image(); - result = 1; - } else - result = do_rw_loop(READ, finish_at, pageflags, base, barmax, - pagedir->id); - - if (rw_cleanup_modules(READ) && !result) { - abort_hibernate(TOI_FAILED_MODULE_CLEANUP, - "Failed to cleanup after reading."); - result = 1; - } - - /* Statistics */ - end_time = jiffies; - - if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { - toi_bkd.toi_io_time[1][0] += finish_at, - toi_bkd.toi_io_time[1][1] += (end_time - start_time); - } - - return result; -} - -/** - * write_module_configs - store the modules configuration - * - * The configuration for each module is stored in the image header. - * Returns: Int - * Zero on success, Error value otherwise. - **/ -static int write_module_configs(void) -{ - struct toi_module_ops *this_module; - char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP); - int len, index = 1; - struct toi_module_header toi_module_header; - - if (!buffer) { - printk(KERN_INFO "Failed to allocate a buffer for saving " - "module configuration info.\n"); - return -ENOMEM; - } - - /* - * We have to know which data goes with which module, so we at - * least write a length of zero for a module. Note that we are - * also assuming every module's config data takes <= PAGE_SIZE. - */ - - /* For each module (in registration order) */ - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->storage_needed || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - - /* Get the data from the module */ - len = 0; - if (this_module->save_config_info) - len = this_module->save_config_info(buffer); - - /* Save the details of the module */ - toi_module_header.enabled = this_module->enabled; - toi_module_header.type = this_module->type; - toi_module_header.index = index++; - strncpy(toi_module_header.name, this_module->name, - sizeof(toi_module_header.name)); - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - /* Save the size of the data and any data returned */ - toiActiveAllocator->rw_header_chunk(WRITE, - this_module, - (char *) &len, sizeof(int)); - if (len) - toiActiveAllocator->rw_header_chunk( - WRITE, this_module, buffer, len); - } - - /* Write a blank header to terminate the list */ - toi_module_header.name[0] = '\0'; - toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_module_header, sizeof(toi_module_header)); - - toi_free_page(22, (unsigned long) buffer); - return 0; -} - -/** - * read_one_module_config - read and configure one module - * - * Read the configuration for one module, and configure the module - * to match if it is loaded. - * - * Returns: Int - * Zero on success, Error value otherwise. 
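The header stream write_module_configs() emits is a sequence of (module header, length, payload) records closed by a blank-named header. A user-space sketch of that layout; struct rec_head is a hypothetical stand-in for struct toi_module_header, and the stream here is a plain FILE rather than the allocator's rw_header_chunk:

#include <stdio.h>

struct rec_head {
	char name[32];
	int enabled;
};

static int write_record(FILE *f, const char *name, int enabled,
			const void *data, int len)
{
	struct rec_head h = { .enabled = enabled };

	snprintf(h.name, sizeof(h.name), "%s", name ? name : "");
	if (fwrite(&h, sizeof(h), 1, f) != 1 ||
	    fwrite(&len, sizeof(len), 1, f) != 1)
		return -1;
	return (len && fwrite(data, len, 1, f) != 1) ? -1 : 0;
}

/* The list ends with a record whose name is empty - the same
 * terminator convention the code above uses. */
static int write_terminator(FILE *f)
{
	return write_record(f, "", 0, NULL, 0);
}

Carrying an explicit length even for empty configurations is what lets a reader skip records belonging to modules it does not recognise.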
- **/ -static int read_one_module_config(struct toi_module_header *header) -{ - struct toi_module_ops *this_module; - int result, len; - char *buffer; - - /* Find the module */ - this_module = toi_find_module_given_name(header->name); - - if (!this_module) { - if (header->enabled) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - "It looks like we need module %s for reading " - "the image but it hasn't been registered.\n", - header->name); - if (!(test_toi_state(TOI_CONTINUE_REQ))) - return -EINVAL; - } else - printk(KERN_INFO "Module %s configuration data found, " - "but the module hasn't registered. Looks like " - "it was disabled, so we're ignoring its data.", - header->name); - } - - /* Get the length of the data (if any) */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len, - sizeof(int)); - if (result) { - printk(KERN_ERR "Failed to read the length of the module %s's" - " configuration data.\n", - header->name); - return -EINVAL; - } - - /* Read any data and pass to the module (if we found one) */ - if (!len) - return 0; - - buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP); - - if (!buffer) { - printk(KERN_ERR "Failed to allocate a buffer for reloading " - "module configuration info.\n"); - return -ENOMEM; - } - - toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len); - - if (!this_module) - goto out; - - if (!this_module->save_config_info) - printk(KERN_ERR "Huh? Module %s appears to have a " - "save_config_info, but not a load_config_info " - "function!\n", this_module->name); - else - this_module->load_config_info(buffer, len); - - /* - * Now move this module to the tail of its lists. This will put it in - * order. Any new modules will end up at the top of the lists. They - * should have been set to disabled when loaded (people will - * normally not edit an initrd to load a new module and then hibernate - * without using it!). - */ - - toi_move_module_tail(this_module); - - this_module->enabled = header->enabled; - -out: - toi_free_page(23, (unsigned long) buffer); - return 0; -} - -/** - * read_module_configs - reload module configurations from the image header. - * - * Returns: Int - * Zero on success or an error code. - **/ -static int read_module_configs(void) -{ - int result = 0; - struct toi_module_header toi_module_header; - struct toi_module_ops *this_module; - - /* All modules are initially disabled. That way, if we have a module - * loaded now that wasn't loaded when we hibernated, it won't be used - * in trying to read the data. - */ - list_for_each_entry(this_module, &toi_modules, module_list) - this_module->enabled = 0; - - /* Get the first module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - if (result) { - printk(KERN_ERR "Failed to read the next module header.\n"); - return -EINVAL; - } - - /* For each module (in registration order) */ - while (toi_module_header.name[0]) { - result = read_one_module_config(&toi_module_header); - - if (result) - return -EINVAL; - - /* Get the next module header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - (char *) &toi_module_header, - sizeof(toi_module_header)); - - if (result) { - printk(KERN_ERR "Failed to read the next module " - "header.\n"); - return -EINVAL; - } - } - - return 0; -} - -static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev) -{ - return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 
0 : 1; -} - -int fs_info_space_needed(void) -{ - const struct super_block *sb; - int result = sizeof(int); - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - result += 16 + sizeof(dev_t) + sizeof(int) + - fs->last_mount_size; - free_fs_info(fs); - } - return result; -} - -static int fs_info_num_to_save(void) -{ - const struct super_block *sb; - int to_save = 0; - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) - to_save++; - free_fs_info(fs); - } - - return to_save; -} - -static int fs_info_save(void) -{ - const struct super_block *sb; - int to_save = fs_info_num_to_save(); - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info" - " to save."); - return -EIO; - } - - list_for_each_entry(sb, &super_blocks, s_list) { - struct fs_info *fs; - - if (!sb->s_bdev) - continue; - - fs = fs_info_from_block_dev(sb->s_bdev); - if (save_fs_info(fs, sb->s_bdev)) { - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - &fs->uuid[0], 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->dev_t, sizeof(dev_t))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write dev_t."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &fs->last_mount_size, sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write last mount length."); - return -EIO; - } - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - fs->last_mount, fs->last_mount_size)) { - abort_hibernate(TOI_FAILED_IO, "Failed to " - "write uuid."); - return -EIO; - } - } - free_fs_info(fs); - } - return 0; -} - -static int fs_info_load_and_check_one(void) -{ - char uuid[16], *last_mount; - int result = 0, ln; - dev_t dev_t; - struct block_device *dev; - struct fs_info *fs_info, seek; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) { - abort_hibernate(TOI_FAILED_IO, "Failed to read uuid."); - return -EIO; - } - - read_if_version(3, dev_t, "uuid dev_t field", return -EIO); - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount size."); - return -EIO; - } - - last_mount = kzalloc(ln, GFP_KERNEL); - - if (!last_mount) - return -ENOMEM; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to read last mount timestamp."); - result = -EIO; - goto out_lmt; - } - - strncpy((char *) &seek.uuid, uuid, 16); - seek.dev_t = dev_t; - seek.last_mount_size = ln; - seek.last_mount = last_mount; - dev_t = blk_lookup_fs_info(&seek); - if (!dev_t) - goto out_lmt; - - dev = toi_open_by_devnum(dev_t); - - fs_info = fs_info_from_block_dev(dev); - if (fs_info && !IS_ERR(fs_info)) { - if (ln != fs_info->last_mount_size) { - printk(KERN_EMERG "Found matching uuid but last mount " - "time lengths differ?! 
" - "(%d vs %d).\n", ln, - fs_info->last_mount_size); - result = -EINVAL; - } else { - char buf[BDEVNAME_SIZE]; - result = !!memcmp(fs_info->last_mount, last_mount, ln); - if (result) - printk(KERN_EMERG "Last mount time for %s has " - "changed!\n", bdevname(dev, buf)); - } - } - toi_close_bdev(dev); - free_fs_info(fs_info); -out_lmt: - kfree(last_mount); - return result; -} - -static int fs_info_load_and_check(void) -{ - int to_do, result = 0; - - if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do, - sizeof(int))) { - abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info " - "to load."); - return -EIO; - } - - while(to_do--) - result |= fs_info_load_and_check_one(); - - return result; -} - -/** - * write_image_header - write the image header after write the image proper - * - * Returns: Int - * Zero on success, error value otherwise. - **/ -int write_image_header(void) -{ - int ret; - int total = pagedir1.size + pagedir2.size+2; - char *header_buffer = NULL; - - /* Now prepare to write the header */ - ret = toiActiveAllocator->write_header_init(); - if (ret) { - abort_hibernate(TOI_FAILED_MODULE_INIT, - "Active allocator's write_header_init" - " function failed."); - goto write_image_header_abort; - } - - /* Get a buffer */ - header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP); - if (!header_buffer) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Out of memory when trying to get page for header!"); - goto write_image_header_abort; - } - - /* Write hibernate header */ - if (fill_toi_header((struct toi_header *) header_buffer)) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to fill header information!"); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - header_buffer, sizeof(struct toi_header))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to write header info."); - goto write_image_header_abort; - } - - if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, - (char *) &toi_max_workers, sizeof(toi_max_workers))) { - abort_hibernate(TOI_OUT_OF_MEMORY, - "Failure to number of workers to use."); - goto write_image_header_abort; - } - - /* Write filesystem info */ - if (fs_info_save()) - goto write_image_header_abort; - - /* Write module configurations */ - ret = write_module_configs(); - if (ret) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write module configs."); - goto write_image_header_abort; - } - - if (memory_bm_write(pageset1_map, - toiActiveAllocator->rw_header_chunk)) { - abort_hibernate(TOI_FAILED_IO, - "Failed to write bitmaps."); - goto write_image_header_abort; - } - - /* Flush data and let allocator cleanup */ - if (toiActiveAllocator->write_header_cleanup()) { - abort_hibernate(TOI_FAILED_IO, - "Failed to cleanup writing header."); - goto write_image_header_abort_no_cleanup; - } - - if (test_result_state(TOI_ABORTED)) - goto write_image_header_abort_no_cleanup; - - toi_update_status(total, total, NULL); - -out: - if (header_buffer) - toi_free_page(24, (unsigned long) header_buffer); - return ret; - -write_image_header_abort: - toiActiveAllocator->write_header_cleanup(); -write_image_header_abort_no_cleanup: - ret = -1; - goto out; -} - -/** - * sanity_check - check the header - * @sh: the header which was saved at hibernate time. - * - * Perform a few checks, seeking to ensure that the kernel being - * booted matches the one hibernated. They need to match so we can - * be _sure_ things will work. It is not absolutely impossible for - * resuming from a different kernel to work, just not assured. 
- **/ -static char *sanity_check(struct toi_header *sh) -{ - char *reason = check_image_kernel((struct swsusp_info *) sh); - - if (reason) - return reason; - - if (!test_action_state(TOI_IGNORE_ROOTFS)) { - const struct super_block *sb; - list_for_each_entry(sb, &super_blocks, s_list) { - if ((!(sb->s_flags & MS_RDONLY)) && - (sb->s_type->fs_flags & FS_REQUIRES_DEV)) - return "Device backed fs has been mounted " - "rw prior to resume or initrd/ramfs " - "is mounted rw."; - } - } - - return NULL; -} - -static DECLARE_WAIT_QUEUE_HEAD(freeze_wait); - -#define FREEZE_IN_PROGRESS (~0) - -static int freeze_result; - -static void do_freeze(struct work_struct *dummy) -{ - freeze_result = freeze_processes(); - wake_up(&freeze_wait); - trap_non_toi_io = 1; -} - -static DECLARE_WORK(freeze_work, do_freeze); - -/** - * __read_pageset1 - test for the existence of an image and attempt to load it - * - * Returns: Int - * Zero if image found and pageset1 successfully loaded. - * Error if no image found or loaded. - **/ -static int __read_pageset1(void) -{ - int i, result = 0; - char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP), - *sanity_error = NULL; - struct toi_header *toi_header; - - if (!header_buffer) { - printk(KERN_INFO "Unable to allocate a page for reading the " - "signature.\n"); - return -ENOMEM; - } - - /* Check for an image */ - result = toiActiveAllocator->image_exists(1); - if (result == 3) { - result = -ENODATA; - toi_early_boot_message(1, 0, "The signature from an older " - "version of TuxOnIce has been detected."); - goto out_remove_image; - } - - if (result != 1) { - result = -ENODATA; - noresume_reset_modules(); - printk(KERN_INFO "TuxOnIce: No image found.\n"); - goto out; - } - - /* - * Prepare the active allocator for reading the image header. The - * activate allocator might read its own configuration. - * - * NB: This call may never return because there might be a signature - * for a different image such that we warn the user and they choose - * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the - * location of the image might be unavailable if it was stored on a - * network connection). - */ - - result = toiActiveAllocator->read_header_init(); - if (result) { - printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the " - "image header.\n"); - goto out_remove_image; - } - - /* Check for noresume command line option */ - if (test_toi_state(TOI_NORESUME_SPECIFIED)) { - printk(KERN_INFO "TuxOnIce: Noresume on command line. 
Removed " - "image.\n"); - goto out_remove_image; - } - - /* Check whether we've resumed before */ - if (test_toi_state(TOI_RESUMED_BEFORE)) { - toi_early_boot_message(1, 0, NULL); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Tried to resume before: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - clear_toi_state(TOI_CONTINUE_REQ); - - toi_image_header_version = toiActiveAllocator->get_header_version(); - - if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) { - toi_early_boot_message(1, 0, image_version_error); - if (!(test_toi_state(TOI_CONTINUE_REQ))) { - printk(KERN_INFO "TuxOnIce: Header version too new: " - "Invalidated image.\n"); - goto out_remove_image; - } - } - - /* Read hibernate header */ - result = toiActiveAllocator->rw_header_chunk(READ, NULL, - header_buffer, sizeof(struct toi_header)); - if (result < 0) { - printk(KERN_ERR "TuxOnIce: Failed to read the image " - "signature.\n"); - goto out_remove_image; - } - - toi_header = (struct toi_header *) header_buffer; - - /* - * NB: This call may also result in a reboot rather than returning. - */ - - sanity_error = sanity_check(toi_header); - if (sanity_error) { - toi_early_boot_message(1, TOI_CONTINUE_REQ, - sanity_error); - printk(KERN_INFO "TuxOnIce: Sanity check failed.\n"); - goto out_remove_image; - } - - /* - * We have an image and it looks like it will load okay. - * - * Get metadata from header. Don't override commandline parameters. - * - * We don't need to save the image size limit because it's not used - * during resume and will be restored with the image anyway. - */ - - memcpy((char *) &pagedir1, - (char *) &toi_header->pagedir, sizeof(pagedir1)); - toi_result = toi_header->param0; - if (!toi_bkd.toi_debug_state) { - toi_bkd.toi_action = - (toi_header->param1 & ~toi_bootflags_mask) | - (toi_bkd.toi_action & toi_bootflags_mask); - toi_bkd.toi_debug_state = toi_header->param2; - toi_bkd.toi_default_console_level = toi_header->param3; - } - clear_toi_state(TOI_IGNORE_LOGLEVEL); - pagedir2.size = toi_header->pageset_2_size; - for (i = 0; i < 4; i++) - toi_bkd.toi_io_time[i/2][i%2] = - toi_header->io_time[i/2][i%2]; - - set_toi_state(TOI_BOOT_KERNEL); - boot_kernel_data_buffer = toi_header->bkd; - - read_if_version(1, toi_max_workers, "TuxOnIce max workers", - goto out_remove_image); - - /* Read filesystem info */ - if (fs_info_load_and_check()) { - printk(KERN_EMERG "TuxOnIce: File system mount time checks " - "failed. Refusing to corrupt your filesystems!\n"); - goto out_remove_image; - } - - /* Read module configurations */ - result = read_module_configs(); - if (result) { - pagedir1.size = 0; - pagedir2.size = 0; - printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module " - "configurations.\n"); - clear_action_state(TOI_KEEP_IMAGE); - goto out_remove_image; - } - - toi_prepare_console(); - - set_toi_state(TOI_NOW_RESUMING); - - result = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (result) - goto out_notifier_call_chain;; - - if (usermodehelper_disable()) - goto out_enable_usermodehelper; - - current->flags |= PF_NOFREEZE; - freeze_result = FREEZE_IN_PROGRESS; - - schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work); - - toi_cond_pause(1, "About to read original pageset1 locations."); - - /* - * See _toi_rw_header_chunk in tuxonice_bio.c: - * Initialize pageset1_map by reading the map from the image. 
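__read_pageset1() recovers from failure through the label ladder visible in its tail (out_thaw, out_enable_usermodehelper, out_notifier_call_chain, out_remove_image), so that each exit path releases exactly what was acquired before the failure. The same goto-unwind pattern in a self-contained example; the file name is hypothetical:

#include <stdio.h>
#include <stdlib.h>

static int do_resume_like_sequence(void)
{
	char *header = malloc(4096);
	FILE *image = NULL;
	int result = -1;

	if (!header)
		return -1;

	image = fopen("/tmp/example-image", "rb");	/* hypothetical path */
	if (!image)
		goto out_free;

	if (fread(header, 4096, 1, image) != 1)
		goto out_close;

	result = 0;	/* everything after this point succeeded */

	/* Each label undoes one acquisition, in reverse order. */
out_close:
	fclose(image);
out_free:
	free(header);
	return result;
}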
- */
-	if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk))
-	goto out_thaw;
-
-	/*
- * See toi_rw_cleanup in tuxonice_bio.c:
- * Clean up after reading the header.
- */
-	result = toiActiveAllocator->read_header_cleanup();
-	if (result) {
-	printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the "
-	"image header.\n");
-	goto out_thaw;
-	}
-
-	toi_cond_pause(1, "About to read pagedir.");
-
-	/*
- * Get the addresses of pages into which we will load the kernel to
- * be copied back and check if they conflict with the ones we are using.
- */
-	if (toi_get_pageset1_load_addresses()) {
-	printk(KERN_INFO "TuxOnIce: Failed to get load addresses for "
-	"pageset1.\n");
-	goto out_thaw;
-	}
-
-	/* Read the original kernel back */
-	toi_cond_pause(1, "About to read pageset 1.");
-
-	/* Given the pagemap, read back the data from disk */
-	if (read_pageset(&pagedir1, 0)) {
-	toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1.");
-	result = -EIO;
-	goto out_thaw;
-	}
-
-	toi_cond_pause(1, "About to restore original kernel.");
-	result = 0;
-
-	if (!toi_keeping_image &&
-	toiActiveAllocator->mark_resume_attempted)
-	toiActiveAllocator->mark_resume_attempted(1);
-
-	wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
-out:
-	current->flags &= ~PF_NOFREEZE;
-	toi_free_page(25, (unsigned long) header_buffer);
-	return result;
-
-out_thaw:
-	wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
-	trap_non_toi_io = 0;
-	thaw_processes();
-out_enable_usermodehelper:
-	usermodehelper_enable();
-out_notifier_call_chain:
-	pm_notifier_call_chain(PM_POST_RESTORE);
-	toi_cleanup_console();
-out_remove_image:
-	result = -EINVAL;
-	if (!toi_keeping_image)
-	toiActiveAllocator->remove_image();
-	toiActiveAllocator->read_header_cleanup();
-	noresume_reset_modules();
-	goto out;
-}
-
-/**
- * read_pageset1 - highlevel function to read the saved pages
- *
- * Attempt to read the header and pageset1 of a hibernate image.
- * Handle the outcome, complaining where appropriate.
- **/
-int read_pageset1(void)
-{
-	int error;
-
-	error = __read_pageset1();
-
-	if (error && error != -ENODATA && error != -EINVAL &&
-	!test_result_state(TOI_ABORTED))
-	abort_hibernate(TOI_IMAGE_ERROR,
-	"TuxOnIce: Error %d resuming\n", error);
-
-	return error;
-}
-
-/**
- * get_have_image_data - check the image header
- **/
-static char *get_have_image_data(void)
-{
-	char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP);
-	struct toi_header *toi_header;
-
-	if (!output_buffer) {
-	printk(KERN_INFO "Output buffer null.\n");
-	return NULL;
-	}
-
-	/* Check for an image */
-	if (!toiActiveAllocator->image_exists(1) ||
-	toiActiveAllocator->read_header_init() ||
-	toiActiveAllocator->rw_header_chunk(READ, NULL,
-	output_buffer, sizeof(struct toi_header))) {
-	sprintf(output_buffer, "0\n");
-	/*
- * From an initrd/ramfs, catting have_image and
- * getting a result of 0 is sufficient.
- */
-	clear_toi_state(TOI_BOOT_TIME);
-	goto out;
-	}
-
-	toi_header = (struct toi_header *) output_buffer;
-
-	sprintf(output_buffer, "1\n%s\n%s\n",
-	toi_header->uts.machine,
-	toi_header->uts.version);
-
-	/* Check whether we've resumed before */
-	if (test_toi_state(TOI_RESUMED_BEFORE))
-	strcat(output_buffer, "Resumed before.\n");
-
-out:
-	noresume_reset_modules();
-	return output_buffer;
-}
-
-/**
- * read_pageset2 - read second part of the image
- * @overwrittenpagesonly: Read only pages which would have been
- * overwritten by pageset1?
- * - * Read in part or all of pageset2 of an image, depending upon - * whether we are hibernating and have only overwritten a portion - * with pageset1 pages, or are resuming and need to read them - * all. - * - * Returns: Int - * Zero if no error, otherwise the error value. - **/ -int read_pageset2(int overwrittenpagesonly) -{ - int result = 0; - - if (!pagedir2.size) - return 0; - - result = read_pageset(&pagedir2, overwrittenpagesonly); - - toi_cond_pause(1, "Pagedir 2 read."); - - return result; -} - -/** - * image_exists_read - has an image been found? - * @page: Output buffer - * - * Store 0 or 1 in page, depending on whether an image is found. - * Incoming buffer is PAGE_SIZE and result is guaranteed - * to be far less than that, so we don't worry about - * overflow. - **/ -int image_exists_read(const char *page, int count) -{ - int len = 0; - char *result; - - if (toi_activate_storage(0)) - return count; - - if (!test_toi_state(TOI_RESUME_DEVICE_OK)) - toi_attempt_to_parse_resume_device(0); - - if (!toiActiveAllocator) { - len = sprintf((char *) page, "-1\n"); - } else { - result = get_have_image_data(); - if (result) { - len = sprintf((char *) page, "%s", result); - toi_free_page(26, (unsigned long) result); - } - } - - toi_deactivate_storage(0); - - return len; -} - -/** - * image_exists_write - invalidate an image if one exists - **/ -int image_exists_write(const char *buffer, int count) -{ - if (toi_activate_storage(0)) - return count; - - if (toiActiveAllocator && toiActiveAllocator->image_exists(1)) - toiActiveAllocator->remove_image(); - - toi_deactivate_storage(0); - - clear_result_state(TOI_KEPT_IMAGE); - - return count; -} diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h deleted file mode 100644 index 683eab7a0..000000000 --- a/kernel/power/tuxonice_io.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * kernel/power/tuxonice_io.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains high level IO routines for hibernating. 
- * - */ - -#include <linux/utsname.h> -#include "tuxonice_pagedir.h" - -/* Non-module data saved in our image header */ -struct toi_header { - /* - * Mirror struct swsusp_info, but without - * the page aligned attribute - */ - struct new_utsname uts; - u32 version_code; - unsigned long num_physpages; - int cpus; - unsigned long image_pages; - unsigned long pages; - unsigned long size; - - /* Our own data */ - unsigned long orig_mem_free; - int page_size; - int pageset_2_size; - int param0; - int param1; - int param2; - int param3; - int progress0; - int progress1; - int progress2; - int progress3; - int io_time[2][2]; - struct pagedir pagedir; - dev_t root_fs; - unsigned long bkd; /* Boot kernel data locn */ -}; - -extern int write_pageset(struct pagedir *pagedir); -extern int write_image_header(void); -extern int read_pageset1(void); -extern int read_pageset2(int overwrittenpagesonly); - -extern int toi_attempt_to_parse_resume_device(int quiet); -extern void attempt_to_parse_resume_device2(void); -extern void attempt_to_parse_alt_resume_param(void); -int image_exists_read(const char *page, int count); -int image_exists_write(const char *buffer, int count); -extern void save_restore_alt_param(int replace, int quiet); -extern atomic_t toi_io_workers; - -/* Args to save_restore_alt_param */ -#define RESTORE 0 -#define SAVE 1 - -#define NOQUIET 0 -#define QUIET 1 - -extern wait_queue_head_t toi_io_queue_flusher; -extern int toi_bio_queue_flusher_should_finish; - -int fs_info_space_needed(void); - -extern int toi_max_workers; diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c deleted file mode 100644 index a203c8fb9..000000000 --- a/kernel/power/tuxonice_modules.c +++ /dev/null @@ -1,520 +0,0 @@ -/* - * kernel/power/tuxonice_modules.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - */ - -#include <linux/suspend.h> -#include <linux/module.h> -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_ui.h" - -LIST_HEAD(toi_filters); -LIST_HEAD(toiAllocators); - -LIST_HEAD(toi_modules); - -struct toi_module_ops *toiActiveAllocator; - -static int toi_num_filters; -int toiNumAllocators, toi_num_modules; - -/* - * toi_header_storage_for_modules - * - * Returns the amount of space needed to store configuration - * data needed by the modules prior to copying back the original - * kernel. We can exclude data for pageset2 because it will be - * available anyway once the kernel is copied back. 
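The accounting described here, and implemented in the function that follows, charges each enabled module its own storage plus one module header and one length word, then adds a single empty header as the terminator. In miniature, with hypothetical types in place of struct toi_module_ops and struct toi_module_header:

#include <stddef.h>

struct mod_hdr { char name[32]; int enabled, type, index; };

struct module {
	int enabled;
	int (*storage_needed)(void);	/* optional */
};

static long header_storage(const struct module *mods, size_t n)
{
	long bytes = 0;

	for (size_t i = 0; i < n; i++)
		if (mods[i].enabled && mods[i].storage_needed)
			bytes += mods[i].storage_needed()
				 + sizeof(struct mod_hdr) + sizeof(int);

	/* One more for the empty terminator. */
	return bytes + sizeof(struct mod_hdr);
}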
- */ -long toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - } - } - - /* One more for the empty terminator */ - return bytes + sizeof(struct toi_module_header); -} - -void print_toi_header_storage_for_modules(void) -{ - struct toi_module_ops *this_module; - int bytes = 0; - - printk(KERN_DEBUG "Header storage:\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || - (this_module->type == WRITER_MODULE && - toiActiveAllocator != this_module)) - continue; - if (this_module->storage_needed) { - int this = this_module->storage_needed() + - sizeof(struct toi_module_header) + - sizeof(int); - this_module->header_requested = this; - bytes += this; - printk(KERN_DEBUG "+ %16s : %-4d/%d.\n", - this_module->name, - this_module->header_used, this); - } - } - - printk(KERN_DEBUG "+ empty terminator : %zu.\n", - sizeof(struct toi_module_header)); - printk(KERN_DEBUG " ====\n"); - printk(KERN_DEBUG " %zu\n", - bytes + sizeof(struct toi_module_header)); -} - -/* - * toi_memory_for_modules - * - * Returns the amount of memory requested by modules for - * doing their work during the cycle. - */ - -long toi_memory_for_modules(int print_parts) -{ - long bytes = 0, result; - struct toi_module_ops *this_module; - - if (print_parts) - printk(KERN_INFO "Memory for modules:\n===================\n"); - list_for_each_entry(this_module, &toi_modules, module_list) { - int this; - if (!this_module->enabled) - continue; - if (this_module->memory_needed) { - this = this_module->memory_needed(); - if (print_parts) - printk(KERN_INFO "%10d bytes (%5ld pages) for " - "module '%s'.\n", this, - DIV_ROUND_UP(this, PAGE_SIZE), - this_module->name); - bytes += this; - } - } - - result = DIV_ROUND_UP(bytes, PAGE_SIZE); - if (print_parts) - printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result); - - return result; -} - -/* - * toi_expected_compression_ratio - * - * Returns the compression ratio expected when saving the image. 
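The body that follows composes per-module ratios multiplicatively. A self-contained illustration with made-up module ratios:

/*
 * Illustrative: two enabled filters reporting expected_compression()
 * of 60 and 90 compose to 100 * 60/100 * 90/100 = 54, i.e. the image
 * is expected to shrink to 54% of its raw size.
 */
#include <stdio.h>

int main(void)
{
        int ratio = 100;
        int expected[] = { 60, 90 };    /* hypothetical module ratios */
        unsigned i;

        for (i = 0; i < 2; i++)
                ratio = ratio * expected[i] / 100;
        printf("expected ratio: %d%%\n", ratio);        /* prints 54 */
        return 0;
}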
- */ - -int toi_expected_compression_ratio(void) -{ - int ratio = 100; - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->expected_compression) - ratio = ratio * this_module->expected_compression() - / 100; - } - - return ratio; -} - -/* toi_find_module_given_dir - * Functionality : Return a module (if found), given a pointer - * to its directory name - */ - -static struct toi_module_ops *toi_find_module_given_dir(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->directory)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* toi_find_module_given_name - * Functionality : Return a module (if found), given a pointer - * to its name - */ - -struct toi_module_ops *toi_find_module_given_name(char *name) -{ - struct toi_module_ops *this_module, *found_module = NULL; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!strcmp(name, this_module->name)) { - found_module = this_module; - break; - } - } - - return found_module; -} - -/* - * toi_print_module_debug_info - * Functionality : Get debugging info from modules into a buffer. - */ -int toi_print_module_debug_info(char *buffer, int buffer_size) -{ - struct toi_module_ops *this_module; - int len = 0; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled) - continue; - if (this_module->print_debug_info) { - int result; - result = this_module->print_debug_info(buffer + len, - buffer_size - len); - len += result; - } - } - - /* Ensure null terminated */ - buffer[buffer_size] = 0; - - return len; -} - -/* - * toi_register_module - * - * Register a module. - */ -int toi_register_module(struct toi_module_ops *module) -{ - int i; - struct kobject *kobj; - - if (!hibernation_available()) - return -ENODEV; - - module->enabled = 1; - - if (toi_find_module_given_name(module->name)) { - printk(KERN_INFO "TuxOnIce: Trying to load module %s," - " which is already registered.\n", - module->name); - return -EBUSY; - } - - switch (module->type) { - case FILTER_MODULE: - list_add_tail(&module->type_list, &toi_filters); - toi_num_filters++; - break; - case WRITER_MODULE: - list_add_tail(&module->type_list, &toiAllocators); - toiNumAllocators++; - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Hmmm. Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return -EINVAL; - } - list_add_tail(&module->module_list, &toi_modules); - toi_num_modules++; - - if ((!module->directory && !module->shared_directory) || - !module->sysfs_data || !module->num_sysfs_entries) - return 0; - - /* - * Modules may share a directory, but those with shared_dir - * set must be loaded (via symbol dependencies) after parents - * and unloaded beforehand. 
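To make the shared-directory rule concrete, a hypothetical registration follows; every name in it is illustrative, and the parent module owning the directory must already be registered, or the lookup in the code below fails with -ENODEV.

/*
 * Hypothetical filter sharing its parent's sysfs directory. All names
 * are assumptions for illustration.
 */
static struct toi_module_ops example_filter_ops = {
        .type             = FILTER_MODULE,
        .name             = "example_filter",
        .shared_directory = "compression",  /* the parent's ->directory */
        .module           = THIS_MODULE,
};

/* toi_register_module(&example_filter_ops) then hangs this module's
 * sysfs entries off the parent's kobject instead of creating its own
 * directory. */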
- */ - if (module->shared_directory) { - struct toi_module_ops *shared = - toi_find_module_given_dir(module->shared_directory); - if (!shared) { - printk(KERN_ERR "TuxOnIce: Module %s wants to share " - "%s's directory but %s isn't loaded.\n", - module->name, module->shared_directory, - module->shared_directory); - toi_unregister_module(module); - return -ENODEV; - } - kobj = shared->dir_kobj; - } else { - if (!strncmp(module->directory, "[ROOT]", 6)) - kobj = tuxonice_kobj; - else - kobj = make_toi_sysdir(module->directory); - } - module->dir_kobj = kobj; - for (i = 0; i < module->num_sysfs_entries; i++) { - int result = toi_register_sysfs_file(kobj, - &module->sysfs_data[i]); - if (result) - return result; - } - return 0; -} - -/* - * toi_unregister_module - * - * Remove a module. - */ -void toi_unregister_module(struct toi_module_ops *module) -{ - int i; - - if (module->dir_kobj) - for (i = 0; i < module->num_sysfs_entries; i++) - toi_unregister_sysfs_file(module->dir_kobj, - &module->sysfs_data[i]); - - if (!module->shared_directory && module->directory && - strncmp(module->directory, "[ROOT]", 6)) - remove_toi_sysdir(module->dir_kobj); - - switch (module->type) { - case FILTER_MODULE: - list_del(&module->type_list); - toi_num_filters--; - break; - case WRITER_MODULE: - list_del(&module->type_list); - toiNumAllocators--; - if (toiActiveAllocator == module) { - toiActiveAllocator = NULL; - clear_toi_state(TOI_CAN_RESUME); - clear_toi_state(TOI_CAN_HIBERNATE); - } - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - list_del(&module->module_list); - toi_num_modules--; -} - -/* - * toi_move_module_tail - * - * Rearrange modules when reloading the config. - */ -void toi_move_module_tail(struct toi_module_ops *module) -{ - switch (module->type) { - case FILTER_MODULE: - if (toi_num_filters > 1) - list_move_tail(&module->type_list, &toi_filters); - break; - case WRITER_MODULE: - if (toiNumAllocators > 1) - list_move_tail(&module->type_list, &toiAllocators); - break; - case MISC_MODULE: - case MISC_HIDDEN_MODULE: - case BIO_ALLOCATOR_MODULE: - break; - default: - printk(KERN_ERR "Module '%s' has an invalid type." - " It has been ignored.\n", module->name); - return; - } - if ((toi_num_filters + toiNumAllocators) > 1) - list_move_tail(&module->module_list, &toi_modules); -} - -/* - * toi_initialise_modules - * - * Get ready to do some work! - */ -int toi_initialise_modules(int starting_cycle, int early) -{ - struct toi_module_ops *this_module; - int result; - - list_for_each_entry(this_module, &toi_modules, module_list) { - this_module->header_requested = 0; - this_module->header_used = 0; - if (!this_module->enabled) - continue; - if (this_module->early != early) - continue; - if (this_module->initialise) { - result = this_module->initialise(starting_cycle); - if (result) { - toi_cleanup_modules(starting_cycle); - return result; - } - this_module->initialised = 1; - } - } - - return 0; -} - -/* - * toi_cleanup_modules - * - * Tell modules the work is done. 
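The initialise/cleanup hooks above are driven in two phases (early, then late). A rough sketch of a caller, using the toi_initialise_modules_early/late convenience macros declared later in tuxonice_modules.h; the wrapper function is hypothetical and the cycle work is elided:

/* Hypothetical cycle driver; a failed initialise pass cleans up after
 * itself (see the body above), so the caller only unwinds on success. */
static int example_run_cycle(void)
{
        int ret = toi_initialise_modules_early(1);

        if (!ret)
                ret = toi_initialise_modules_late(1);
        if (ret)
                return ret;

        /* ... the hibernation cycle's work would happen here ... */

        toi_cleanup_modules(1);
        return 0;
}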
- */ -void toi_cleanup_modules(int finishing_cycle) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (!this_module->enabled || !this_module->initialised) - continue; - if (this_module->cleanup) - this_module->cleanup(finishing_cycle); - this_module->initialised = 0; - } -} - -/* - * toi_pre_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->pre_atomic_restore) - this_module->pre_atomic_restore(bkd); - } -} - -/* - * toi_post_atomic_restore_modules - * - * Get ready to do some work! - */ -void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->enabled && this_module->post_atomic_restore) - this_module->post_atomic_restore(bkd); - } -} - -/* - * toi_get_next_filter - * - * Get the next filter in the pipeline. - */ -struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought) -{ - struct toi_module_ops *last_filter = NULL, *this_filter = NULL; - - list_for_each_entry(this_filter, &toi_filters, type_list) { - if (!this_filter->enabled) - continue; - if ((last_filter == filter_sought) || (!filter_sought)) - return this_filter; - last_filter = this_filter; - } - - return toiActiveAllocator; -} - -/** - * toi_show_modules: Printk what support is loaded. - */ -void toi_print_modules(void) -{ - struct toi_module_ops *this_module; - int prev = 0; - - printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for"); - - list_for_each_entry(this_module, &toi_modules, module_list) { - if (this_module->type == MISC_HIDDEN_MODULE) - continue; - printk("%s %s%s%s", prev ? "," : "", - this_module->enabled ? "" : "[", - this_module->name, - this_module->enabled ? "" : "]"); - prev = 1; - } - - printk(".\n"); -} - -/* toi_get_modules - * - * Take a reference to modules so they can't go away under us. - */ - -int toi_get_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) { - struct toi_module_ops *this_module2; - - if (try_module_get(this_module->module)) - continue; - - /* Failed! Reverse gets and return error */ - list_for_each_entry(this_module2, &toi_modules, - module_list) { - if (this_module == this_module2) - return -EINVAL; - module_put(this_module2->module); - } - } - return 0; -} - -/* toi_put_modules - * - * Release our references to modules we used. - */ - -void toi_put_modules(void) -{ - struct toi_module_ops *this_module; - - list_for_each_entry(this_module, &toi_modules, module_list) - module_put(this_module->module); -} diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h deleted file mode 100644 index 44f10abb9..000000000 --- a/kernel/power/tuxonice_modules.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * kernel/power/tuxonice_modules.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * It contains declarations for modules. Modules are additions to - * TuxOnIce that provide facilities such as image compression or - * encryption, backends for storage of the image and user interfaces. 
- * - */ - -#ifndef TOI_MODULES_H -#define TOI_MODULES_H - -/* This is the maximum size we store in the image header for a module name */ -#define TOI_MAX_MODULE_NAME_LENGTH 30 - -struct toi_boot_kernel_data; - -/* Per-module metadata */ -struct toi_module_header { - char name[TOI_MAX_MODULE_NAME_LENGTH]; - int enabled; - int type; - int index; - int data_length; - unsigned long signature; -}; - -enum { - FILTER_MODULE, - WRITER_MODULE, - BIO_ALLOCATOR_MODULE, - MISC_MODULE, - MISC_HIDDEN_MODULE, -}; - -enum { - TOI_ASYNC, - TOI_SYNC -}; - -enum { - TOI_VIRT, - TOI_PAGE, -}; - -#define TOI_MAP(type, addr) \ - (type == TOI_PAGE ? kmap(addr) : addr) - -#define TOI_UNMAP(type, addr) \ - do { \ - if (type == TOI_PAGE) \ - kunmap(addr); \ - } while(0) - -struct toi_module_ops { - /* Functions common to all modules */ - int type; - char *name; - char *directory; - char *shared_directory; - struct kobject *dir_kobj; - struct module *module; - int enabled, early, initialised; - struct list_head module_list; - - /* List of filters or allocators */ - struct list_head list, type_list; - - /* - * Requirements for memory and storage in - * the image header.. - */ - int (*memory_needed) (void); - int (*storage_needed) (void); - - int header_requested, header_used; - - int (*expected_compression) (void); - - /* - * Debug info - */ - int (*print_debug_info) (char *buffer, int size); - int (*save_config_info) (char *buffer); - void (*load_config_info) (char *buffer, int len); - - /* - * Initialise & cleanup - general routines called - * at the start and end of a cycle. - */ - int (*initialise) (int starting_cycle); - void (*cleanup) (int finishing_cycle); - - void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd); - void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd); - - /* - * Calls for allocating storage (allocators only). - * - * Header space is requested separately and cannot fail, but the - * reservation is only applied when main storage is allocated. - * The header space reservation is thus always set prior to - * requesting the allocation of storage - and prior to querying - * how much storage is available. - */ - - unsigned long (*storage_available) (void); - void (*reserve_header_space) (unsigned long space_requested); - int (*register_storage) (void); - int (*allocate_storage) (unsigned long space_requested); - unsigned long (*storage_allocated) (void); - void (*free_unused_storage) (void); - - /* - * Routines used in image I/O. 
- */ - int (*rw_init) (int rw, int stream_number); - int (*rw_cleanup) (int rw); - int (*write_page) (unsigned long index, int buf_type, void *buf, - unsigned int buf_size); - int (*read_page) (unsigned long *index, int buf_type, void *buf, - unsigned int *buf_size); - int (*io_flusher) (int rw); - - /* Reset module if image exists but reading aborted */ - void (*noresume_reset) (void); - - /* Read and write the metadata */ - int (*write_header_init) (void); - int (*write_header_cleanup) (void); - - int (*read_header_init) (void); - int (*read_header_cleanup) (void); - - /* To be called after read_header_init */ - int (*get_header_version) (void); - - int (*rw_header_chunk) (int rw, struct toi_module_ops *owner, - char *buffer_start, int buffer_size); - - int (*rw_header_chunk_noreadahead) (int rw, - struct toi_module_ops *owner, char *buffer_start, - int buffer_size); - - /* Attempt to parse an image location */ - int (*parse_sig_location) (char *buffer, int only_writer, int quiet); - - /* Throttle I/O according to throughput */ - void (*update_throughput_throttle) (int jif_index); - - /* Flush outstanding I/O */ - int (*finish_all_io) (void); - - /* Determine whether image exists that we can restore */ - int (*image_exists) (int quiet); - - /* Mark the image as having tried to resume */ - int (*mark_resume_attempted) (int); - - /* Destroy image if one exists */ - int (*remove_image) (void); - - /* Sysfs Data */ - struct toi_sysfs_data *sysfs_data; - int num_sysfs_entries; - - /* Block I/O allocator */ - struct toi_bio_allocator_ops *bio_allocator_ops; -}; - -extern int toi_num_modules, toiNumAllocators; - -extern struct toi_module_ops *toiActiveAllocator; -extern struct list_head toi_filters, toiAllocators, toi_modules; - -extern void toi_prepare_console_modules(void); -extern void toi_cleanup_console_modules(void); - -extern struct toi_module_ops *toi_find_module_given_name(char *name); -extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *); - -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_move_module_tail(struct toi_module_ops *module); - -extern long toi_header_storage_for_modules(void); -extern long toi_memory_for_modules(int print_parts); -extern void print_toi_header_storage_for_modules(void); -extern int toi_expected_compression_ratio(void); - -extern int toi_print_module_debug_info(char *buffer, int buffer_size); -extern int toi_register_module(struct toi_module_ops *module); -extern void toi_unregister_module(struct toi_module_ops *module); - -extern int toi_initialise_modules(int starting_cycle, int early); -#define toi_initialise_modules_early(starting) \ - toi_initialise_modules(starting, 1) -#define toi_initialise_modules_late(starting) \ - toi_initialise_modules(starting, 0) -extern void toi_cleanup_modules(int finishing_cycle); - -extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd); -extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd); - -extern void toi_print_modules(void); - -int toi_get_modules(void); -void toi_put_modules(void); -#endif diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c deleted file mode 100644 index 78bd31b05..000000000 --- a/kernel/power/tuxonice_netlink.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
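A rough sketch of how an allocator's I/O hooks are meant to be driven, based on the ops table above; the wrapper function, its arguments, and the stream number are hypothetical, and error handling for write_page() is elided.

/*
 * Hypothetical caller showing the expected writer call order. READ/WRITE
 * are the usual kernel constants; TOI_VIRT comes from the enum above.
 */
static int example_write_stream(struct toi_module_ops *ops,
                                unsigned long *indexes, void **bufs, int n)
{
        int i, ret;

        ret = ops->rw_init(WRITE, 1);   /* illustrative stream number */
        if (ret)
                return ret;
        for (i = 0; i < n; i++)
                ops->write_page(indexes[i], TOI_VIRT, bufs[i], PAGE_SIZE);
        return ops->rw_cleanup(WRITE);
}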
- * - * Functions for communicating with a userspace helper via netlink. - */ - -#include <linux/suspend.h> -#include <linux/sched.h> -#include <linux/kmod.h> -#include "tuxonice_netlink.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct user_helper_data *uhd_list; - -/* - * Refill our pool of SKBs for use in emergencies (eg, when eating memory and - * none can be allocated). - */ -static void toi_fill_skb_pool(struct user_helper_data *uhd) -{ - while (uhd->pool_level < uhd->pool_limit) { - struct sk_buff *new_skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (!new_skb) - break; - - new_skb->next = uhd->emerg_skbs; - uhd->emerg_skbs = new_skb; - uhd->pool_level++; - } -} - -/* - * Try to allocate a single skb. If we can't get one, try to use one from - * our pool. - */ -static struct sk_buff *toi_get_skb(struct user_helper_data *uhd) -{ - struct sk_buff *skb = - alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); - - if (skb) - return skb; - - skb = uhd->emerg_skbs; - if (skb) { - uhd->pool_level--; - uhd->emerg_skbs = skb->next; - skb->next = NULL; - } - - return skb; -} - -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - void *dest; - struct task_struct *t; - - if (uhd->pid == -1) - return; - - if (uhd->debug) - printk(KERN_ERR "toi_send_netlink_message: Send " - "message type %d.\n", type); - - skb = toi_get_skb(uhd); - if (!skb) { - printk(KERN_INFO "toi_netlink: Can't allocate skb!\n"); - return; - } - - nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0); - uhd->sock_seq++; - - dest = NLMSG_DATA(nlh); - if (params && len > 0) - memcpy(dest, params, len); - - netlink_unicast(uhd->nl, skb, uhd->pid, 0); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - if (uhd->pid > -1) - printk(KERN_INFO "Hmm. Can't find the userspace task" - " %d.\n", uhd->pid); - return; - } - wake_up_process(t); - toi_read_unlock_tasklist(); - - yield(); -} - -static void send_whether_debugging(struct user_helper_data *uhd) -{ - static u8 is_debugging = 1; - - toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, - &is_debugging, sizeof(u8)); -} - -/* - * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we - * are hibernating. - */ -static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid) -{ - struct task_struct *t; - - if (uhd->debug) - printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid); - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(pid, &init_pid_ns); - if (!t) { - toi_read_unlock_tasklist(); - printk(KERN_INFO "Strange. Can't find the userspace task %d.\n", - pid); - return -EINVAL; - } - - t->flags |= PF_NOFREEZE; - - toi_read_unlock_tasklist(); - uhd->pid = pid; - - toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); - - return 0; -} - -/* - * Called when the userspace process has informed us that it's ready to roll. - */ -static int nl_ready(struct user_helper_data *uhd, u32 version) -{ - if (version != uhd->interface_version) { - printk(KERN_INFO "%s userspace process using invalid interface" - " version (%d - kernel wants %d). 
Trying to " - "continue without it.\n", - uhd->name, version, uhd->interface_version); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - - complete(&uhd->wait_for_process); - - return 0; -} - -void toi_netlink_close_complete(struct user_helper_data *uhd) -{ - if (uhd->nl) { - netlink_kernel_release(uhd->nl); - uhd->nl = NULL; - } - - while (uhd->emerg_skbs) { - struct sk_buff *next = uhd->emerg_skbs->next; - kfree_skb(uhd->emerg_skbs); - uhd->emerg_skbs = next; - } - - uhd->pid = -1; -} - -static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd, - struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type = nlh->nlmsg_type; - int *data; - int err; - - if (uhd->debug) - printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n", - type); - - /* Let the more specific handler go first. It returns - * 1 for valid messages that it doesn't know. */ - err = uhd->rcv_msg(skb, nlh); - if (err != 1) - return err; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { - printk(KERN_INFO "Received extra nofreeze me requests.\n"); - return -EBUSY; - } - - data = NLMSG_DATA(nlh); - - switch (type) { - case NETLINK_MSG_NOFREEZE_ME: - return nl_set_nofreeze(uhd, nlh->nlmsg_pid); - case NETLINK_MSG_GET_DEBUGGING: - send_whether_debugging(uhd); - return 0; - case NETLINK_MSG_READY: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) { - printk(KERN_INFO "Invalid ready mesage.\n"); - if (uhd->not_ready) - uhd->not_ready(); - return -EINVAL; - } - return nl_ready(uhd, (u32) *data); - case NETLINK_MSG_CLEANUP: - toi_netlink_close_complete(uhd); - return 0; - } - - return -EINVAL; -} - -static void toi_user_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - struct user_helper_data *uhd = uhd_list; - - while (uhd && uhd->netlink_id != skb->sk->sk_protocol) - uhd = uhd->next; - - if (!uhd) - return; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *) skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - - err = toi_nl_gen_rcv_msg(uhd, skb, nlh); - if (err) - netlink_ack(skb, nlh, err); - else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } -} - -static int netlink_prepare(struct user_helper_data *uhd) -{ - struct netlink_kernel_cfg cfg = { - .groups = 0, - .input = toi_user_rcv_skb, - }; - - uhd->next = uhd_list; - uhd_list = uhd; - - uhd->sock_seq = 0x42c0ffee; - uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg); - if (!uhd->nl) { - printk(KERN_INFO "Failed to allocate netlink socket for %s.\n", - uhd->name); - return -ENOMEM; - } - - toi_fill_skb_pool(uhd); - - return 0; -} - -void toi_netlink_close(struct user_helper_data *uhd) -{ - struct task_struct *t; - - toi_read_lock_tasklist(); - t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); - if (t) - t->flags &= ~PF_NOFREEZE; - toi_read_unlock_tasklist(); - - toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0); -} -int toi_netlink_setup(struct user_helper_data *uhd) -{ - /* In case userui didn't cleanup properly on us */ - toi_netlink_close_complete(uhd); - - if (netlink_prepare(uhd) < 0) { - printk(KERN_INFO "Netlink prepare failed.\n"); - return 1; - } - - if (toi_launch_userspace_program(uhd->program, uhd->netlink_id, - UMH_WAIT_EXEC, uhd->debug) < 0) { - printk(KERN_INFO "Launch userspace program failed.\n"); - toi_netlink_close_complete(uhd); - return 1; 
- } - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); - - if (uhd->pid == -1) { - printk(KERN_INFO "%s: Failed to contact userspace process.\n", - uhd->name); - toi_netlink_close_complete(uhd); - return 1; - } - - return 0; -} diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h deleted file mode 100644 index 6613c8eaa..000000000 --- a/kernel/power/tuxonice_netlink.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * kernel/power/tuxonice_netlink.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for functions for communicating with a userspace helper - * via netlink. - */ - -#include <linux/netlink.h> -#include <net/sock.h> - -#define NETLINK_MSG_BASE 0x10 - -#define NETLINK_MSG_READY 0x10 -#define NETLINK_MSG_NOFREEZE_ME 0x16 -#define NETLINK_MSG_GET_DEBUGGING 0x19 -#define NETLINK_MSG_CLEANUP 0x24 -#define NETLINK_MSG_NOFREEZE_ACK 0x27 -#define NETLINK_MSG_IS_DEBUGGING 0x28 - -struct user_helper_data { - int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); - void (*not_ready) (void); - struct sock *nl; - u32 sock_seq; - pid_t pid; - char *comm; - char program[256]; - int pool_level; - int pool_limit; - struct sk_buff *emerg_skbs; - int skb_size; - int netlink_id; - char *name; - struct user_helper_data *next; - struct completion wait_for_process; - u32 interface_version; - int must_init; - int debug; -}; - -#ifdef CONFIG_NET -int toi_netlink_setup(struct user_helper_data *uhd); -void toi_netlink_close(struct user_helper_data *uhd); -void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len); -void toi_netlink_close_complete(struct user_helper_data *uhd); -#else -static inline int toi_netlink_setup(struct user_helper_data *uhd) -{ - return 0; -} - -static inline void toi_netlink_close(struct user_helper_data *uhd) { }; -static inline void toi_send_netlink_message(struct user_helper_data *uhd, - int type, void *params, size_t len) { }; -static inline void toi_netlink_close_complete(struct user_helper_data *uhd) - { }; -#endif diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c deleted file mode 100644 index d469f3d2d..000000000 --- a/kernel/power/tuxonice_pagedir.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for handling pagesets. - * Note that pbes aren't actually stored as such. They're stored as - * bitmaps and extents. 
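Tying the helper interface above together: a hypothetical kernel-side user_helper_data showing what toi_netlink_setup() expects to be fed. Every value here is an assumption for illustration.

/*
 * Hypothetical helper description. rcv_msg must return 1 for message
 * types it does not handle, so the generic handler above can take over.
 */
static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        return 1;       /* nothing module-specific to handle */
}

static struct user_helper_data example_uhd = {
        .name              = "example",
        .program           = "/usr/sbin/example-helper", /* illustrative path */
        .netlink_id        = 29,        /* illustrative protocol number */
        .pool_limit        = 6,
        .skb_size          = 4096,
        .interface_version = 1,
        .rcv_msg           = example_rcv_msg,
};

/* toi_netlink_setup(&example_uhd) launches the program and waits up to
 * two seconds for a NETLINK_MSG_READY carrying interface_version. */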
- */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/hardirq.h> -#include <linux/sched.h> -#include <linux/cpu.h> -#include <asm/tlbflush.h> - -#include "tuxonice_pageflags.h" -#include "tuxonice_ui.h" -#include "tuxonice_pagedir.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_builtin.h" -#include "tuxonice_alloc.h" - -static int ptoi_pfn; -static struct pbe *this_low_pbe; -static struct pbe **last_low_pbe_ptr; - -void toi_reset_alt_image_pageset2_pfn(void) -{ - memory_bm_position_reset(pageset2_map); -} - -static struct page *first_conflicting_page; - -/* - * free_conflicting_pages - */ - -static void free_conflicting_pages(void) -{ - while (first_conflicting_page) { - struct page *next = - *((struct page **) kmap(first_conflicting_page)); - kunmap(first_conflicting_page); - toi__free_page(29, first_conflicting_page); - first_conflicting_page = next; - } -} - -/* __toi_get_nonconflicting_page - * - * Description: Gets order zero pages that won't be overwritten - * while copying the original pages. - */ - -struct page *___toi_get_nonconflicting_page(int can_be_highmem) -{ - struct page *page; - gfp_t flags = TOI_ATOMIC_GFP; - if (can_be_highmem) - flags |= __GFP_HIGHMEM; - - - if (test_toi_state(TOI_LOADING_ALT_IMAGE) && - pageset2_map && ptoi_pfn) { - do { - ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0); - if (ptoi_pfn != BM_END_OF_MAP) { - page = pfn_to_page(ptoi_pfn); - if (!PagePageset1(page) && - (can_be_highmem || !PageHighMem(page))) - return page; - } - } while (ptoi_pfn); - } - - do { - page = toi_alloc_page(29, flags | __GFP_ZERO); - if (!page) { - printk(KERN_INFO "Failed to get nonconflicting " - "page.\n"); - return NULL; - } - if (PagePageset1(page)) { - struct page **next = (struct page **) kmap(page); - *next = first_conflicting_page; - first_conflicting_page = page; - kunmap(page); - } - } while (PagePageset1(page)); - - return page; -} - -unsigned long __toi_get_nonconflicting_page(void) -{ - struct page *page = ___toi_get_nonconflicting_page(0); - return page ? (unsigned long) page_address(page) : 0; -} - -static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe, - int highmem) -{ - if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1)) - + 2 * sizeof(struct pbe)) > PAGE_SIZE) { - struct page *new_page = - ___toi_get_nonconflicting_page(highmem); - if (!new_page) - return ERR_PTR(-ENOMEM); - this_pbe = (struct pbe *) kmap(new_page); - memset(this_pbe, 0, PAGE_SIZE); - *page_ptr = new_page; - } else - this_pbe++; - - return this_pbe; -} - -/** - * get_pageset1_load_addresses - generate pbes for conflicting pages - * - * We check here that pagedir & pages it points to won't collide - * with pages where we're going to restore from the loaded pages - * later. - * - * Returns: - * Zero on success, one if couldn't find enough pages (shouldn't - * happen). 
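One detail of the pbe allocation path above rewards a worked example: get_next_pbe() starts a fresh page whenever fewer than two pbe slots remain before the page boundary. A userspace sketch of that check, assuming an illustrative 24-byte pbe layout:

#include <stdio.h>

#define PAGE_SZ 4096UL

struct pbe_stub {       /* illustrative stand-in, 24 bytes on LP64 */
        void *address, *orig_address;
        struct pbe_stub *next;
};

int main(void)
{
        /* offset of the last slot that still fits within the page */
        unsigned long off = PAGE_SZ - sizeof(struct pbe_stub);
        int need_new_page =
                (off & (PAGE_SZ - 1)) + 2 * sizeof(struct pbe_stub) > PAGE_SZ;

        printf("offset %lu -> fresh pbe page needed: %d\n",
               off, need_new_page);     /* prints 1 */
        return 0;
}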
- **/ -int toi_get_pageset1_load_addresses(void) -{ - int pfn, highallocd = 0, lowallocd = 0; - int low_needed = pagedir1.size - get_highmem_size(pagedir1); - int high_needed = get_highmem_size(pagedir1); - int low_pages_for_highmem = 0; - gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM; - struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL, - *low_pbe_page, *last_low_pbe_page = NULL; - struct pbe **last_high_pbe_ptr = &restore_highmem_pblist, - *this_high_pbe = NULL; - unsigned long orig_low_pfn, orig_high_pfn; - int high_pbes_done = 0, low_pbes_done = 0; - int low_direct = 0, high_direct = 0, result = 0, i; - int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0; - - toi_trace_index++; - - memory_bm_position_reset(pageset1_map); - memory_bm_position_reset(pageset1_copy_map); - - last_low_pbe_ptr = &restore_pblist; - - /* First, allocate pages for the start of our pbe lists. */ - if (high_needed) { - high_pbe_page = ___toi_get_nonconflicting_page(1); - if (!high_pbe_page) { - result = -ENOMEM; - goto out; - } - this_high_pbe = (struct pbe *) kmap(high_pbe_page); - memset(this_high_pbe, 0, PAGE_SIZE); - } - - low_pbe_page = ___toi_get_nonconflicting_page(0); - if (!low_pbe_page) { - result = -ENOMEM; - goto out; - } - this_low_pbe = (struct pbe *) page_address(low_pbe_page); - - /* - * Next, allocate the number of pages we need. - */ - - i = low_needed + high_needed; - - do { - int is_high; - - if (i == low_needed) - flags &= ~__GFP_HIGHMEM; - - page = toi_alloc_page(30, flags); - BUG_ON(!page); - - SetPagePageset1Copy(page); - is_high = PageHighMem(page); - - if (PagePageset1(page)) { - if (is_high) - high_direct++; - else - low_direct++; - } else { - if (is_high) - highallocd++; - else - lowallocd++; - } - } while (--i); - - high_needed -= high_direct; - low_needed -= low_direct; - - /* - * Do we need to use some lowmem pages for the copies of highmem - * pages? - */ - if (high_needed > highallocd) { - low_pages_for_highmem = high_needed - highallocd; - high_needed -= low_pages_for_highmem; - low_needed += low_pages_for_highmem; - } - - /* - * Now generate our pbes (which will be used for the atomic restore), - * and free unneeded pages. - */ - memory_bm_position_reset(pageset1_copy_map); - for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) { - int is_high; - page = pfn_to_page(pfn); - is_high = PageHighMem(page); - - if (PagePageset1(page)) - continue; - - /* Nope. We're going to use this page. Add a pbe. 
*/ - if (is_high || low_pages_for_highmem) { - struct page *orig_page; - high_pbes_done++; - if (!is_high) - low_pages_for_highmem--; - do { - orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_high_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_high_pfn); - } while (!PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_high_pbe->orig_address = (void *) orig_high_pfn; - this_high_pbe->address = page; - this_high_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p", - high_page, high_offset, page, orig_high_pfn, orig_page); - if (last_high_pbe_page != high_pbe_page) { - *last_high_pbe_ptr = - (struct pbe *) high_pbe_page; - if (last_high_pbe_page) { - kunmap(last_high_pbe_page); - high_page++; - high_offset = 0; - } else - high_offset++; - last_high_pbe_page = high_pbe_page; - } else { - *last_high_pbe_ptr = this_high_pbe; - high_offset++; - } - last_high_pbe_ptr = &this_high_pbe->next; - this_high_pbe = get_next_pbe(&high_pbe_page, - this_high_pbe, 1); - if (IS_ERR(this_high_pbe)) { - printk(KERN_INFO - "This high pbe is an error.\n"); - return -ENOMEM; - } - } else { - struct page *orig_page; - low_pbes_done++; - do { - orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0); - BUG_ON(orig_low_pfn == BM_END_OF_MAP); - orig_page = pfn_to_page(orig_low_pfn); - } while (PageHighMem(orig_page) || - PagePageset1Copy(orig_page)); - - this_low_pbe->orig_address = page_address(orig_page); - this_low_pbe->address = page_address(page); - this_low_pbe->next = NULL; - toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p", - low_page, low_offset, this_low_pbe->orig_address, - orig_low_pfn, this_low_pbe->address); - TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address); - *last_low_pbe_ptr = this_low_pbe; - last_low_pbe_ptr = &this_low_pbe->next; - this_low_pbe = get_next_pbe(&low_pbe_page, - this_low_pbe, 0); - if (low_pbe_page != last_low_pbe_page) { - if (last_low_pbe_page) { - low_page++; - low_offset = 0; - } else { - low_offset++; - } - last_low_pbe_page = low_pbe_page; - } else - low_offset++; - if (IS_ERR(this_low_pbe)) { - printk(KERN_INFO "this_low_pbe is an error.\n"); - return -ENOMEM; - } - } - } - - if (high_pbe_page) - kunmap(high_pbe_page); - - if (last_high_pbe_page != high_pbe_page) { - if (last_high_pbe_page) - kunmap(last_high_pbe_page); - toi__free_page(29, high_pbe_page); - } - - free_conflicting_pages(); - -out: - return result; -} - -int add_boot_kernel_data_pbe(void) -{ - this_low_pbe->address = (char *) __toi_get_nonconflicting_page(); - if (!this_low_pbe->address) { - printk(KERN_INFO "Failed to get bkd atomic restore buffer."); - return -ENOMEM; - } - - toi_bkd.size = sizeof(toi_bkd); - memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd)); - - *last_low_pbe_ptr = this_low_pbe; - this_low_pbe->orig_address = (char *) boot_kernel_data_buffer; - this_low_pbe->next = NULL; - return 0; -} diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h deleted file mode 100644 index 046535918..000000000 --- a/kernel/power/tuxonice_pagedir.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * kernel/power/tuxonice_pagedir.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Declarations for routines for handling pagesets. 
- */ - -#ifndef KERNEL_POWER_PAGEDIR_H -#define KERNEL_POWER_PAGEDIR_H - -/* Pagedir - * - * Contains the metadata for a set of pages saved in the image. - */ - -struct pagedir { - int id; - unsigned long size; -#ifdef CONFIG_HIGHMEM - unsigned long size_high; -#endif -}; - -#ifdef CONFIG_HIGHMEM -#define get_highmem_size(pagedir) (pagedir.size_high) -#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0) -#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0) -#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high) -#else -#define get_highmem_size(pagedir) (0) -#define set_highmem_size(pagedir, sz) do { } while (0) -#define inc_highmem_size(pagedir) do { } while (0) -#define get_lowmem_size(pagedir) (pagedir.size) -#endif - -extern struct pagedir pagedir1, pagedir2; - -extern void toi_copy_pageset1(void); - -extern int toi_get_pageset1_load_addresses(void); - -extern unsigned long __toi_get_nonconflicting_page(void); -struct page *___toi_get_nonconflicting_page(int can_be_highmem); - -extern void toi_reset_alt_image_pageset2_pfn(void); -extern int add_boot_kernel_data_pbe(void); -#endif diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c deleted file mode 100644 index 0fe92edd7..000000000 --- a/kernel/power/tuxonice_pageflags.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for serialising and relocating pageflags in which we - * store our image metadata. - */ - -#include "tuxonice_pageflags.h" -#include "power.h" - -int toi_pageflags_space_needed(void) -{ - return memory_bm_space_needed(pageset1_map); -} diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h deleted file mode 100644 index ddeeaf1e7..000000000 --- a/kernel/power/tuxonice_pageflags.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * kernel/power/tuxonice_pageflags.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
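A small self-contained illustration of the CONFIG_HIGHMEM accessors above, using stand-in definitions and made-up sizes:

#include <stdio.h>

struct pagedir_stub { int id; unsigned long size, size_high; };

/* local copies of the accessor arithmetic, for illustration */
#define get_highmem_size(pd)    ((pd).size_high)
#define get_lowmem_size(pd)     ((pd).size - (pd).size_high)

int main(void)
{
        struct pagedir_stub pd = { 1, 1000, 200 }; /* illustrative sizes */

        printf("high %lu, low %lu\n",
               get_highmem_size(pd), get_lowmem_size(pd)); /* 200, 800 */
        return 0;
}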
- */ - -#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H -#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H - -struct memory_bitmap; -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -void memory_bm_clear(struct memory_bitmap *bm); - -int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); -unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index); -void memory_bm_position_reset(struct memory_bitmap *bm); -void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); -int toi_alloc_bitmap(struct memory_bitmap **bm); -void toi_free_bitmap(struct memory_bitmap **bm); -void memory_bm_clear(struct memory_bitmap *bm); -void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn); -int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); -void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); - -struct toi_module_ops; -int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) - (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); -int memory_bm_space_needed(struct memory_bitmap *bm); - -extern struct memory_bitmap *pageset1_map; -extern struct memory_bitmap *pageset1_copy_map; -extern struct memory_bitmap *pageset2_map; -extern struct memory_bitmap *page_resave_map; -extern struct memory_bitmap *io_map; -extern struct memory_bitmap *nosave_map; -extern struct memory_bitmap *free_map; -extern struct memory_bitmap *compare_map; - -#define PagePageset1(page) \ - (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1(page) \ - (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1(page) \ - (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset1Copy(page) \ - (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset1Copy(page) \ - (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset1Copy(page) \ - (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) - -#define PagePageset2(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPagePageset2(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPagePageset2(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageWasRW(page) \ - (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define SetPageWasRW(page) \ - (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageWasRW(page) \ - (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) - -#define PageResave(page) (page_resave_map ? 
\ - memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageResave(page) \ - (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageResave(page) \ - (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosave(page) (nosave_map ? \ - memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosave(page) \ - (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosave(page) \ - (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page))) - -#define PageNosaveFree(page) (free_map ? \ - memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageNosaveFree(page) \ - (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageNosaveFree(page) \ - (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page))) - -#define PageCompareChanged(page) (compare_map ? \ - memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0) -#define SetPageCompareChanged(page) \ - (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page))) -#define ClearPageCompareChanged(page) \ - (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page))) - -extern void save_pageflags(struct memory_bitmap *pagemap); -extern int load_pageflags(struct memory_bitmap *pagemap); -extern int toi_pageflags_space_needed(void); -#endif diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c deleted file mode 100644 index 7c78773cf..000000000 --- a/kernel/power/tuxonice_power_off.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.c - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for powering down. - */ - -#include <linux/device.h> -#include <linux/suspend.h> -#include <linux/mm.h> -#include <linux/pm.h> -#include <linux/reboot.h> -#include <linux/cpu.h> -#include <linux/console.h> -#include <linux/fs.h> -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_power_off.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" - -unsigned long toi_poweroff_method; /* 0 - Kernel power off */ - -static int wake_delay; -static char lid_state_file[256], wake_alarm_dir[256]; -static struct file *lid_file, *alarm_file, *epoch_file; -static int post_wake_state = -1; - -static int did_suspend_to_both; - -/* - * __toi_power_down - * Functionality : Powers down or reboots the computer once the image - * has been written to disk. - * Key Assumptions : Able to reboot/power down via code called or that - * the warning emitted if the calls fail will be visible - * to the user (ie printk resumes devices). - */ - -static void __toi_power_down(int method) -{ - int error; - - toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." : - "Powering down."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (test_action_state(TOI_REBOOT)) - kernel_restart(NULL); - - switch (method) { - case 0: - break; - case 3: - /* - * Re-read the overwritten part of pageset2 to make post-resume - * faster. - */ - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. 
" - "Try rebooting."); - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (!error) { - pm_restore_gfp_mask(); - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - pm_restrict_gfp_mask(); - if (!error) - did_suspend_to_both = 1; - } - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - - /* Success - we're now post-resume-from-ram */ - if (did_suspend_to_both) - return; - - /* Failed to suspend to ram - do normal power off */ - break; - case 4: - /* - * If succeeds, doesn't return. If fails, do a simple - * powerdown. - */ - hibernation_platform_enter(); - break; - case 5: - /* Historic entry only now */ - break; - } - - if (method && method != 5) - toi_cond_pause(1, - "Falling back to alternate power off method."); - - if (test_result_state(TOI_ABORTED)) - goto out; - - if (pm_power_off) - kernel_power_off(); - kernel_halt(); - toi_cond_pause(1, "Powerdown failed."); - while (1) - cpu_relax(); - -out: - if (read_pageset2(1)) - panic("Attempt to reload pagedir 2 failed. Try rebooting."); - return; -} - -#define CLOSE_FILE(file) \ - if (file) { \ - filp_close(file, NULL); file = NULL; \ - } - -static void powerdown_cleanup(int toi_or_resume) -{ - if (!toi_or_resume) - return; - - CLOSE_FILE(lid_file); - CLOSE_FILE(alarm_file); - CLOSE_FILE(epoch_file); -} - -static void open_file(char *format, char *arg, struct file **var, int mode, - char *desc) -{ - char buf[256]; - - if (strlen(arg)) { - sprintf(buf, format, arg); - *var = filp_open(buf, mode, 0); - if (IS_ERR(*var) || !*var) { - printk(KERN_INFO "Failed to open %s file '%s' (%p).\n", - desc, buf, *var); - *var = NULL; - } - } -} - -static int powerdown_init(int toi_or_resume) -{ - if (!toi_or_resume) - return 0; - - did_suspend_to_both = 0; - - open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file, - O_RDONLY, "lid"); - - if (strlen(wake_alarm_dir)) { - open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir, - &alarm_file, O_WRONLY, "alarm"); - - open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir, - &epoch_file, O_RDONLY, "epoch"); - } - - return 0; -} - -static int lid_closed(void) -{ - char array[25]; - ssize_t size; - loff_t pos = 0; - - if (!lid_file) - return 0; - - size = vfs_read(lid_file, (char __user *) array, 25, &pos); - if ((int) size < 1) { - printk(KERN_INFO "Failed to read lid state file (%d).\n", - (int) size); - return 0; - } - - if (!strcmp(array, "state: closed\n")) - return 1; - - return 0; -} - -static void write_alarm_file(int value) -{ - ssize_t size; - char buf[40]; - loff_t pos = 0; - - if (!alarm_file) - return; - - sprintf(buf, "%d\n", value); - - size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos); - - if (size < 0) - printk(KERN_INFO "Error %d writing alarm value %s.\n", - (int) size, buf); -} - -/** - * toi_check_resleep: See whether to powerdown again after waking. - * - * After waking, check whether we should powerdown again in a (usually - * different) way. We only do this if the lid switch is still closed. - */ -void toi_check_resleep(void) -{ - /* We only return if we suspended to ram and woke. 
*/ - if (lid_closed() && post_wake_state >= 0) - __toi_power_down(post_wake_state); -} - -void toi_power_down(void) -{ - if (alarm_file && wake_delay) { - char array[25]; - loff_t pos = 0; - size_t size = vfs_read(epoch_file, (char __user *) array, 25, - &pos); - - if (((int) size) < 1) - printk(KERN_INFO "Failed to read epoch file (%d).\n", - (int) size); - else { - unsigned long since_epoch; - if (!kstrtoul(array, 0, &since_epoch)) { - /* Clear any wakeup time. */ - write_alarm_file(0); - - /* Set new wakeup time. */ - write_alarm_file(since_epoch + wake_delay); - } - } - } - - __toi_power_down(toi_poweroff_method); - - toi_check_resleep(); -} - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_ACPI) - SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL), - SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL), - SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL), - SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0, - NULL), - SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0), - SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both, - 0, 0, 0, NULL) -#endif -}; - -static struct toi_module_ops powerdown_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "poweroff", - .initialise = powerdown_init, - .cleanup = powerdown_cleanup, - .directory = "[ROOT]", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_poweroff_init(void) -{ - return toi_register_module(&powerdown_ops); -} - -void toi_poweroff_exit(void) -{ - toi_unregister_module(&powerdown_ops); -} diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h deleted file mode 100644 index 6e1d8bb39..000000000 --- a/kernel/power/tuxonice_power_off.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * kernel/power/tuxonice_power_off.h - * - * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Support for the powering down. - */ - -int toi_pm_state_finish(void); -void toi_power_down(void); -extern unsigned long toi_poweroff_method; -int toi_poweroff_init(void); -void toi_poweroff_exit(void); -void toi_check_resleep(void); - -extern int platform_begin(int platform_mode); -extern int platform_pre_snapshot(int platform_mode); -extern void platform_leave(int platform_mode); -extern void platform_end(int platform_mode); -extern void platform_finish(int platform_mode); -extern int platform_pre_restore(int platform_mode); -extern void platform_restore_cleanup(int platform_mode); diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c deleted file mode 100644 index a10d62080..000000000 --- a/kernel/power/tuxonice_prepare_image.c +++ /dev/null @@ -1,1080 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.c - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * We need to eat memory until we can: - * 1. Perform the save without changing anything (RAM_NEEDED < #pages) - * 2. Fit it all in available space (toiActiveAllocator->available_space() >= - * main_storage_needed()) - * 3. Reload the pagedir and pageset1 to places that don't collide with their - * final destinations, not knowing to what extent the resumed kernel will - * overlap with the one loaded at boot time. 
I think the resumed kernel - * should overlap completely, but I don't want to rely on this as it is - * an unproven assumption. We therefore assume there will be no overlap at - * all (worse case). - * 4. Meet the user's requested limit (if any) on the size of the image. - * The limit is in MB, so pages/256 (assuming 4K pages). - * - */ - -#include <linux/highmem.h> -#include <linux/freezer.h> -#include <linux/hardirq.h> -#include <linux/mmzone.h> -#include <linux/console.h> -#include <linux/tuxonice.h> - -#include "tuxonice_pageflags.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_prepare_image.h" -#include "tuxonice.h" -#include "tuxonice_extent.h" -#include "tuxonice_checksum.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_alloc.h" -#include "tuxonice_atomic_copy.h" -#include "tuxonice_builtin.h" - -static unsigned long num_nosave, main_storage_allocated, storage_limit, - header_storage_needed; -unsigned long extra_pd1_pages_allowance = - CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE; -long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT; -static int no_ps2_needed; - -struct attention_list { - struct task_struct *task; - struct attention_list *next; -}; - -static struct attention_list *attention_list; - -#define PAGESET1 0 -#define PAGESET2 1 - -void free_attention_list(void) -{ - struct attention_list *last = NULL; - - while (attention_list) { - last = attention_list; - attention_list = attention_list->next; - toi_kfree(6, last, sizeof(*last)); - } -} - -static int build_attention_list(void) -{ - int i, task_count = 0; - struct task_struct *p; - struct attention_list *next; - - /* - * Count all userspace process (with task->mm) marked PF_NOFREEZE. - */ - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) - task_count++; - toi_read_unlock_tasklist(); - - /* - * Allocate attention list structs. - */ - for (i = 0; i < task_count; i++) { - struct attention_list *this = - toi_kzalloc(6, sizeof(struct attention_list), - TOI_WAIT_GFP); - if (!this) { - printk(KERN_INFO "Failed to allocate slab for " - "attention list.\n"); - free_attention_list(); - return 1; - } - this->next = NULL; - if (attention_list) - this->next = attention_list; - attention_list = this; - } - - next = attention_list; - toi_read_lock_tasklist(); - for_each_process(p) - if ((p->flags & PF_NOFREEZE) || p == current) { - next->task = p; - next = next->next; - } - toi_read_unlock_tasklist(); - return 0; -} - -static void pageset2_full(void) -{ - struct zone *zone; - struct page *page; - unsigned long flags; - int i; - - toi_trace_index++; - - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lru_lock, flags); - for_each_lru(i) { - if (!zone_page_state(zone, NR_LRU_BASE + i)) - continue; - - list_for_each_entry(page, &zone->lruvec.lists[i], lru) { - struct address_space *mapping; - - mapping = page_mapping(page); - if (!mapping || !mapping->host || - !(mapping->host->i_flags & S_ATOMIC_COPY)) { - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified."); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full."); - SetPagePageset2(page); - } - } - } - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - } -} - -/* - * toi_mark_task_as_pageset - * Functionality : Marks all the saveable pages belonging to a given process - * as belonging to a particular pageset. 
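The MB-to-pages conversion mentioned in the file header above (and applied later in any_to_free() as image_size_limit << 8) is easy to sanity-check in isolation; this assumes 4K pages:

#include <stdio.h>

int main(void)
{
        long image_size_limit_mb = 100;         /* illustrative limit */
        /* 1 MB / 4 KB = 256 pages per MB, hence the << 8 */
        unsigned long soft_limit_pages = image_size_limit_mb << 8;

        printf("%ld MB -> %lu pages\n",
               image_size_limit_mb, soft_limit_pages);  /* 25600 */
        return 0;
}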
- */ - -static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2) -{ - struct vm_area_struct *vma; - struct mm_struct *mm; - - mm = t->active_mm; - - if (!mm || !mm->mmap) - return; - - toi_trace_index++; - - if (!irqs_disabled()) - down_read(&mm->mmap_sem); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long posn; - - if (!vma->vm_start || - vma->vm_flags & VM_PFNMAP) - continue; - - for (posn = vma->vm_start; posn < vma->vm_end; - posn += PAGE_SIZE) { - struct page *page = follow_page(vma, posn, 0); - struct address_space *mapping; - - if (!page || !pfn_valid(page_to_pfn(page))) - continue; - - mapping = page_mapping(page); - if (mapping && mapping->host && - mapping->host->i_flags & S_ATOMIC_COPY && pageset2) - continue; - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2); - continue; - } - - if (pageset2) { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1"); - SetPagePageset2(page); - } else { - TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2"); - ClearPagePageset2(page); - SetPagePageset1(page); - } - } - } - - if (!irqs_disabled()) - up_read(&mm->mmap_sem); -} - -static void mark_tasks(int pageset) -{ - struct task_struct *p; - - toi_read_lock_tasklist(); - for_each_process(p) { - if (!p->mm) - continue; - - if (p->flags & PF_KTHREAD) - continue; - - toi_mark_task_as_pageset(p, pageset); - } - toi_read_unlock_tasklist(); - -} - -/* mark_pages_for_pageset2 - * - * Description: Mark unshared pages in processes not needed for hibernate as - * being able to be written out in a separate pagedir. - * HighMem pages are simply marked as pageset2. They won't be - * needed during hibernate. - */ - -static void toi_mark_pages_for_pageset2(void) -{ - struct attention_list *this = attention_list; - - memory_bm_clear(pageset2_map); - - if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed) - return; - - if (test_action_state(TOI_PAGESET2_FULL)) - pageset2_full(); - else - mark_tasks(PAGESET2); - - /* - * Because the tasks in attention_list are ones related to hibernating, - * we know that they won't go away under us. - */ - - while (this) { - if (!test_result_state(TOI_ABORTED)) - toi_mark_task_as_pageset(this->task, PAGESET1); - this = this->next; - } -} - -/* - * The atomic copy of pageset1 is stored in pageset2 pages. - * But if pageset1 is larger (normally only just after boot), - * we need to allocate extra pages to store the atomic copy. - * The following data struct and functions are used to handle - * the allocation and freeing of that memory. - */ - -static unsigned long extra_pages_allocated; - -struct extras { - struct page *page; - int order; - struct extras *next; -}; - -static struct extras *extras_list; - -/* toi_free_extra_pagedir_memory - * - * Description: Free previously allocated extra pagedir memory. - */ -void toi_free_extra_pagedir_memory(void) -{ - /* Free allocated pages */ - while (extras_list) { - struct extras *this = extras_list; - int i; - - extras_list = this->next; - - for (i = 0; i < (1 << this->order); i++) - ClearPageNosave(this->page + i); - - toi_free_pages(9, this->page, this->order); - toi_kfree(7, this, sizeof(*this)); - } - - extra_pages_allocated = 0; -} - -/* toi_allocate_extra_pagedir_memory - * - * Description: Allocate memory for making the atomic copy of pagedir1 in the - * case where it is bigger than pagedir2. - * Arguments: int num_to_alloc: Number of extra pages needed. - * Result: int. 
Number of extra pages we now have allocated. - */ -static int toi_allocate_extra_pagedir_memory(int extra_pages_needed) -{ - int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated; - gfp_t flags = TOI_ATOMIC_GFP; - - if (num_to_alloc < 1) - return 0; - - order = fls(num_to_alloc); - if (order >= MAX_ORDER) - order = MAX_ORDER - 1; - - while (num_to_alloc) { - struct page *newpage; - unsigned long virt; - struct extras *extras_entry; - - while ((1 << order) > num_to_alloc) - order--; - - extras_entry = (struct extras *) toi_kzalloc(7, - sizeof(struct extras), TOI_ATOMIC_GFP); - - if (!extras_entry) - return extra_pages_allocated; - - virt = toi_get_free_pages(9, flags, order); - while (!virt && order) { - order--; - virt = toi_get_free_pages(9, flags, order); - } - - if (!virt) { - toi_kfree(7, extras_entry, sizeof(*extras_entry)); - return extra_pages_allocated; - } - - newpage = virt_to_page(virt); - - extras_entry->page = newpage; - extras_entry->order = order; - extras_entry->next = extras_list; - - extras_list = extras_entry; - - for (j = 0; j < (1 << order); j++) { - SetPageNosave(newpage + j); - SetPagePageset1Copy(newpage + j); - } - - extra_pages_allocated += (1 << order); - num_to_alloc -= (1 << order); - } - - return extra_pages_allocated; -} - -/* - * real_nr_free_pages: Count pcp pages for a zone type or all zones - * (-1 for all, otherwise zone_idx() result desired). - */ -unsigned long real_nr_free_pages(unsigned long zone_idx_mask) -{ - struct zone *zone; - int result = 0, cpu; - - /* PCP lists */ - for_each_populated_zone(zone) { - if (!(zone_idx_mask & (1 << zone_idx(zone)))) - continue; - - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pset = - per_cpu_ptr(zone->pageset, cpu); - struct per_cpu_pages *pcp = &pset->pcp; - result += pcp->count; - } - - result += zone_page_state(zone, NR_FREE_PAGES); - } - return result; -} - -/* - * Discover how much extra memory will be required by the drivers - * when they're asked to hibernate. We can then ensure that amount - * of memory is available when we really want it. - */ -static void get_extra_pd1_allowance(void) -{ - unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final; - - toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers."); - - if (toi_go_atomic(PMSG_FREEZE, 1)) - return; - - final = real_nr_free_pages(all_zones_mask); - toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0); - - extra_pd1_pages_allowance = (orig_num_free > final) ? - orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE : - MIN_EXTRA_PAGES_ALLOWANCE; -} - -/* - * Amount of storage needed, possibly taking into account the - * expected compression ratio and possibly also ignoring our - * allowance for extra pages. - */ -static unsigned long main_storage_needed(int use_ecr, - int ignore_extra_pd1_allow) -{ - return (pagedir1.size + pagedir2.size + - (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) * - (use_ecr ? toi_expected_compression_ratio() : 100) / 100; -} - -/* - * Storage needed for the image header, in bytes until the return. - */ -unsigned long get_header_storage_needed(void) -{ - unsigned long bytes = sizeof(struct toi_header) + - toi_header_storage_for_modules() + - toi_pageflags_space_needed() + - fs_info_space_needed(); - - return DIV_ROUND_UP(bytes, PAGE_SIZE); -} - -/* - * When freeing memory, pages from either pageset might be freed. 
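toi_allocate_extra_pagedir_memory() above satisfies a request with power-of-two blocks: start from the order given by fls(), shrink the order whenever a block would overshoot what is still needed, and also shrink on allocation failure. A standalone sketch of the splitting arithmetic (the fls() stand-in uses the GCC builtin __builtin_clzl, and the MAX_ORDER value is illustrative; actual allocation is elided):

#include <stdio.h>

#define MAX_ORDER 11

static int fls_long(unsigned long x)
{
	return x ? (int)(8 * sizeof(long)) - __builtin_clzl(x) : 0;
}

int main(void)
{
	unsigned long num_to_alloc = 1000, allocated = 0;
	int order = fls_long(num_to_alloc);

	if (order >= MAX_ORDER)
		order = MAX_ORDER - 1;

	while (num_to_alloc) {
		/* shrink the block until it no longer overshoots */
		while ((1UL << order) > num_to_alloc)
			order--;

		printf("allocate a 2^%d-page block\n", order);
		allocated += 1UL << order;
		num_to_alloc -= 1UL << order;
	}
	printf("total: %lu pages\n", allocated);   /* 1000 */
	return 0;
}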
- * - * When seeking to free memory to be able to hibernate, for every ps1 page - * freed, we need 2 less pages for the atomic copy because there is one less - * page to copy and one more page into which data can be copied. - * - * Freeing ps2 pages saves us nothing directly. No more memory is available - * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but - * that's too much work to figure out. - * - * => ps1_to_free functions - * - * Of course if we just want to reduce the image size, because of storage - * limitations or an image size limit either ps will do. - * - * => any_to_free function - */ - -static unsigned long lowpages_usable_for_highmem_copy(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return available > needed ? available - needed : 0; -} - -static unsigned long highpages_ps1_to_free(void) -{ - unsigned long need = get_highmem_size(pagedir1), - available = get_highmem_size(pagedir2) + - real_nr_free_high_pages() + - lowpages_usable_for_highmem_copy(); - - return need > available ? DIV_ROUND_UP(need - available, 2) : 0; -} - -static unsigned long lowpages_ps1_to_free(void) -{ - unsigned long needed = get_lowmem_size(pagedir1) + - extra_pd1_pages_allowance + MIN_FREE_RAM + - toi_memory_for_modules(0), - available = get_lowmem_size(pagedir2) + - real_nr_free_low_pages() + extra_pages_allocated; - - return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0; -} - -static unsigned long current_image_size(void) -{ - return pagedir1.size + pagedir2.size + header_storage_needed; -} - -static unsigned long storage_still_required(void) -{ - unsigned long needed = main_storage_needed(1, 1); - return needed > storage_limit ? needed - storage_limit : 0; -} - -static unsigned long ram_still_required(void) -{ - unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) + - 2 * extra_pd1_pages_allowance, - available = real_nr_free_low_pages() + extra_pages_allocated; - return needed > available ? needed - available : 0; -} - -unsigned long any_to_free(int use_image_size_limit) -{ - int use_soft_limit = use_image_size_limit && image_size_limit > 0; - unsigned long current_size = current_image_size(), - soft_limit = use_soft_limit ? (image_size_limit << 8) : 0, - to_free = use_soft_limit ? (current_size > soft_limit ? - current_size - soft_limit : 0) : 0, - storage_limit = storage_still_required(), - ram_limit = ram_still_required(), - first_max = max(to_free, storage_limit); - - return max(first_max, ram_limit); -} - -static int need_pageset2(void) -{ - return (real_nr_free_low_pages() + extra_pages_allocated - - 2 * extra_pd1_pages_allowance - MIN_FREE_RAM - - toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size; -} - -/* amount_needed - * - * Calculates the amount by which the image size needs to be reduced to meet - * our constraints. 
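The helpers above share two idioms worth spelling out: the saturating "needed > available ? needed - available : 0" form, which avoids a huge bogus result when unsigned subtraction would wrap, and DIV_ROUND_UP(deficit, 2) in the ps1 cases, because each freed pageset1 page counts twice (one page fewer to copy, one more page to copy into). A worked sketch with illustrative numbers:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long ps1_to_free(unsigned long need, unsigned long available)
{
	/* never compute need - available when available >= need */
	return need > available ? DIV_ROUND_UP(need - available, 2) : 0;
}

int main(void)
{
	printf("%lu\n", ps1_to_free(1000, 300)); /* DIV_ROUND_UP(700, 2) = 350 */
	printf("%lu\n", ps1_to_free(200, 300));  /* 0, not a wrapped huge value */
	return 0;
}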
- */ -static unsigned long amount_needed(int use_image_size_limit) -{ - return max(highpages_ps1_to_free() + lowpages_ps1_to_free(), - any_to_free(use_image_size_limit)); -} - -static int image_not_ready(int use_image_size_limit) -{ - toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, - "Amount still needed (%lu) > 0:%u," - " Storage allocd: %lu < %lu: %u.\n", - amount_needed(use_image_size_limit), - (amount_needed(use_image_size_limit) > 0), - main_storage_allocated, - main_storage_needed(1, 1), - main_storage_allocated < main_storage_needed(1, 1)); - - toi_cond_pause(0, NULL); - - return (amount_needed(use_image_size_limit) > 0) || - main_storage_allocated < main_storage_needed(1, 1); -} - -static void display_failure_reason(int tries_exceeded) -{ - unsigned long storage_required = storage_still_required(), - ram_required = ram_still_required(), - high_ps1 = highpages_ps1_to_free(), - low_ps1 = lowpages_ps1_to_free(); - - printk(KERN_INFO "Failed to prepare the image because...\n"); - - if (!storage_limit) { - printk(KERN_INFO "- You need some storage available to be " - "able to hibernate.\n"); - return; - } - - if (tries_exceeded) - printk(KERN_INFO "- The maximum number of iterations was " - "reached without successfully preparing the " - "image.\n"); - - if (storage_required) { - printk(KERN_INFO " - We need at least %lu pages of storage " - "(ignoring the header), but only have %lu.\n", - main_storage_needed(1, 1), - main_storage_allocated); - set_abort_result(TOI_INSUFFICIENT_STORAGE); - } - - if (ram_required) { - printk(KERN_INFO " - We need %lu more free pages of low " - "memory.\n", ram_required); - printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM); - printk(KERN_INFO " + Reqd. by modules : %8lu\n", - toi_memory_for_modules(0)); - printk(KERN_INFO " + 2 * extra allow : %8lu\n", - 2 * extra_pd1_pages_allowance); - printk(KERN_INFO " - Currently free : %8lu\n", - real_nr_free_low_pages()); - printk(KERN_INFO " - Pages allocd : %8lu\n", - extra_pages_allocated); - printk(KERN_INFO " : ========\n"); - printk(KERN_INFO " Still needed : %8lu\n", - ram_required); - - /* Print breakdown of memory needed for modules */ - toi_memory_for_modules(1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (high_ps1) { - printk(KERN_INFO "- We need to free %lu highmem pageset 1 " - "pages.\n", high_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } - - if (low_ps1) { - printk(KERN_INFO " - We need to free %ld lowmem pageset 1 " - "pages.\n", low_ps1); - set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); - } -} - -static void display_stats(int always, int sub_extra_pd1_allow) -{ - char buffer[255]; - snprintf(buffer, 254, - "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). " - "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). " - "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n", - - /* Free */ - real_nr_free_pages(all_zones_mask), - real_nr_free_low_pages(), - - /* Sets */ - pagedir1.size, pagedir1.size - get_highmem_size(pagedir1), - pagedir2.size, pagedir2.size - get_highmem_size(pagedir2), - - /* Nosave */ - num_nosave, extra_pages_allocated, - num_nosave - extra_pages_allocated, - - /* Storage */ - main_storage_allocated, - storage_limit, - main_storage_needed(1, sub_extra_pd1_allow), - main_storage_needed(1, 1), - - /* Needed */ - lowpages_ps1_to_free(), highpages_ps1_to_free(), - any_to_free(1), - MIN_FREE_RAM, toi_memory_for_modules(0), - extra_pd1_pages_allowance, - image_size_limit, - - need_pageset2() ? 
"yes" : "no"); - - if (always) - printk("%s", buffer); - else - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer); -} - -/* flag_image_pages - * - * This routine generates our lists of pages to be stored in each - * pageset. Since we store the data using extents, and adding new - * extents might allocate a new extent page, this routine may well - * be called more than once. - */ -static void flag_image_pages(int atomic_copy) -{ - int num_free = 0, num_unmodified = 0; - unsigned long loop; - struct zone *zone; - - pagedir1.size = 0; - pagedir2.size = 0; - - set_highmem_size(pagedir1, 0); - set_highmem_size(pagedir2, 0); - - num_nosave = 0; - toi_trace_index++; - - memory_bm_clear(pageset1_map); - - toi_generate_free_page_map(); - - /* - * Pages not to be saved are marked Nosave irrespective of being - * reserved. - */ - for_each_populated_zone(zone) { - int highmem = is_highmem(zone); - - for (loop = 0; loop < zone->spanned_pages; loop++) { - unsigned long pfn = zone->zone_start_pfn + loop; - struct page *page; - int chunk_size; - - if (!pfn_valid(pfn)) { - TOI_TRACE_DEBUG(pfn, "_Flag Invalid"); - continue; - } - - chunk_size = toi_size_of_free_region(zone, pfn); - if (chunk_size) { - unsigned long y; - for (y = pfn; y < pfn + chunk_size; y++) { - page = pfn_to_page(y); - TOI_TRACE_DEBUG(y, "_Flag Free"); - ClearPagePageset1(page); - ClearPagePageset2(page); - } - num_free += chunk_size; - loop += chunk_size - 1; - continue; - } - - page = pfn_to_page(pfn); - - if (PageNosave(page)) { - char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave"; - TOI_TRACE_DEBUG(pfn, "_Flag %s", desc); - num_nosave++; - continue; - } - - page = highmem ? saveable_highmem_page(zone, pfn) : - saveable_page(zone, pfn); - - if (!page) { - TOI_TRACE_DEBUG(pfn, "_Flag Nosave2"); - num_nosave++; - continue; - } - - if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { - TOI_TRACE_DEBUG(pfn, "_Unmodified"); - num_unmodified++; - continue; - } - - if (PagePageset2(page)) { - pagedir2.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS2"); - if (PageHighMem(page)) - inc_highmem_size(pagedir2); - else - SetPagePageset1Copy(page); - if (PageResave(page)) { - SetPagePageset1(page); - ClearPagePageset1Copy(page); - pagedir1.size++; - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } else { - pagedir1.size++; - TOI_TRACE_DEBUG(pfn, "_Flag PS1"); - SetPagePageset1(page); - if (PageHighMem(page)) - inc_highmem_size(pagedir1); - } - } - } - - if (!atomic_copy) - toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0, - "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)" - " + Unmodified (%d) + NumFree (%d) = %d.\n", - pagedir1.size, pagedir2.size, num_nosave, num_unmodified, - num_free, pagedir1.size + pagedir2.size + num_nosave + num_free); -} - -void toi_recalculate_image_contents(int atomic_copy) -{ - memory_bm_clear(pageset1_map); - if (!atomic_copy) { - unsigned long pfn; - memory_bm_position_reset(pageset2_map); - for (pfn = memory_bm_next_pfn(pageset2_map, 0); - pfn != BM_END_OF_MAP; - pfn = memory_bm_next_pfn(pageset2_map, 0)) - ClearPagePageset1Copy(pfn_to_page(pfn)); - /* Need to call this before getting pageset1_size! 
*/
- toi_mark_pages_for_pageset2();
- }
- memory_bm_position_reset(pageset2_map);
- flag_image_pages(atomic_copy);
-
- if (!atomic_copy) {
- storage_limit = toiActiveAllocator->storage_available();
- display_stats(0, 0);
- }
-}
-
-int try_allocate_extra_memory(void)
-{
- unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance -
- get_lowmem_size(pagedir2);
- if (wanted > extra_pages_allocated) {
- unsigned long got = toi_allocate_extra_pagedir_memory(wanted);
- if (got < wanted) {
- toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
- "Want %lu extra pages for pageset1, got %lu.\n",
- wanted, got);
- return 1;
- }
- }
- return 0;
-}
-
-/* update_image
- *
- * Allocate [more] memory and storage for the image.
- */
-static void update_image(int ps2_recalc)
-{
- int old_header_req;
- unsigned long seek;
-
- if (try_allocate_extra_memory())
- return;
-
- if (ps2_recalc)
- goto recalc;
-
- thaw_kernel_threads();
-
- /*
- * Allocate remaining storage space, if possible, up to the
- * maximum we know we'll need. It's okay to allocate the
- * maximum if the writer is the swapwriter, but
- * we don't want to grab all available space on an NFS share.
- * We therefore ignore the expected compression ratio here,
- * thereby trying to allocate the maximum image size we could
- * need (assuming compression doesn't expand the image), but
- * don't complain if we can't get the full amount we're after.
- */
-
- do {
- int result;
-
- old_header_req = header_storage_needed;
- toiActiveAllocator->reserve_header_space(header_storage_needed);
-
- /* How much storage is free with the reservation applied? */
- storage_limit = toiActiveAllocator->storage_available();
- seek = min(storage_limit, main_storage_needed(0, 0));
-
- result = toiActiveAllocator->allocate_storage(seek);
- if (result)
- printk("Failed to allocate storage (%d).\n", result);
-
- main_storage_allocated =
- toiActiveAllocator->storage_allocated();
-
- /* Need more header because more storage allocated? */
- header_storage_needed = get_header_storage_needed();
-
- } while (header_storage_needed > old_header_req);
-
- if (freeze_kernel_threads())
- set_abort_result(TOI_FREEZING_FAILED);
-
-recalc:
- toi_recalculate_image_contents(0);
-}
-
-/* attempt_to_freeze
- *
- * Try to freeze processes.
- */
-
-static int attempt_to_freeze(void)
-{
- int result;
-
- /* Stop processes before checking again */
- toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
- "filesystems.");
- result = freeze_processes();
-
- if (result)
- set_abort_result(TOI_FREEZING_FAILED);
-
- result = freeze_kernel_threads();
-
- if (result)
- set_abort_result(TOI_FREEZING_FAILED);
-
- return result;
-}
-
-/* eat_memory
- *
- * Try to free some memory, either to meet hard or soft constraints on the image
- * characteristics.
- *
- * Hard constraints:
- * - Pageset1 must be < half of memory;
- * - We must have enough memory free at resume time to have pageset1
- * be able to be loaded in pages that don't conflict with where it has to
- * be restored.
- * Soft constraints:
- * - User specified image size limit.
- */
-static void eat_memory(void)
-{
- unsigned long amount_wanted = 0;
- int did_eat_memory = 0;
-
- /*
- * Note that if we have enough storage space and enough free memory, we
- * may exit without eating anything. We give up when the last 10
- * iterations ate no extra pages because we're not going to get much
- * more anyway, but the few pages we get will take a lot of time.
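The do/while in update_image() above is a fixed-point iteration: reserving header space changes how much main storage can be allocated, and allocating more storage can in turn grow the header requirement, so the loop repeats until the header estimate stops growing. A toy model of why that converges (header_pages_for() is a made-up stand-in for get_header_storage_needed()):

#include <stdio.h>

static unsigned long header_pages_for(unsigned long allocated)
{
	/* illustrative: one header page per 512 data pages, plus one */
	return allocated / 512 + 1;
}

int main(void)
{
	unsigned long allocated = 10000, header = 1, old_header;

	do {
		old_header = header;
		/* reserve 'header', allocate data, re-derive the header need */
		header = header_pages_for(allocated + header);
	} while (header > old_header);

	printf("stable header estimate: %lu pages\n", header);   /* 20 */
	return 0;
}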
- * - * We freeze processes before beginning, and then unfreeze them if we - * need to eat memory until we think we have enough. If our attempts - * to freeze fail, we give up and abort. - */ - - amount_wanted = amount_needed(1); - - switch (image_size_limit) { - case -1: /* Don't eat any memory */ - if (amount_wanted > 0) { - set_abort_result(TOI_WOULD_EAT_MEMORY); - return; - } - break; - case -2: /* Free caches only */ - drop_pagecache(); - toi_recalculate_image_contents(0); - amount_wanted = amount_needed(1); - break; - default: - break; - } - - if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) && - image_size_limit != -1) { - unsigned long request = amount_wanted; - unsigned long high_req = max(highpages_ps1_to_free(), - any_to_free(1)); - unsigned long low_req = lowpages_ps1_to_free(); - unsigned long got = 0; - - toi_prepare_status(CLEAR_BAR, - "Seeking to free %ldMB of memory.", - MB(amount_wanted)); - - thaw_kernel_threads(); - - /* - * Ask for too many because shrink_memory_mask doesn't - * currently return enough most of the time. - */ - - if (low_req) - got = shrink_memory_mask(low_req, GFP_KERNEL); - if (high_req) - shrink_memory_mask(high_req - got, GFP_HIGHUSER); - - did_eat_memory = 1; - - toi_recalculate_image_contents(0); - - amount_wanted = amount_needed(1); - - printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &" - " %ld pages from anywhere, got %ld.\n", - high_req, low_req, - request - amount_wanted); - - toi_cond_pause(0, NULL); - - if (freeze_kernel_threads()) - set_abort_result(TOI_FREEZING_FAILED); - } - - if (did_eat_memory) - toi_recalculate_image_contents(0); -} - -/* toi_prepare_image - * - * Entry point to the whole image preparation section. - * - * We do four things: - * - Freeze processes; - * - Ensure image size constraints are met; - * - Complete all the preparation for saving the image, - * including allocation of storage. The only memory - * that should be needed when we're finished is that - * for actually storing the image (and we know how - * much is needed for that because the modules tell - * us). - * - Make sure that all dirty buffers are written out. - */ -#define MAX_TRIES 2 -int toi_prepare_image(void) -{ - int result = 1, tries = 1; - - main_storage_allocated = 0; - no_ps2_needed = 0; - - if (attempt_to_freeze()) - return 1; - - lock_device_hotplug(); - set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); - - if (!extra_pd1_pages_allowance) - get_extra_pd1_allowance(); - - storage_limit = toiActiveAllocator->storage_available(); - - if (!storage_limit) { - printk(KERN_INFO "No storage available. Didn't try to prepare " - "an image.\n"); - display_failure_reason(0); - set_abort_result(TOI_NOSTORAGE_AVAILABLE); - return 1; - } - - if (build_attention_list()) { - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - return 1; - } - - toi_recalculate_image_contents(0); - - do { - toi_prepare_status(CLEAR_BAR, - "Preparing Image. Try %d.", tries); - - eat_memory(); - - if (test_result_state(TOI_ABORTED)) - break; - - update_image(0); - - tries++; - - } while (image_not_ready(1) && tries <= MAX_TRIES && - !test_result_state(TOI_ABORTED)); - - result = image_not_ready(0); - - /* TODO: Handle case where need to remove existing image and resave - * instead of adding to incremental image. 
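toi_prepare_image() above drives everything from a bounded retry loop: keep eating memory and updating allocations while the image is still not ready, a try budget remains, and no abort has been flagged. The control-flow skeleton, reduced to standalone C with stub predicates (names and numbers illustrative):

#include <stdbool.h>
#include <stdio.h>

#define MAX_TRIES 2

static bool aborted;
static int shortfall = 3;            /* pretend pages still to be freed */

static bool image_not_ready(void) { return shortfall > 0; }
static void try_to_fix(void)      { shortfall--; }

int main(void)
{
	int tries = 1;

	do {
		printf("Preparing image. Try %d.\n", tries);
		try_to_fix();
		if (aborted)
			break;
		tries++;
	} while (image_not_ready() && tries <= MAX_TRIES && !aborted);

	/* non-zero means the constraints were never met within budget */
	return image_not_ready() ? 1 : 0;
}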
*/ - - if (!test_result_state(TOI_ABORTED)) { - if (result) { - display_stats(1, 0); - display_failure_reason(tries > MAX_TRIES); - abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, - "Unable to successfully prepare the image.\n"); - } else { - /* Pageset 2 needed? */ - if (!need_pageset2() && - test_action_state(TOI_NO_PS2_IF_UNNEEDED)) { - no_ps2_needed = 1; - toi_recalculate_image_contents(0); - update_image(1); - } - - toi_cond_pause(1, "Image preparation complete."); - } - } - - return result ? result : allocate_checksum_pages(); -} diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h deleted file mode 100644 index c1508975c..000000000 --- a/kernel/power/tuxonice_prepare_image.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * kernel/power/tuxonice_prepare_image.h - * - * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - */ - -#include <asm/sections.h> - -extern int toi_prepare_image(void); -extern void toi_recalculate_image_contents(int storage_available); -extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask); -extern long image_size_limit; -extern void toi_free_extra_pagedir_memory(void); -extern unsigned long extra_pd1_pages_allowance; -extern void free_attention_list(void); - -#define MIN_FREE_RAM 100 -#define MIN_EXTRA_PAGES_ALLOWANCE 500 - -#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1)) -#ifdef CONFIG_HIGHMEM -#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM)) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \ - (1 << ZONE_HIGHMEM))) -#else -#define real_nr_free_high_pages() (0) -#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask)) - -/* For eat_memory function */ -#define ZONE_HIGHMEM (MAX_NR_ZONES + 1) -#endif - -unsigned long get_header_storage_needed(void); -unsigned long any_to_free(int use_image_size_limit); -int try_allocate_extra_memory(void); diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c deleted file mode 100644 index 5bc56d3a1..000000000 --- a/kernel/power/tuxonice_prune.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * kernel/power/tuxonice_prune.c - * - * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file implements a TuxOnIce module that seeks to prune the - * amount of data written to disk. It builds a table of hashes - * of the uncompressed data, and writes the pfn of the previous page - * with the same contents instead of repeating the data when a match - * is found. - */ - -#include <linux/suspend.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <linux/crypto.h> -#include <linux/scatterlist.h> -#include <crypto/hash.h> - -#include "tuxonice_builtin.h" -#include "tuxonice.h" -#include "tuxonice_modules.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_alloc.h" - -/* - * We never write a page bigger than PAGE_SIZE, so use a large number - * to indicate that data is a PFN. 
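PRUNE_DATA_IS_PFN, defined just below, is an in-band type tag: a real page payload can never exceed PAGE_SIZE bytes, so any larger "length" is impossible for data and can safely mean "this value is a PFN reference instead". The same idea in miniature:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DATA_IS_REF (PAGE_SIZE + 100)   /* impossible as a real length */

static void handle(unsigned long len, unsigned long payload)
{
	if (len == DATA_IS_REF)
		printf("reference to earlier page %lu\n", payload);
	else
		printf("literal data, %lu bytes\n", len);
}

int main(void)
{
	handle(4096, 0);           /* a full page of literal data */
	handle(DATA_IS_REF, 42);   /* "same contents as pfn 42" */
	return 0;
}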
- */
-#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100)
-
-static unsigned long toi_pruned_pages;
-
-static struct toi_module_ops toi_prune_ops;
-static struct toi_module_ops *next_driver;
-
-static char toi_prune_hash_algo_name[32] = "sha1";
-
-static DEFINE_MUTEX(stats_lock);
-
-struct cpu_context {
- struct shash_desc desc;
- char *digest;
-};
-
-#define OUT_BUF_SIZE (2 * PAGE_SIZE)
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-
-/*
- * toi_prune_crypto_prepare
- *
- * Prepare to do some work by allocating buffers and transforms.
- */
-static int toi_prune_crypto_prepare(void)
-{
- int cpu, ret, digestsize = 0;
-
- if (!*toi_prune_hash_algo_name) {
- printk(KERN_INFO "TuxOnIce: Pruning enabled but no "
- "hash algorithm set.\n");
- return 1;
- }
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0);
- if (IS_ERR(this->desc.tfm)) {
- printk(KERN_INFO "TuxOnIce: Failed to allocate the "
- "%s prune hash algorithm.\n",
- toi_prune_hash_algo_name);
- this->desc.tfm = NULL;
- return 1;
- }
-
- if (!digestsize)
- digestsize = crypto_shash_digestsize(this->desc.tfm);
-
- this->digest = kmalloc(digestsize, GFP_KERNEL);
- if (!this->digest) {
- printk(KERN_INFO "TuxOnIce: Failed to allocate space "
- "for digest output.\n");
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- return 1;
- }
-
- this->desc.flags = 0;
-
- ret = crypto_shash_init(&this->desc);
- if (ret < 0) {
- printk(KERN_INFO "TuxOnIce: Failed to initialise the "
- "%s prune hash algorithm.\n",
- toi_prune_hash_algo_name);
- kfree(this->digest);
- this->digest = NULL;
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- return 1;
- }
- }
-
- return 0;
-}
-
-static int toi_prune_rw_cleanup(int writing)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- if (this->desc.tfm) {
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- }
-
- if (this->digest) {
- kfree(this->digest);
- this->digest = NULL;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_prune_init
- */
-
-static int toi_prune_init(int toi_or_resume)
-{
- if (!toi_or_resume)
- return 0;
-
- toi_pruned_pages = 0;
-
- next_driver = toi_get_next_filter(&toi_prune_ops);
-
- return next_driver ? 0 : -ECHILD;
-}
-
-/*
- * toi_prune_rw_init()
- */
-
-static int toi_prune_rw_init(int rw, int stream_number)
-{
- if (toi_prune_crypto_prepare()) {
- printk(KERN_ERR "Failed to initialise prune "
- "algorithm.\n");
- if (rw == READ) {
- printk(KERN_INFO "Unable to read the image.\n");
- return -ENODEV;
- } else {
- printk(KERN_INFO "Continuing without "
- "pruning the image.\n");
- toi_prune_ops.enabled = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_prune_write_page()
- *
- * Hash a page of data, update the pruning statistics, and pass the
- * page on to the next module in the pipeline.
- *
- * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing
- * data to be checked.
- *
- * Returns: 0 on success. Otherwise the error is that returned by later
- * modules, or -ECHILD if we have a broken pipeline.
- */
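For reference, a hedged sketch of the 4.x-era shash calling convention the code above depends on. Note one difference from the per-cpu struct above: a shash descriptor is normally allocated with crypto_shash_descsize() extra bytes of transform context after struct shash_desc, and the output buffer must hold crypto_shash_digestsize() bytes. This illustrates the API pattern, not the module's own code:

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int hash_buffer(const char *algo, const void *data, unsigned int len,
		       u8 *out)
{
	struct crypto_shash *tfm = crypto_alloc_shash(algo, 0, 0);
	struct shash_desc *desc;
	int ret;

	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* descriptor = header + per-transform context */
	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;
	desc->flags = 0;

	/* init + update + final in one call */
	ret = crypto_shash_digest(desc, data, len, out);

	kfree(desc);
	crypto_free_shash(tfm);
	return ret;
}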
-static int toi_prune_write_page(unsigned long index, int buf_type,
- void *buffer_page, unsigned int buf_size)
-{
- int ret = 0, cpu = smp_processor_id(), write_data = 1;
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
- u8* output_buffer = buffer_page;
- int output_len = buf_size;
- int out_buf_type = buf_type;
- void *buffer_start;
-
- if (ctx->desc.tfm) {
-
- buffer_start = TOI_MAP(buf_type, buffer_page);
-
- ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, ctx->digest);
- if (ret) {
- printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret);
- } else {
- mutex_lock(&stats_lock);
-
- toi_pruned_pages++;
-
- mutex_unlock(&stats_lock);
-
- }
-
- TOI_UNMAP(buf_type, buffer_page);
- }
-
- if (write_data)
- ret = next_driver->write_page(index, out_buf_type,
- output_buffer, output_len);
- else
- ret = next_driver->write_page(index, out_buf_type,
- output_buffer, output_len);
-
- return ret;
-}
-
-/*
- * toi_prune_read_page()
- * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
- *
- * Retrieve data from later modules or from a previously loaded page and
- * fill the input buffer.
- * Returns: Zero if successful. Error condition from me or from downstream
- * on failure.
- */
-static int toi_prune_read_page(unsigned long *index, int buf_type,
- void *buffer_page, unsigned int *buf_size)
-{
- int ret, cpu = smp_processor_id();
- unsigned int len;
- char *buffer_start;
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
- if (!ctx->desc.tfm)
- return next_driver->read_page(index, TOI_PAGE, buffer_page,
- buf_size);
-
- /*
- * All our reads must be synchronous - we can't handle
- * data that hasn't been read yet.
- */
-
- ret = next_driver->read_page(index, buf_type, buffer_page, &len);
-
- if (len == PRUNE_DATA_IS_PFN) {
- buffer_start = kmap(buffer_page);
- }
-
- return ret;
-}
-
-/*
- * toi_prune_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Print information to be recorded for debugging purposes into a buffer.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_prune_print_debug_stats(char *buffer, int size)
-{
- int len;
-
- /* Output the number of pages pruned. */
- if (*toi_prune_hash_algo_name)
- len = scnprintf(buffer, size, "- Prune hash algorithm is '%s'.\n",
- toi_prune_hash_algo_name);
- else
- len = scnprintf(buffer, size, "- Prune hash algorithm is not set.\n");
-
- if (toi_pruned_pages)
- len += scnprintf(buffer+len, size - len, " Pruned "
- "%lu pages.\n",
- toi_pruned_pages);
- return len;
-}
-
-/*
- * toi_prune_memory_needed
- *
- * Tell the caller how much memory we need to operate during hibernate/resume.
- * Returns: Maximum number of bytes of memory required for
- * operation.
- */
-static int toi_prune_memory_needed(void)
-{
- return 2 * PAGE_SIZE;
-}
-
-static int toi_prune_storage_needed(void)
-{
- return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
- strlen(toi_prune_hash_algo_name) + 1;
-}
-
-/*
- * toi_prune_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save information needed when reloading the image at resume time.
- * Returns: Number of bytes used for saving our data.
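The pruning idea, per the file header: hash every page and, when a digest has been seen before, write a reference to the earlier page instead of repeating the data. A toy userspace version of the bookkeeping, using a linear table for clarity (a real implementation needs a proper hash table and must compare page contents on digest collisions):

#include <stdio.h>

#define MAX_PAGES 128

static unsigned long digests[MAX_PAGES];
static int n_seen;

/* Returns the index of an earlier page with the same digest, or -1. */
static int lookup_or_insert(unsigned long digest)
{
	int i;

	for (i = 0; i < n_seen; i++)
		if (digests[i] == digest)
			return i;       /* duplicate: emit a reference */
	digests[n_seen++] = digest;
	return -1;                      /* first sighting: emit the data */
}

int main(void)
{
	unsigned long pages[] = { 7, 9, 7, 7, 3 };
	int i, pruned = 0;

	for (i = 0; i < 5; i++)
		if (lookup_or_insert(pages[i]) >= 0)
			pruned++;
	printf("pruned %d of 5 pages\n", pruned);   /* pruned 2 of 5 */
	return 0;
}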
- */ -static int toi_prune_save_config_info(char *buffer) -{ - int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0; - - *((unsigned long *) buffer) = toi_pruned_pages; - offset += sizeof(unsigned long); - *((int *) (buffer + offset)) = len; - offset += sizeof(int); - strncpy(buffer + offset, toi_prune_hash_algo_name, len); - return offset + len; -} - -/* toi_prune_load_config_info - * @buffer: Pointer to the start of the data. - * @size: Number of bytes that were saved. - * - * Description: Reload information needed for passing back to the - * resumed kernel. - */ -static void toi_prune_load_config_info(char *buffer, int size) -{ - int len, offset = 0; - - toi_pruned_pages = *((unsigned long *) buffer); - offset += sizeof(unsigned long); - len = *((int *) (buffer + offset)); - offset += sizeof(int); - strncpy(toi_prune_hash_algo_name, buffer + offset, len); -} - -static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - bkd->pruned_pages = toi_pruned_pages; -} - -static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_pruned_pages = bkd->pruned_pages; -} - -/* - * toi_expected_ratio - * - * Description: Returns the expected ratio between data passed into this module - * and the amount of data output when writing. - * Returns: 100 - we have no idea how many pages will be pruned. - */ - -static int toi_prune_expected_ratio(void) -{ - return 100; -} - -/* - * data for our sysfs entries. - */ -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0, - NULL), - SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL), -}; - -/* - * Ops structure. - */ -static struct toi_module_ops toi_prune_ops = { - .type = FILTER_MODULE, - .name = "prune", - .directory = "prune", - .module = THIS_MODULE, - .initialise = toi_prune_init, - .memory_needed = toi_prune_memory_needed, - .print_debug_info = toi_prune_print_debug_stats, - .save_config_info = toi_prune_save_config_info, - .load_config_info = toi_prune_load_config_info, - .storage_needed = toi_prune_storage_needed, - .expected_compression = toi_prune_expected_ratio, - - .pre_atomic_restore = toi_prune_pre_atomic_restore, - .post_atomic_restore = toi_prune_post_atomic_restore, - - .rw_init = toi_prune_rw_init, - .rw_cleanup = toi_prune_rw_cleanup, - - .write_page = toi_prune_write_page, - .read_page = toi_prune_read_page, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ - -static __init int toi_prune_load(void) -{ - return toi_register_module(&toi_prune_ops); -} - -late_initcall(toi_prune_load); diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c deleted file mode 100644 index d8539c275..000000000 --- a/kernel/power/tuxonice_storage.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * kernel/power/tuxonice_storage.c - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for talking to a userspace program that manages storage. 
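toi_register_module(), used by the late_initcall above and by every other TuxOnIce module in this diff, follows the usual kernel pattern: a statically initialised ops structure full of callbacks is handed to a registry, and the framework later walks the registry instead of knowing modules by name. A reduced userspace sketch of the pattern (all names illustrative):

#include <stdio.h>

struct module_ops {
	const char *name;
	int (*initialise)(int starting);
	int (*memory_needed)(void);
};

#define MAX_MODULES 8
static struct module_ops *registry[MAX_MODULES];
static int n_modules;

static int register_module(struct module_ops *ops)
{
	if (n_modules >= MAX_MODULES)
		return -1;
	registry[n_modules++] = ops;
	return 0;
}

static int demo_init(int starting) { return 0; }
static int demo_mem(void)          { return 2 * 4096; }

static struct module_ops demo_ops = {
	.name          = "demo",
	.initialise    = demo_init,
	.memory_needed = demo_mem,
};

int main(void)
{
	register_module(&demo_ops);
	printf("%s needs %d bytes\n", registry[0]->name,
	       registry[0]->memory_needed());
	return 0;
}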
- * - * The kernel side: - * - starts the userspace program; - * - sends messages telling it when to open and close the connection; - * - tells it when to quit; - * - * The user space side: - * - passes messages regarding status; - * - */ - -#include <linux/suspend.h> -#include <linux/freezer.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_netlink.h" -#include "tuxonice_storage.h" -#include "tuxonice_ui.h" - -static struct user_helper_data usm_helper_data; -static struct toi_module_ops usm_ops; -static int message_received, usm_prepare_count; -static int storage_manager_last_action, storage_manager_action; - -static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USM_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) - return -EBUSY; - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USM_MSG_SUCCESS: - case USM_MSG_FAILED: - message_received = type; - complete(&usm_helper_data.wait_for_process); - break; - default: - printk(KERN_INFO "Storage manager doesn't recognise " - "message %d.\n", type); - } - - return 1; -} - -#ifdef CONFIG_NET -static int activations; - -int toi_activate_storage(int force) -{ - int tries = 1; - - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations++; - - if (activations > 1 && !force) - return 0; - - while ((!message_received || message_received == USM_MSG_FAILED) && - tries < 2) { - toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt " - "%d.\n", tries); - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_CONNECT, - NULL, 0); - - /* Wait 2 seconds for the userspace process to make contact */ - wait_for_completion_timeout(&usm_helper_data.wait_for_process, - 2*HZ); - - tries++; - } - - return 0; -} - -int toi_deactivate_storage(int force) -{ - if (usm_helper_data.pid == -1 || !usm_ops.enabled) - return 0; - - message_received = 0; - activations--; - - if (activations && !force) - return 0; - - init_completion(&usm_helper_data.wait_for_process); - - toi_send_netlink_message(&usm_helper_data, - USM_MSG_DISCONNECT, - NULL, 0); - - wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); - - if (!message_received || message_received == USM_MSG_FAILED) { - printk(KERN_INFO "Returning failure disconnecting storage.\n"); - return 1; - } - - return 0; -} -#endif - -static void storage_manager_simulate(void) -{ - printk(KERN_INFO "--- Storage manager simulate ---\n"); - toi_prepare_usm(); - schedule(); - printk(KERN_INFO "--- Activate storage 1 ---\n"); - toi_activate_storage(1); - schedule(); - printk(KERN_INFO "--- Deactivate storage 1 ---\n"); - toi_deactivate_storage(1); - schedule(); - printk(KERN_INFO "--- Cleanup usm ---\n"); - toi_cleanup_usm(); - schedule(); - printk(KERN_INFO "--- Storage manager simulate ends ---\n"); -} - -static int usm_storage_needed(void) -{ - return sizeof(int) + strlen(usm_helper_data.program) + 1; -} - -static int usm_save_config_info(char *buf) -{ - int len = strlen(usm_helper_data.program); - memcpy(buf, usm_helper_data.program, len + 1); - 
return sizeof(int) + len + 1; -} - -static void usm_load_config_info(char *buf, int size) -{ - /* Don't load the saved path if one has already been set */ - if (usm_helper_data.program[0]) - return; - - memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf)); -} - -static int usm_memory_needed(void) -{ - /* ball park figure of 32 pages */ - return 32 * PAGE_SIZE; -} - -/* toi_prepare_usm - */ -int toi_prepare_usm(void) -{ - usm_prepare_count++; - - if (usm_prepare_count > 1 || !usm_ops.enabled) - return 0; - - usm_helper_data.pid = -1; - - if (!*usm_helper_data.program) - return 0; - - toi_netlink_setup(&usm_helper_data); - - if (usm_helper_data.pid == -1) - printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't" - " start it.\n"); - - toi_activate_storage(0); - - return usm_helper_data.pid != -1; -} - -void toi_cleanup_usm(void) -{ - usm_prepare_count--; - - if (usm_helper_data.pid > -1 && !usm_prepare_count) { - toi_deactivate_storage(0); - toi_netlink_close(&usm_helper_data); - } -} - -static void storage_manager_activate(void) -{ - if (storage_manager_action == storage_manager_last_action) - return; - - if (storage_manager_action) - toi_prepare_usm(); - else - toi_cleanup_usm(); - - storage_manager_last_action = storage_manager_action; -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate), - SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0, - NULL), - SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1, - 0, storage_manager_activate) -}; - -static struct toi_module_ops usm_ops = { - .type = MISC_MODULE, - .name = "usm", - .directory = "storage_manager", - .module = THIS_MODULE, - .storage_needed = usm_storage_needed, - .save_config_info = usm_save_config_info, - .load_config_info = usm_load_config_info, - .memory_needed = usm_memory_needed, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* toi_usm_sysfs_init - * Description: Boot time initialisation for user interface. - */ -int toi_usm_init(void) -{ - usm_helper_data.nl = NULL; - usm_helper_data.program[0] = '\0'; - usm_helper_data.pid = -1; - usm_helper_data.skb_size = 0; - usm_helper_data.pool_limit = 6; - usm_helper_data.netlink_id = NETLINK_TOI_USM; - usm_helper_data.name = "userspace storage manager"; - usm_helper_data.rcv_msg = usm_user_rcv_msg; - usm_helper_data.interface_version = 2; - usm_helper_data.must_init = 0; - init_completion(&usm_helper_data.wait_for_process); - - return toi_register_module(&usm_ops); -} - -void toi_usm_exit(void) -{ - toi_netlink_close_complete(&usm_helper_data); - toi_unregister_module(&usm_ops); -} diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h deleted file mode 100644 index 0189c8888..000000000 --- a/kernel/power/tuxonice_storage.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * kernel/power/tuxonice_storage.h - * - * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. 
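usm_save_config_info()/usm_load_config_info() above exchange a length-prefixed string record: an int byte count followed by the NUL-terminated helper path. (Note the saver above reserves room for the int but never actually stores it, while the loader reads it back.) A self-consistent encoder/decoder pair for that record layout, in plain C with an illustrative path:

#include <stdio.h>
#include <string.h>

static int save_string(char *buf, const char *str)
{
	int len = strlen(str) + 1;              /* include the NUL */

	memcpy(buf, &len, sizeof(int));         /* length prefix */
	memcpy(buf + sizeof(int), str, len);    /* payload */
	return sizeof(int) + len;
}

static void load_string(const char *buf, char *out, int out_size)
{
	int len;

	memcpy(&len, buf, sizeof(int));
	if (len > out_size)
		len = out_size;                 /* never overrun the target */
	memcpy(out, buf + sizeof(int), len);
	out[out_size - 1] = '\0';               /* force termination */
}

int main(void)
{
	char buf[256], out[64];

	save_string(buf, "/usr/sbin/toi-storage-manager");
	load_string(buf, out, sizeof(out));
	printf("%s\n", out);
	return 0;
}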
- */ - -#ifdef CONFIG_NET -int toi_prepare_usm(void); -void toi_cleanup_usm(void); - -int toi_activate_storage(int force); -int toi_deactivate_storage(int force); -extern int toi_usm_init(void); -extern void toi_usm_exit(void); -#else -static inline int toi_usm_init(void) { return 0; } -static inline void toi_usm_exit(void) { } - -static inline int toi_activate_storage(int force) -{ - return 0; -} - -static inline int toi_deactivate_storage(int force) -{ - return 0; -} - -static inline int toi_prepare_usm(void) { return 0; } -static inline void toi_cleanup_usm(void) { } -#endif - -enum { - USM_MSG_BASE = 0x10, - - /* Kernel -> Userspace */ - USM_MSG_CONNECT = 0x30, - USM_MSG_DISCONNECT = 0x31, - USM_MSG_SUCCESS = 0x40, - USM_MSG_FAILED = 0x41, - - USM_MSG_MAX, -}; diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c deleted file mode 100644 index 9f555c932..000000000 --- a/kernel/power/tuxonice_swap.c +++ /dev/null @@ -1,474 +0,0 @@ -/* - * kernel/power/tuxonice_swap.c - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * Distributed under GPLv2. - * - * This file encapsulates functions for usage of swap space as a - * backing store. - */ - -#include <linux/suspend.h> -#include <linux/blkdev.h> -#include <linux/swapops.h> -#include <linux/swap.h> -#include <linux/syscalls.h> -#include <linux/fs_uuid.h> - -#include "tuxonice.h" -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice_io.h" -#include "tuxonice_ui.h" -#include "tuxonice_extent.h" -#include "tuxonice_bio.h" -#include "tuxonice_alloc.h" -#include "tuxonice_builtin.h" - -static struct toi_module_ops toi_swapops; - -/* For swapfile automatically swapon/off'd. */ -static char swapfilename[255] = ""; -static int toi_swapon_status; - -/* Swap Pages */ -static unsigned long swap_allocated; - -static struct sysinfo swapinfo; - -static int is_ram_backed(struct swap_info_struct *si) -{ - if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) || - !strncmp(si->bdev->bd_disk->disk_name, "zram", 4)) - return 1; - - return 0; -} - -/** - * enable_swapfile: Swapon the user specified swapfile prior to hibernating. - * - * Activate the given swapfile if it wasn't already enabled. Remember whether - * we really did swapon it for swapoffing later. - */ -static void enable_swapfile(void) -{ - int activateswapresult = -EINVAL; - - if (swapfilename[0]) { - /* Attempt to swap on with maximum priority */ - activateswapresult = sys_swapon(swapfilename, 0xFFFF); - if (activateswapresult && activateswapresult != -EBUSY) - printk(KERN_ERR "TuxOnIce: The swapfile/partition " - "specified by /sys/power/tuxonice/swap/swapfile" - " (%s) could not be turned on (error %d). " - "Attempting to continue.\n", - swapfilename, activateswapresult); - if (!activateswapresult) - toi_swapon_status = 1; - } -} - -/** - * disable_swapfile: Swapoff any file swaponed at the start of the cycle. - * - * If we did successfully swapon a file at the start of the cycle, swapoff - * it now (finishing up). 
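tuxonice_storage.h above shows the standard config-stub idiom: when CONFIG_NET is disabled, every entry point collapses to a static inline no-op with the same signature, so call sites need no #ifdef of their own. Reduced to a minimal example:

#include <stdio.h>

#define HAVE_FEATURE 0   /* with 1, a real definition would be linked in */

#if HAVE_FEATURE
int feature_activate(int force);
#else
static inline int feature_activate(int force) { return 0; }
#endif

int main(void)
{
	/* the call site is identical whether the feature exists or not */
	return feature_activate(1);
}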
- */ -static void disable_swapfile(void) -{ - if (!toi_swapon_status) - return; - - sys_swapoff(swapfilename); - toi_swapon_status = 0; -} - -static int add_blocks_to_extent_chain(struct toi_bdev_info *chain, - unsigned long start, unsigned long end) -{ - if (test_action_state(TOI_TEST_BIO)) - toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to " - "chain %p.", start << chain->bmap_shift, - end << chain->bmap_shift, chain); - - return toi_add_to_extent_chain(&chain->blocks, start, end); -} - - -static int get_main_pool_phys_params(struct toi_bdev_info *chain) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long address, extent_min = 0, extent_max = 0; - int empty = 1; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for " - "chain %d.", chain->allocator_index); - - if (!chain->allocations.first) - return 0; - - if (chain->blocks.first) - toi_put_extent_chain(&chain->blocks); - - toi_extent_for_each(&chain->allocations, extentpointer, address) { - swp_entry_t swap_address = (swp_entry_t) { address }; - struct block_device *bdev; - sector_t new_sector = map_swap_entry(swap_address, &bdev); - - if (empty) { - empty = 0; - extent_min = extent_max = new_sector; - continue; - } - - if (new_sector == extent_max + 1) { - extent_max++; - continue; - } - - if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block " - "chains.\n"); - return -ENOMEM; - } - - extent_min = new_sector; - extent_max = new_sector; - } - - if (!empty && - add_blocks_to_extent_chain(chain, extent_min, extent_max)) { - printk(KERN_ERR "Out of memory while making block chains.\n"); - return -ENOMEM; - } - - return 0; -} - -/* - * Like si_swapinfo, except that we don't include ram backed swap (compcache!) - * and don't need to use the spinlocks (userspace is stopped when this - * function is called). - */ -void si_swapinfo_no_compcache(void) -{ - unsigned int i; - - si_swapinfo(&swapinfo); - swapinfo.freeswap = 0; - swapinfo.totalswap = 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) { - swapinfo.totalswap += si->inuse_pages; - swapinfo.freeswap += si->pages - si->inuse_pages; - } - } -} -/* - * We can't just remember the value from allocation time, because other - * processes might have allocated swap in the mean time. 
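get_main_pool_phys_params() above coalesces the per-page sector numbers returned by map_swap_entry() into [min,max] extents, starting a new extent whenever the next sector is not exactly previous+1. The run-merging loop, extracted into a standalone program:

#include <stdio.h>

int main(void)
{
	unsigned long sectors[] = { 8, 9, 10, 20, 21, 40 };
	int i, n = sizeof(sectors) / sizeof(sectors[0]);
	unsigned long extent_min = 0, extent_max = 0;
	int empty = 1;

	for (i = 0; i < n; i++) {
		unsigned long s = sectors[i];

		if (empty) {
			empty = 0;
			extent_min = extent_max = s;
			continue;
		}
		if (s == extent_max + 1) {      /* extends the current run */
			extent_max++;
			continue;
		}
		/* run broken: flush it and start a new one */
		printf("extent %lu-%lu\n", extent_min, extent_max);
		extent_min = extent_max = s;
	}
	if (!empty)                             /* flush the final run */
		printf("extent %lu-%lu\n", extent_min, extent_max);

	/* output: 8-10, 20-21, 40-40 */
	return 0;
}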
- */ -static unsigned long toi_swap_storage_available(void) -{ - toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available."); - si_swapinfo_no_compcache(); - return swapinfo.freeswap + swap_allocated; -} - -static int toi_swap_initialise(int starting_cycle) -{ - if (!starting_cycle) - return 0; - - enable_swapfile(); - return 0; -} - -static void toi_swap_cleanup(int ending_cycle) -{ - if (!ending_cycle) - return; - - disable_swapfile(); -} - -static void toi_swap_free_storage(struct toi_bdev_info *chain) -{ - /* Free swap entries */ - struct hibernate_extent *extentpointer; - unsigned long extentvalue; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.", - chain); - - swap_allocated -= chain->allocations.size; - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) - swap_free((swp_entry_t) { extentvalue }); - - toi_put_extent_chain(&chain->allocations); -} - -static void free_swap_range(unsigned long min, unsigned long max) -{ - int j; - - for (j = min; j <= max; j++) - swap_free((swp_entry_t) { j }); - swap_allocated -= (max - min + 1); -} - -/* - * Allocation of a single swap type. Swap priorities are handled at the higher - * level. - */ -static int toi_swap_allocate_storage(struct toi_bdev_info *chain, - unsigned long request) -{ - unsigned long gotten = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to" - " allocate %lu pages from device %d.", request, - chain->allocator_index); - - while (gotten < request) { - swp_entry_t start, end; - if (0) { - /* Broken at the moment for SSDs */ - get_swap_range_of_type(chain->allocator_index, &start, &end, - request - gotten + 1); - } else { - start = end = get_swap_page_of_type(chain->allocator_index); - } - if (start.val) { - int added = end.val - start.val + 1; - if (toi_add_to_extent_chain(&chain->allocations, - start.val, end.val)) { - printk(KERN_INFO "Failed to allocate extent for " - "%lu-%lu.\n", start.val, end.val); - free_swap_range(start.val, end.val); - break; - } - gotten += added; - swap_allocated += added; - } else - break; - } - - toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten); - return gotten; -} - -static int toi_swap_register_storage(void) -{ - int i, result = 0; - - toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage."); - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - struct toi_bdev_info *devinfo; - unsigned char *p; - unsigned char buf[256]; - struct fs_info *fs_info; - - if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si)) - continue; - - devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), - GFP_ATOMIC); - if (!devinfo) { - printk("Failed to allocate devinfo struct for swap " - "device %d.\n", i); - return -ENOMEM; - } - - devinfo->bdev = si->bdev; - devinfo->allocator = &toi_swapops; - devinfo->allocator_index = i; - - fs_info = fs_info_from_block_dev(si->bdev); - if (fs_info && !IS_ERR(fs_info)) { - memcpy(devinfo->uuid, &fs_info->uuid, 16); - free_fs_info(fs_info); - } else - result = (int) PTR_ERR(fs_info); - - if (!fs_info) - printk("fs_info from block dev returned %d.\n", result); - devinfo->dev_t = si->bdev->bd_dev; - devinfo->prio = si->prio; - devinfo->bmap_shift = 3; - devinfo->blocks_per_page = 1; - - p = d_path(&si->swap_file->f_path, buf, sizeof(buf)); - sprintf(devinfo->name, "swap on %s", p); - - toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:" - " Device %d (%lx), prio %d.", i, - (unsigned long) devinfo->dev_t, devinfo->prio); 
- toi_bio_ops.register_storage(devinfo); - } - - return 0; -} - -static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used) -{ - struct hibernate_extent *extentpointer = NULL; - unsigned long extentvalue; - unsigned long i = 0, first_freed = 0; - - toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) { - i++; - if (i > used) { - swap_free((swp_entry_t) { extentvalue }); - if (!first_freed) - first_freed = extentvalue; - } - } - - return first_freed; -} - -/* - * workspace_size - * - * Description: - * Returns the number of bytes of RAM needed for this - * code to do its work. (Used when calculating whether - * we have enough memory to be able to hibernate & resume). - * - */ -static int toi_swap_memory_needed(void) -{ - return 1; -} - -/* - * Print debug info - * - * Description: - */ -static int toi_swap_print_debug_stats(char *buffer, int size) -{ - int len = 0; - - len = scnprintf(buffer, size, "- Swap Allocator enabled.\n"); - if (swapfilename[0]) - len += scnprintf(buffer+len, size-len, - " Attempting to automatically swapon: %s.\n", - swapfilename); - - si_swapinfo_no_compcache(); - - len += scnprintf(buffer+len, size-len, - " Swap available for image: %lu pages.\n", - swapinfo.freeswap + swap_allocated); - - return len; -} - -static int header_locations_read_sysfs(const char *page, int count) -{ - int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; - struct inode *swapf = NULL; - int zone; - char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL); - char *path, *output = (char *) page; - int path_len; - - if (!page) - return 0; - - for (i = 0; i < MAX_SWAPFILES; i++) { - struct swap_info_struct *si = get_swap_info_struct(i); - - if (!si || !(si->flags & SWP_WRITEOK)) - continue; - - if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) { - haveswap = 1; - if (!printedpartitionsmessage) { - len += sprintf(output + len, - "For swap partitions, simply use the " - "format: resume=swap:/dev/hda1.\n"); - printedpartitionsmessage = 1; - } - } else { - path_len = 0; - - path = d_path(&si->swap_file->f_path, path_page, - PAGE_SIZE); - path_len = snprintf(path_page, PAGE_SIZE, "%s", path); - - haveswap = 1; - swapf = si->swap_file->f_mapping->host; - zone = bmap(swapf, 0); - if (!zone) { - len += sprintf(output + len, - "Swapfile %s has been corrupted. 
Reuse" - " mkswap on it and try again.\n", - path_page); - } else { - char name_buffer[BDEVNAME_SIZE]; - len += sprintf(output + len, - "For swapfile `%s`," - " use resume=swap:/dev/%s:0x%x.\n", - path_page, - bdevname(si->bdev, name_buffer), - zone << (swapf->i_blkbits - 9)); - } - } - } - - if (!haveswap) - len = sprintf(output, "You need to turn on swap partitions " - "before examining this file.\n"); - - toi_free_page(10, (unsigned long) path_page); - return len; -} - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL), - SYSFS_CUSTOM("headerlocations", SYSFS_READONLY, - header_locations_read_sysfs, NULL, 0, NULL), - SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0, - attempt_to_parse_resume_device2), -}; - -static struct toi_bio_allocator_ops toi_bio_swapops = { - .register_storage = toi_swap_register_storage, - .storage_available = toi_swap_storage_available, - .allocate_storage = toi_swap_allocate_storage, - .bmap = get_main_pool_phys_params, - .free_storage = toi_swap_free_storage, - .free_unused_storage = toi_swap_free_unused_storage, -}; - -static struct toi_module_ops toi_swapops = { - .type = BIO_ALLOCATOR_MODULE, - .name = "swap storage", - .directory = "swap", - .module = THIS_MODULE, - .memory_needed = toi_swap_memory_needed, - .print_debug_info = toi_swap_print_debug_stats, - .initialise = toi_swap_initialise, - .cleanup = toi_swap_cleanup, - .bio_allocator_ops = &toi_bio_swapops, - - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -/* ---- Registration ---- */ -static __init int toi_swap_load(void) -{ - return toi_register_module(&toi_swapops); -} - -late_initcall(toi_swap_load); diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c deleted file mode 100644 index 77f36dbeb..000000000 --- a/kernel/power/tuxonice_sysfs.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.c - * - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * This file contains support for sysfs entries for tuning TuxOnIce. - * - * We have a generic handler that deals with the most common cases, and - * hooks for special handlers to use. - */ - -#include <linux/suspend.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice.h" -#include "tuxonice_storage.h" -#include "tuxonice_alloc.h" - -static int toi_sysfs_initialised; - -static void toi_initialise_sysfs(void); - -static struct toi_sysfs_data sysfs_params[]; - -#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr) - -static void toi_main_wrapper(void) -{ - toi_try_hibernate(); -} - -static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - int len = 0; - int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ; - - if (full_prep && toi_start_anything(0)) - return -EBUSY; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - len = (sysfs_data->data.special.read_sysfs) ? 
- (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE) - : 0; - break; - case TOI_SYSFS_DATA_BIT: - len = sprintf(page, "%d\n", - -test_bit(sysfs_data->data.bit.bit, - sysfs_data->data.bit.bit_vector)); - break; - case TOI_SYSFS_DATA_INTEGER: - len = sprintf(page, "%d\n", - *(sysfs_data->data.integer.variable)); - break; - case TOI_SYSFS_DATA_LONG: - len = sprintf(page, "%ld\n", - *(sysfs_data->data.a_long.variable)); - break; - case TOI_SYSFS_DATA_UL: - len = sprintf(page, "%lu\n", - *(sysfs_data->data.ul.variable)); - break; - case TOI_SYSFS_DATA_STRING: - len = sprintf(page, "%s\n", - sysfs_data->data.string.variable); - break; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) - toi_cleanup_usm(); - - if (full_prep) - toi_finish_anything(0); - - return len; -} - -#define BOUND(_variable, _type) do { \ - if (*_variable < sysfs_data->data._type.minimum) \ - *_variable = sysfs_data->data._type.minimum; \ - else if (*_variable > sysfs_data->data._type.maximum) \ - *_variable = sysfs_data->data._type.maximum; \ -} while (0) - -static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr, - const char *my_buf, size_t count) -{ - int assigned_temp_buffer = 0, result = count; - struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); - - if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME))) - return -EBUSY; - - ((char *) my_buf)[count] = 0; - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_prepare_usm(); - - switch (sysfs_data->type) { - case TOI_SYSFS_DATA_CUSTOM: - if (sysfs_data->data.special.write_sysfs) - result = (sysfs_data->data.special.write_sysfs)(my_buf, - count); - break; - case TOI_SYSFS_DATA_BIT: - { - unsigned long value; - result = kstrtoul(my_buf, 0, &value); - if (result) - break; - if (value) - set_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - else - clear_bit(sysfs_data->data.bit.bit, - (sysfs_data->data.bit.bit_vector)); - } - break; - case TOI_SYSFS_DATA_INTEGER: - { - long temp; - result = kstrtol(my_buf, 0, &temp); - if (result) - break; - *(sysfs_data->data.integer.variable) = (int) temp; - BOUND(sysfs_data->data.integer.variable, integer); - break; - } - case TOI_SYSFS_DATA_LONG: - { - long *variable = - sysfs_data->data.a_long.variable; - result = kstrtol(my_buf, 0, variable); - if (result) - break; - BOUND(variable, a_long); - break; - } - case TOI_SYSFS_DATA_UL: - { - unsigned long *variable = - sysfs_data->data.ul.variable; - result = kstrtoul(my_buf, 0, variable); - if (result) - break; - BOUND(variable, ul); - break; - } - break; - case TOI_SYSFS_DATA_STRING: - { - int copy_len = count; - char *variable = - sysfs_data->data.string.variable; - - if (sysfs_data->data.string.max_length && - (copy_len > sysfs_data->data.string.max_length)) - copy_len = sysfs_data->data.string.max_length; - - if (!variable) { - variable = (char *) toi_get_zeroed_page(31, - TOI_ATOMIC_GFP); - sysfs_data->data.string.variable = variable; - assigned_temp_buffer = 1; - } - strncpy(variable, my_buf, copy_len); - if (copy_len && my_buf[copy_len - 1] == '\n') - variable[count - 1] = 0; - variable[count] = 0; - } - break; - } - - if (!result) - result = count; - - /* Side effect routine? 
*/ - if (result == count && sysfs_data->write_side_effect) - sysfs_data->write_side_effect(); - - /* Free temporary buffers */ - if (assigned_temp_buffer) { - toi_free_page(31, - (unsigned long) sysfs_data->data.string.variable); - sysfs_data->data.string.variable = NULL; - } - - if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) - toi_cleanup_usm(); - - toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME); - - return result; -} - -static struct sysfs_ops toi_sysfs_ops = { - .show = &toi_attr_show, - .store = &toi_attr_store, -}; - -static struct kobj_type toi_ktype = { - .sysfs_ops = &toi_sysfs_ops, -}; - -struct kobject *tuxonice_kobj; - -/* Non-module sysfs entries. - * - * This array contains entries that are automatically registered at - * boot. Modules and the console code register their own entries separately. - */ - -static struct toi_sysfs_data sysfs_params[] = { - SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_HIBERNATING, toi_main_wrapper), - SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL, - SYSFS_RESUMING, toi_try_resume) -}; - -void remove_toi_sysdir(struct kobject *kobj) -{ - if (!kobj) - return; - - kobject_put(kobj); -} - -struct kobject *make_toi_sysdir(char *name) -{ - struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj); - - if (!kobj) { - printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs " - "dir!\n"); - return NULL; - } - - kobj->ktype = &toi_ktype; - - return kobj; -} - -/* toi_register_sysfs_file - * - * Helper for registering a new /sysfs/tuxonice entry. - */ - -int toi_register_sysfs_file( - struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - int result; - - if (!toi_sysfs_initialised) - toi_initialise_sysfs(); - - result = sysfs_create_file(kobj, &toi_sysfs_data->attr); - if (result) - printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s " - "returned %d.\n", - toi_sysfs_data->attr.name, result); - kobj->ktype = &toi_ktype; - - return result; -} - -/* toi_unregister_sysfs_file - * - * Helper for removing unwanted /sys/power/tuxonice entries. - * - */ -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data) -{ - sysfs_remove_file(kobj, &toi_sysfs_data->attr); -} - -void toi_cleanup_sysfs(void) -{ - int i, - numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (!toi_sysfs_initialised) - return; - - for (i = 0; i < numfiles; i++) - toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); - - kobject_put(tuxonice_kobj); - toi_sysfs_initialised = 0; -} - -/* toi_initialise_sysfs - * - * Initialise the /sysfs/tuxonice directory. 
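For readers following the kobject side of this: creating the directory and populating it uses the stock kobject/sysfs pattern. A minimal hypothetical module in the same shape (modelled on samples/kobject/ in the kernel tree; kernel_kobj stands in for the power_kobj parent used here):

#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct kobject *demo_kobj;

static ssize_t enabled_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        return sprintf(buf, "1\n");
}

static struct kobj_attribute enabled_attr = __ATTR_RO(enabled);

static int __init demo_init(void)
{
        /* child directory, as kobject_create_and_add("tuxonice", power_kobj)
         * does in the function below */
        demo_kobj = kobject_create_and_add("demo", kernel_kobj);
        if (!demo_kobj)
                return -ENOMEM;
        return sysfs_create_file(demo_kobj, &enabled_attr.attr);
}

static void __exit demo_exit(void)
{
        kobject_put(demo_kobj);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");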
- */ - -static void toi_initialise_sysfs(void) -{ - int i; - int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); - - if (toi_sysfs_initialised) - return; - - /* Make our TuxOnIce directory a child of /sys/power */ - tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj); - if (!tuxonice_kobj) - return; - - toi_sysfs_initialised = 1; - - for (i = 0; i < numfiles; i++) - toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); -} - -int toi_sysfs_init(void) -{ - toi_initialise_sysfs(); - return 0; -} - -void toi_sysfs_exit(void) -{ - toi_cleanup_sysfs(); -} diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h deleted file mode 100644 index 1de954ce1..000000000 --- a/kernel/power/tuxonice_sysfs.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * kernel/power/tuxonice_sysfs.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - */ - -#include <linux/sysfs.h> - -struct toi_sysfs_data { - struct attribute attr; - int type; - int flags; - union { - struct { - unsigned long *bit_vector; - int bit; - } bit; - struct { - int *variable; - int minimum; - int maximum; - } integer; - struct { - long *variable; - long minimum; - long maximum; - } a_long; - struct { - unsigned long *variable; - unsigned long minimum; - unsigned long maximum; - } ul; - struct { - char *variable; - int max_length; - } string; - struct { - int (*read_sysfs) (const char *buffer, int count); - int (*write_sysfs) (const char *buffer, int count); - void *data; - } special; - } data; - - /* Side effects routine. Used, eg, for reparsing the - * resume= entry when it changes */ - void (*write_side_effect) (void); - struct list_head sysfs_data_list; -}; - -enum { - TOI_SYSFS_DATA_NONE = 1, - TOI_SYSFS_DATA_CUSTOM, - TOI_SYSFS_DATA_BIT, - TOI_SYSFS_DATA_INTEGER, - TOI_SYSFS_DATA_UL, - TOI_SYSFS_DATA_LONG, - TOI_SYSFS_DATA_STRING -}; - -#define SYSFS_WRITEONLY 0200 -#define SYSFS_READONLY 0444 -#define SYSFS_RW 0644 - -#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_BIT, \ - .flags = _flags, \ - .data = { .bit = { .bit_vector = _ul, .bit = _bit } } } - -#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_INTEGER, \ - .flags = _flags, \ - .data = { .integer = { .variable = _int, .minimum = _min, \ - .maximum = _max } }, \ - .write_side_effect = _wse } - -#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_UL, \ - .flags = _flags, \ - .data = { .ul = { .variable = _ul, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_LONG, \ - .flags = _flags, \ - .data = { .a_long = { .variable = _long, .minimum = _min, \ - .maximum = _max } } } - -#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_STRING, \ - .flags = _flags, \ - .data = { .string = { .variable = _string, .max_length = _max_len } }, \ - .write_side_effect = _wse } - -#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \ - .attr = {.name = _name , .mode = _mode }, \ - .type = TOI_SYSFS_DATA_CUSTOM, \ - .flags = _flags, \ - .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \ - 
.write_side_effect = _wse } - -#define SYSFS_NONE(_name, _wse) { \ - .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \ - .type = TOI_SYSFS_DATA_NONE, \ - .write_side_effect = _wse, \ -} - -/* Flags */ -#define SYSFS_NEEDS_SM_FOR_READ 1 -#define SYSFS_NEEDS_SM_FOR_WRITE 2 -#define SYSFS_HIBERNATE 4 -#define SYSFS_RESUME 8 -#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME) -#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE) -#define SYSFS_NEEDS_SM_FOR_BOTH \ - (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE) - -int toi_register_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); -void toi_unregister_sysfs_file(struct kobject *kobj, - struct toi_sysfs_data *toi_sysfs_data); - -extern struct kobject *tuxonice_kobj; - -struct kobject *make_toi_sysdir(char *name); -void remove_toi_sysdir(struct kobject *obj); -extern void toi_cleanup_sysfs(void); - -extern int toi_sysfs_init(void); -extern void toi_sysfs_exit(void); diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c deleted file mode 100644 index 76152f3ff..000000000 --- a/kernel/power/tuxonice_ui.c +++ /dev/null @@ -1,247 +0,0 @@ -/* - * kernel/power/tuxonice_ui.c - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr> - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include <linux/reboot.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" -#include "tuxonice_builtin.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ -struct ui_ops *toi_current_ui; - -/** - * toi_wait_for_keypress - Wait for keypress via userui or /dev/console. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress, either from userui or /dev/console if userui isn't - * available. The non-userui path is particularly for at boot-time, prior - * to userui being started, when we have an important warning to give to - * the user. - */ -static char toi_wait_for_keypress(int timeout) -{ - if (toi_current_ui && toi_current_ui->wait_for_key(timeout)) - return ' '; - - return toi_wait_for_keypress_dev_console(timeout); -} - -/* toi_early_boot_message() - * Description: Handle errors early in the process of booting. - * The user may press C to continue booting, perhaps - * invalidating the image, or space to reboot. - * This works from either the serial console or normally - * attached keyboard. - * - * Note that we come in here from init, while the kernel is - * locked. If we want to get events from the serial console, - * we need to temporarily unlock the kernel. - * - * toi_early_boot_message may also be called post-boot. - * In this case, it simply printks the message and returns. - * - * Arguments: int Whether we are able to erase the image. - * int default_answer. 
What to do when we timeout. This - * will normally be continue, but the user might - * provide command line options (__setup) to override - * particular cases. - * Char *. Pointer to a string explaining why we're moaning. - */ - -#define say(message, a...) printk(KERN_EMERG message, ##a) - -void toi_early_boot_message(int message_detail, int default_answer, - char *warning_reason, ...) -{ -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - unsigned long orig_state = get_toi_state(), continue_req = 0; - unsigned long orig_loglevel = console_loglevel; - int can_ask = 1; -#else - int can_ask = 0; -#endif - - va_list args; - int printed_len; - - if (!toi_wait) { - set_toi_state(TOI_CONTINUE_REQ); - can_ask = 0; - } - - if (warning_reason) { - va_start(args, warning_reason); - printed_len = vsnprintf(local_printf_buf, - sizeof(local_printf_buf), - warning_reason, - args); - va_end(args); - } - - if (!test_toi_state(TOI_BOOT_TIME)) { - printk("TuxOnIce: %s\n", local_printf_buf); - return; - } - - if (!can_ask) { - continue_req = !!default_answer; - goto post_ask; - } - -#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) - console_loglevel = 7; - - say("=== TuxOnIce ===\n\n"); - if (warning_reason) { - say("BIG FAT WARNING!! %s\n\n", local_printf_buf); - switch (message_detail) { - case 0: - say("If you continue booting, note that any image WILL" - "NOT BE REMOVED.\nTuxOnIce is unable to do so " - "because the appropriate modules aren't\n" - "loaded. You should manually remove the image " - "to avoid any\npossibility of corrupting your " - "filesystem(s) later.\n"); - break; - case 1: - say("If you want to use the current TuxOnIce image, " - "reboot and try\nagain with the same kernel " - "that you hibernated from. If you want\n" - "to forget that image, continue and the image " - "will be erased.\n"); - break; - } - say("Press SPACE to reboot or C to continue booting with " - "this kernel\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", - toi_wait, - default_answer == TOI_CONTINUE_REQ ? - "continue booting" : "reboot"); - } else { - say("BIG FAT WARNING!!\n\n" - "You have tried to resume from this image before.\n" - "If it failed once, it may well fail again.\n" - "Would you like to remove the image and boot " - "normally?\nThis will be equivalent to entering " - "noresume on the\nkernel command line.\n\n" - "Press SPACE to remove the image or C to continue " - "resuming.\n\n"); - if (toi_wait > 0) - say("Default action if you don't select one in %d " - "seconds is: %s.\n", toi_wait, - !!default_answer ? - "continue resuming" : "remove the image"); - } - console_loglevel = orig_loglevel; - - set_toi_state(TOI_SANITY_CHECK_PROMPT); - clear_toi_state(TOI_CONTINUE_REQ); - - if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */ - continue_req = !!default_answer; - else - continue_req = test_toi_state(TOI_CONTINUE_REQ); - -#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */ - -post_ask: - if ((warning_reason) && (!continue_req)) - kernel_restart(NULL); - - restore_toi_state(orig_state); - if (continue_req) - set_toi_state(TOI_CONTINUE_REQ); -} - -#undef say - -/* - * User interface specific /sys/power/tuxonice entries. 
- */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_INT("default_console_level", SYSFS_RW, - &toi_bkd.toi_default_console_level, 0, 7, 0, NULL), - SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0, - 1 << 30, 0), - SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL, - 0) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_HIDDEN_MODULE, - .name = "printk ui", - .directory = "user_interface", - .module = THIS_MODULE, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -int toi_register_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui) { - printk(KERN_INFO "Only one TuxOnIce user interface module can " - "be loaded at a time."); - return -EBUSY; - } - - toi_current_ui = this_ui; - - return 0; -} - -void toi_remove_ui_ops(struct ui_ops *this_ui) -{ - if (toi_current_ui != this_ui) - return; - - toi_current_ui = NULL; -} - -/* toi_console_sysfs_init - * Description: Boot time initialisation for user interface. - */ - -int toi_ui_init(void) -{ - return toi_register_module(&userui_ops); -} - -void toi_ui_exit(void) -{ - toi_unregister_module(&userui_ops); -} diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h deleted file mode 100644 index 4934e3a91..000000000 --- a/kernel/power/tuxonice_ui.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * kernel/power/tuxonice_ui.h - * - * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) - */ - -enum { - DONT_CLEAR_BAR, - CLEAR_BAR -}; - -enum { - /* Userspace -> Kernel */ - USERUI_MSG_ABORT = 0x11, - USERUI_MSG_SET_STATE = 0x12, - USERUI_MSG_GET_STATE = 0x13, - USERUI_MSG_GET_DEBUG_STATE = 0x14, - USERUI_MSG_SET_DEBUG_STATE = 0x15, - USERUI_MSG_SPACE = 0x18, - USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A, - USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B, - USERUI_MSG_GET_LOGLEVEL = 0x1C, - USERUI_MSG_SET_LOGLEVEL = 0x1D, - USERUI_MSG_PRINTK = 0x1E, - - /* Kernel -> Userspace */ - USERUI_MSG_MESSAGE = 0x21, - USERUI_MSG_PROGRESS = 0x22, - USERUI_MSG_POST_ATOMIC_RESTORE = 0x25, - - USERUI_MSG_MAX, -}; - -struct userui_msg_params { - u32 a, b, c, d; - char text[255]; -}; - -struct ui_ops { - char (*wait_for_key) (int timeout); - u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...); - void (*prepare_status) (int clearbar, const char *fmt, ...); - void (*cond_pause) (int pause, char *message); - void (*abort)(int result_code, const char *fmt, ...); - void (*prepare)(void); - void (*cleanup)(void); - void (*message)(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...); -}; - -extern struct ui_ops *toi_current_ui; - -#define toi_update_status(val, max, fmt, args...) \ - (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \ - max) - -#define toi_prepare_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->prepare)(); \ - } while (0) - -#define toi_cleanup_console(void) \ - do { if (toi_current_ui) \ - (toi_current_ui->cleanup)(); \ - } while (0) - -#define abort_hibernate(result, fmt, args...) \ - do { if (toi_current_ui) \ - (toi_current_ui->abort)(result, fmt, ##args); \ - else { \ - set_abort_result(result); \ - } \ - } while (0) - -#define toi_cond_pause(pause, message) \ - do { if (toi_current_ui) \ - (toi_current_ui->cond_pause)(pause, message); \ - } while (0) - -#define toi_prepare_status(clear, fmt, args...) 
\ - do { if (toi_current_ui) \ - (toi_current_ui->prepare_status)(clear, fmt, ##args); \ - else \ - printk(KERN_INFO fmt "%s", ##args, "\n"); \ - } while (0) - -#define toi_message(sn, lev, log, fmt, a...) \ -do { \ - if (toi_current_ui && (!sn || test_debug_state(sn))) \ - toi_current_ui->message(sn, lev, log, fmt, ##a); \ -} while (0) - -__exit void toi_ui_cleanup(void); -extern int toi_ui_init(void); -extern void toi_ui_exit(void); -extern int toi_register_ui_ops(struct ui_ops *this_ui); -extern void toi_remove_ui_ops(struct ui_ops *this_ui); diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c deleted file mode 100644 index 6aa5ac3eb..000000000 --- a/kernel/power/tuxonice_userui.c +++ /dev/null @@ -1,658 +0,0 @@ -/* - * kernel/power/user_ui.c - * - * Copyright (C) 2005-2007 Bernard Blackham - * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) - * - * This file is released under the GPLv2. - * - * Routines for TuxOnIce's user interface. - * - * The user interface code talks to a userspace program via a - * netlink socket. - * - * The kernel side: - * - starts the userui program; - * - sends text messages and progress bar status; - * - * The user space side: - * - passes messages regarding user requests (abort, toggle reboot etc) - * - */ - -#define __KERNEL_SYSCALLS__ - -#include <linux/suspend.h> -#include <linux/freezer.h> -#include <linux/console.h> -#include <linux/ctype.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> -#include <linux/reboot.h> -#include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/vt.h> - -#include "tuxonice_sysfs.h" -#include "tuxonice_modules.h" -#include "tuxonice.h" -#include "tuxonice_ui.h" -#include "tuxonice_netlink.h" -#include "tuxonice_power_off.h" - -static char local_printf_buf[1024]; /* Same as printk - should be safe */ - -static struct user_helper_data ui_helper_data; -static struct toi_module_ops userui_ops; -static int orig_kmsg; - -static char lastheader[512]; -static int lastheader_message_len; -static int ui_helper_changed; /* Used at resume-time so don't overwrite value - set from initrd/ramfs. */ - -/* Number of distinct progress amounts that userspace can display */ -static int progress_granularity = 30; - -static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); -static int userui_wait_should_wake; - -#define toi_stop_waiting_for_userui_key() \ -{ \ - userui_wait_should_wake = true; \ - wake_up_interruptible(&userui_wait_for_key); \ -} - -/** - * ui_nl_set_state - Update toi_action based on a message from userui. - * - * @n: The bit (1 << bit) to set. - */ -static void ui_nl_set_state(int n) -{ - /* Only let them change certain settings */ - static const u32 toi_action_mask = - (1 << TOI_REBOOT) | (1 << TOI_PAUSE) | - (1 << TOI_LOGALL) | - (1 << TOI_SINGLESTEP) | - (1 << TOI_PAUSE_NEAR_PAGESET_END); - static unsigned long new_action; - - new_action = (toi_bkd.toi_action & (~toi_action_mask)) | - (n & toi_action_mask); - - printk(KERN_DEBUG "n is %x. Action flags being changed from %lx " - "to %lx.", n, toi_bkd.toi_action, new_action); - toi_bkd.toi_action = new_action; - - if (!test_action_state(TOI_PAUSE) && - !test_action_state(TOI_SINGLESTEP)) - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_post_atomic_restore - Tell userui that atomic restore just happened. - * - * Tell userui that atomic restore just occured, so that it can do things like - * redrawing the screen, re-getting settings and so on. 
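Stepping back to ui_nl_set_state() above: the allow-list update is a masked merge, so userspace can only flip the bits named in toi_action_mask while every other flag keeps its kernel-side value. The arithmetic as a runnable sketch with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned long current_flags = 0xf0f0;   /* kernel-side state */
        unsigned long mask          = 0x00ff;   /* user-changeable bits */
        unsigned long request       = 0xabcd;   /* what userspace sent */
        unsigned long merged = (current_flags & ~mask) | (request & mask);

        printf("%#lx\n", merged);               /* prints 0xf0cd */
        return 0;
}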
- */ -static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd) -{ - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0); -} - -/** - * userui_storage_needed - Report how much memory in image header is needed. - */ -static int userui_storage_needed(void) -{ - return sizeof(ui_helper_data.program) + 1 + sizeof(int); -} - -/** - * userui_save_config_info - Fill buffer with config info for image header. - * - * @buf: Buffer into which to put the config info we want to save. - */ -static int userui_save_config_info(char *buf) -{ - *((int *) buf) = progress_granularity; - memcpy(buf + sizeof(int), ui_helper_data.program, - sizeof(ui_helper_data.program)); - return sizeof(ui_helper_data.program) + sizeof(int) + 1; -} - -/** - * userui_load_config_info - Restore config info from buffer. - * - * @buf: Buffer containing header info loaded. - * @size: Size of data loaded for this module. - */ -static void userui_load_config_info(char *buf, int size) -{ - progress_granularity = *((int *) buf); - size -= sizeof(int); - - /* Don't load the saved path if one has already been set */ - if (ui_helper_changed) - return; - - if (size > sizeof(ui_helper_data.program)) - size = sizeof(ui_helper_data.program); - - memcpy(ui_helper_data.program, buf + sizeof(int), size); - ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; -} - -/** - * set_ui_program_set: Record that userui program was changed. - * - * Side effect routine for when the userui program is set. In an initrd or - * ramfs, the user may set a location for the userui program. If this happens, - * we don't want to reload the value that was saved in the image header. This - * routine allows us to flag that we shouldn't restore the program name from - * the image header. - */ -static void set_ui_program_set(void) -{ - ui_helper_changed = 1; -} - -/** - * userui_memory_needed - Tell core how much memory to reserve for us. - */ -static int userui_memory_needed(void) -{ - /* ball park figure of 128 pages */ - return 128 * PAGE_SIZE; -} - -/** - * userui_update_status - Update the progress bar and (if on) in-bar message. - * - * @value: Current progress percentage numerator. - * @maximum: Current progress percentage denominator. - * @fmt: Message to be displayed in the middle of the progress bar. - * - * Note that a NULL message does not mean that any previous message is erased! - * For that, you need toi_prepare_status with clearbar on. - * - * Returns an unsigned long, being the next numerator (as determined by the - * maximum and progress granularity) where status needs to be updated. - * This is to reduce unnecessary calls to update_status. - */ -static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...) -{ - static u32 last_step = 9999; - struct userui_msg_params msg; - u32 this_step, next_update; - int bitshift; - - if (ui_helper_data.pid == -1) - return 0; - - if ((!maximum) || (!progress_granularity)) - return maximum; - - if (value < 0) - value = 0; - - if (value > maximum) - value = maximum; - - /* Try to avoid math problems - we can't do 64 bit math here - * (and shouldn't need it - anyone got screen resolution - * of 65536 pixels or more?) 
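A worked example of the scaling that follows, as a standalone sketch (fls() is open-coded here because it is a kernel helper):

#include <stdio.h>

static int fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;   /* matches the kernel's fls() */
}

int main(void)
{
        unsigned int value = 0x90000, maximum = 0x120000, granularity = 30;
        int bitshift = fls(maximum) - 16;       /* fls = 21, so shift by 5 */
        unsigned int tv = value >> bitshift, tm = maximum >> bitshift;

        /* 18432 * 30 stays comfortably within 32 bits; value is half of
         * maximum, so this prints "step 15 of 30" */
        printf("step %u of %u\n", tv * granularity / tm, granularity);
        return 0;
}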
*/ - bitshift = fls(maximum) - 16; - if (bitshift > 0) { - u32 temp_maximum = maximum >> bitshift; - u32 temp_value = value >> bitshift; - this_step = (u32) - (temp_value * progress_granularity / temp_maximum); - next_update = (((this_step + 1) * temp_maximum / - progress_granularity) + 1) << bitshift; - } else { - this_step = (u32) (value * progress_granularity / maximum); - next_update = ((this_step + 1) * maximum / - progress_granularity) + 1; - } - - if (this_step == last_step) - return next_update; - - memset(&msg, 0, sizeof(msg)); - - msg.a = this_step; - msg.b = progress_granularity; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, - &msg, sizeof(msg)); - last_step = this_step; - - return next_update; -} - -/** - * userui_message - Display a message without necessarily logging it. - * - * @section: Type of message. Messages can be filtered by type. - * @level: Degree of importance of the message. Lower values = higher priority. - * @normally_logged: Whether logged even if log_everything is off. - * @fmt: Message (and parameters). - * - * This function is intended to do the same job as printk, but without normally - * logging what is printed. The point is to be able to get debugging info on - * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M" - * - * It may be called from an interrupt context - can't sleep! - */ -static void userui_message(u32 section, u32 level, u32 normally_logged, - const char *fmt, ...) -{ - struct userui_msg_params msg; - - if ((level) && (level > console_loglevel)) - return; - - memset(&msg, 0, sizeof(msg)); - - msg.a = section; - msg.b = level; - msg.c = normally_logged; - - if (fmt) { - va_list args; - va_start(args, fmt); - vsnprintf(msg.text, sizeof(msg.text), fmt, args); - va_end(args); - msg.text[sizeof(msg.text)-1] = '\0'; - } - - if (test_action_state(TOI_LOGALL)) - printk(KERN_INFO "%s\n", msg.text); - - toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, - &msg, sizeof(msg)); -} - -/** - * wait_for_key_via_userui - Wait for userui to receive a keypress. - */ -static void wait_for_key_via_userui(void) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&userui_wait_for_key, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake); - userui_wait_should_wake = false; - - set_current_state(TASK_RUNNING); - remove_wait_queue(&userui_wait_for_key, &wait); -} - -/** - * userui_prepare_status - Display high level messages. - * - * @clearbar: Whether to clear the progress bar. - * @fmt...: New message for the title. - * - * Prepare the 'nice display', drawing the header and version, along with the - * current action and perhaps also resetting the progress bar. - */ -static void userui_prepare_status(int clearbar, const char *fmt, ...) -{ - va_list args; - - if (fmt) { - va_start(args, fmt); - lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); - va_end(args); - } - - if (clearbar) - toi_update_status(0, 1, NULL); - - if (ui_helper_data.pid == -1) - printk(KERN_EMERG "%s\n", lastheader); - else - toi_message(0, TOI_STATUS, 1, lastheader, NULL); -} - -/** - * toi_wait_for_keypress - Wait for keypress via userui. - * - * @timeout: Maximum time to wait. - * - * Wait for a keypress from userui. - * - * FIXME: Implement timeout? 
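One plausible answer to that FIXME, offered purely as an untested sketch (nothing like it exists in this tree): bound the sleep with wait_event_interruptible_timeout() and report a timeout as "no key":

static char userui_wait_for_keypress_bounded(int timeout)
{
        long left = 1;

        if (ui_helper_data.pid == -1)
                return '\0';

        if (timeout > 0)
                left = wait_event_interruptible_timeout(userui_wait_for_key,
                                userui_wait_should_wake, timeout * HZ);
        else
                wait_event_interruptible(userui_wait_for_key,
                                userui_wait_should_wake);

        userui_wait_should_wake = false;
        return left > 0 ? ' ' : '\0';   /* '\0' on timeout or signal */
}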
- */ -static char userui_wait_for_keypress(int timeout) -{ - char key = '\0'; - - if (ui_helper_data.pid != -1) { - wait_for_key_via_userui(); - key = ' '; - } - - return key; -} - -/** - * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it. - * - * @result_code: Reason why we're aborting (1 << bit). - * @fmt: Message to display if telling the user what's going on. - * - * Abort a cycle. If this wasn't at the user's request (and we're displaying - * output), tell the user why and wait for them to acknowledge the message. - */ -static void userui_abort_hibernate(int result_code, const char *fmt, ...) -{ - va_list args; - int printed_len = 0; - - set_result_state(result_code); - - if (test_result_state(TOI_ABORTED)) - return; - - set_result_state(TOI_ABORTED); - - if (test_result_state(TOI_ABORT_REQUESTED)) - return; - - va_start(args, fmt); - printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf), - fmt, args); - va_end(args); - if (ui_helper_data.pid != -1) - printed_len = sprintf(local_printf_buf + printed_len, - " (Press SPACE to continue)"); - - toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf); - - if (ui_helper_data.pid != -1) - userui_wait_for_keypress(0); -} - -/** - * request_abort_hibernate - Abort hibernating or resuming at user request. - * - * Handle the user requesting the cancellation of a hibernation or resume by - * pressing escape. - */ -static void request_abort_hibernate(void) -{ - if (test_result_state(TOI_ABORT_REQUESTED) || - !test_action_state(TOI_CAN_CANCEL)) - return; - - if (test_toi_state(TOI_NOW_RESUMING)) { - toi_prepare_status(CLEAR_BAR, "Escape pressed. " - "Powering down again."); - set_toi_state(TOI_STOP_RESUME); - while (!test_toi_state(TOI_IO_STOPPED)) - schedule(); - if (toiActiveAllocator->mark_resume_attempted) - toiActiveAllocator->mark_resume_attempted(0); - toi_power_down(); - } - - toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" - " ABORTING HIBERNATION ---"); - set_abort_result(TOI_ABORT_REQUESTED); - toi_stop_waiting_for_userui_key(); -} - -/** - * userui_user_rcv_msg - Receive a netlink message from userui. - * - * @skb: skb received. - * @nlh: Netlink header received. 
- */ -static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - int type; - int *data; - - type = nlh->nlmsg_type; - - /* A control message: ignore them */ - if (type < NETLINK_MSG_BASE) - return 0; - - /* Unknown message: reply with EINVAL */ - if (type >= USERUI_MSG_MAX) - return -EINVAL; - - /* All operations require privileges, even GET */ - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - /* Only allow one task to receive NOFREEZE privileges */ - if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) { - printk(KERN_INFO "Got NOFREEZE_ME request when " - "ui_helper_data.pid is %d.\n", ui_helper_data.pid); - return -EBUSY; - } - - data = (int *) NLMSG_DATA(nlh); - - switch (type) { - case USERUI_MSG_ABORT: - request_abort_hibernate(); - return 0; - case USERUI_MSG_GET_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_STATE, &toi_bkd.toi_action, - sizeof(toi_bkd.toi_action)); - return 0; - case USERUI_MSG_GET_DEBUG_STATE: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_DEBUG_STATE, - &toi_bkd.toi_debug_state, - sizeof(toi_bkd.toi_debug_state)); - return 0; - case USERUI_MSG_SET_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - ui_nl_set_state(*data); - return 0; - case USERUI_MSG_SET_DEBUG_STATE: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_debug_state = (*data); - return 0; - case USERUI_MSG_SPACE: - toi_stop_waiting_for_userui_key(); - return 0; - case USERUI_MSG_GET_POWERDOWN_METHOD: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_POWERDOWN_METHOD, - &toi_poweroff_method, - sizeof(toi_poweroff_method)); - return 0; - case USERUI_MSG_SET_POWERDOWN_METHOD: - if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char))) - return -EINVAL; - toi_poweroff_method = (unsigned long)(*data); - return 0; - case USERUI_MSG_GET_LOGLEVEL: - toi_send_netlink_message(&ui_helper_data, - USERUI_MSG_GET_LOGLEVEL, - &toi_bkd.toi_default_console_level, - sizeof(toi_bkd.toi_default_console_level)); - return 0; - case USERUI_MSG_SET_LOGLEVEL: - if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) - return -EINVAL; - toi_bkd.toi_default_console_level = (*data); - return 0; - case USERUI_MSG_PRINTK: - printk(KERN_INFO "%s", (char *) data); - return 0; - } - - /* Unhandled here */ - return 1; -} - -/** - * userui_cond_pause - Possibly pause at user request. - * - * @pause: Whether to pause or just display the message. - * @message: Message to display at the start of pausing. - * - * Potentially pause and wait for the user to tell us to continue. We normally - * only pause when @pause is set. While paused, the user can do things like - * changing the loglevel, toggling the display of debugging sections and such - * like. - */ -static void userui_cond_pause(int pause, char *message) -{ - int displayed_message = 0, last_key = 0; - - while (last_key != 32 && - ui_helper_data.pid != -1 && - ((test_action_state(TOI_PAUSE) && pause) || - (test_action_state(TOI_SINGLESTEP)))) { - if (!displayed_message) { - toi_prepare_status(DONT_CLEAR_BAR, - "%s Press SPACE to continue.%s", - message ? message : "", - (test_action_state(TOI_SINGLESTEP)) ? - " Single step on." : ""); - displayed_message = 1; - } - last_key = userui_wait_for_keypress(0); - } - schedule(); -} - -/** - * userui_prepare_console - Prepare the console for use. - * - * Prepare a console for use, saving current kmsg settings and attempting to - * start userui. Console loglevel changes are handled by userui. 
- */ -static void userui_prepare_console(void) -{ - orig_kmsg = vt_kmsg_redirect(fg_console + 1); - - ui_helper_data.pid = -1; - - if (!userui_ops.enabled) { - printk(KERN_INFO "TuxOnIce: Userui disabled.\n"); - return; - } - - if (*ui_helper_data.program) - toi_netlink_setup(&ui_helper_data); - else - printk(KERN_INFO "TuxOnIce: Userui program not configured.\n"); -} - -/** - * userui_cleanup_console - Cleanup after a cycle. - * - * Tell userui to cleanup, and restore kmsg_redirect to its original value. - */ - -static void userui_cleanup_console(void) -{ - if (ui_helper_data.pid > -1) - toi_netlink_close(&ui_helper_data); - - vt_kmsg_redirect(orig_kmsg); -} - -/* - * User interface specific /sys/power/tuxonice entries. - */ - -static struct toi_sysfs_data sysfs_params[] = { -#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) - SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action, - TOI_CAN_CANCEL, 0), - SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action, - TOI_PAUSE, 0), - SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL), - SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, - 2048, 0, NULL), - SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0, - set_ui_program_set), - SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL) -#endif -}; - -static struct toi_module_ops userui_ops = { - .type = MISC_MODULE, - .name = "userui", - .shared_directory = "user_interface", - .module = THIS_MODULE, - .storage_needed = userui_storage_needed, - .save_config_info = userui_save_config_info, - .load_config_info = userui_load_config_info, - .memory_needed = userui_memory_needed, - .post_atomic_restore = userui_post_atomic_restore, - .sysfs_data = sysfs_params, - .num_sysfs_entries = sizeof(sysfs_params) / - sizeof(struct toi_sysfs_data), -}; - -static struct ui_ops my_ui_ops = { - .update_status = userui_update_status, - .message = userui_message, - .prepare_status = userui_prepare_status, - .abort = userui_abort_hibernate, - .cond_pause = userui_cond_pause, - .prepare = userui_prepare_console, - .cleanup = userui_cleanup_console, - .wait_for_key = userui_wait_for_keypress, -}; - -/** - * toi_user_ui_init - Boot time initialisation for user interface. - * - * Invoked from the core init routine. 
- */ -static __init int toi_user_ui_init(void) -{ - int result; - - ui_helper_data.nl = NULL; - strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255); - ui_helper_data.pid = -1; - ui_helper_data.skb_size = sizeof(struct userui_msg_params); - ui_helper_data.pool_limit = 6; - ui_helper_data.netlink_id = NETLINK_TOI_USERUI; - ui_helper_data.name = "userspace ui"; - ui_helper_data.rcv_msg = userui_user_rcv_msg; - ui_helper_data.interface_version = 8; - ui_helper_data.must_init = 0; - ui_helper_data.not_ready = userui_cleanup_console; - init_completion(&ui_helper_data.wait_for_process); - result = toi_register_module(&userui_ops); - if (!result) { - result = toi_register_ui_ops(&my_ui_ops); - if (result) - toi_unregister_module(&userui_ops); - } - - return result; -} - -late_initcall(toi_user_ui_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8362f1979..af4e6968c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -33,7 +33,6 @@ #include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/syscalls.h> -#include <linux/suspend.h> #include <linux/kexec.h> #include <linux/kdb.h> #include <linux/ratelimit.h> @@ -49,6 +48,7 @@ #include <linux/uio.h> #include <asm/uaccess.h> +#include <asm-generic/sections.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> @@ -233,7 +233,11 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ -}; +} +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +__packed __aligned(4) +#endif +; /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken @@ -274,30 +278,12 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define LOG_ALIGN 4 -#else #define LOG_ALIGN __alignof__(struct printk_log) -#endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; -#ifdef CONFIG_TOI_INCREMENTAL -void toi_set_logbuf_untracked(void) -{ - int i; - struct page *log_buf_start_page = virt_to_page(__log_buf); - - printk("Not protecting kernel printk log buffer (%p-%p).\n", - __log_buf, __log_buf + __LOG_BUF_LEN); - - for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++) - SetPageTOI_Untracked(log_buf_start_page + i); -} -#endif - /* Return log buffer address */ char *log_buf_addr_get(void) { @@ -1675,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static int recursion_bug; + static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; @@ -1711,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level, * it can be printed at the next appropriate moment: */ if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = 1; + recursion_bug = true; local_irq_restore(flags); return 0; } @@ -1726,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level, static const char recursion_msg[] = "BUG: recent printk recursion!"; - recursion_bug = 0; + recursion_bug = false; /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, NULL, 0, recursion_msg, @@ -2706,13 +2692,36 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +/* + * Some boot consoles access data that is in the init section and 
which will + * be discarded after the initcalls have been run. To make sure that no code + * will access this data, unregister the boot consoles in a late initcall. + * + * If for some reason, such as deferred probe or the driver being a loadable + * module, the real console hasn't registered yet at this point, there will + * be a brief interval in which no messages are logged to the console, which + * makes it difficult to diagnose problems that occur during this time. + * + * To mitigate this problem somewhat, only unregister consoles whose memory + * intersects with the init section. Note that code exists elsewhere to get + * rid of the boot console as soon as the proper console shows up, so there + * won't be side-effects from postponing the removal. + */ static int __init printk_late_init(void) { struct console *con; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { - unregister_console(con); + /* + * Make sure to unregister boot consoles whose data + * resides in the init section before the init section + * is discarded. Boot consoles whose data will stick + * around will automatically be unregistered when the + * proper console replaces them. + */ + if (init_section_intersects(con, sizeof(*con))) + unregister_console(con); } } hotcpu_notifier(console_cpu_notify, 0); diff --git a/kernel/profile.c b/kernel/profile.c index 99513e116..513696974 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,6 +59,7 @@ int profile_setup(char *str) if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS + force_schedstat_enabled(); prof_on = SLEEP_PROFILING; if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 3189e51db..2341efe7f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -387,8 +387,14 @@ unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: if (!retval) { - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - TASK_UNINTERRUPTIBLE); + /* + * We do not bother to change retval or clear JOBCTL_TRAPPING + * if wait_on_bit() was interrupted by SIGKILL. The tracer will + * not return to user-mode, it will exit and clear this bit in + * __ptrace_unlink() if it wasn't already cleared by the tracee; + * and until then nobody can ptrace this task. + */ + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); proc_ptrace_connector(task, PTRACE_ATTACH); } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d89328e26..d2988d047 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -162,6 +162,27 @@ static int rcu_torture_writer_state; #define RTWS_SYNC 7 #define RTWS_STUTTER 8 #define RTWS_STOPPING 9 +static const char * const rcu_torture_writer_state_names[] = { + "RTWS_FIXED_DELAY", + "RTWS_DELAY", + "RTWS_REPLACE", + "RTWS_DEF_FREE", + "RTWS_EXP_SYNC", + "RTWS_COND_GET", + "RTWS_COND_SYNC", + "RTWS_SYNC", + "RTWS_STUTTER", + "RTWS_STOPPING", +}; + +static const char *rcu_torture_writer_state_getname(void) +{ + unsigned int i = READ_ONCE(rcu_torture_writer_state); + + if (i >= ARRAY_SIZE(rcu_torture_writer_state_names)) + return "???"; + return rcu_torture_writer_state_names[i]; +} #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + pr_alert("??? 
Writer stall state %s(%d) g%lu c%lu f%#x\n", + rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index a63a1ea5a..9b9cdd549 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_gp_is_expedited() + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54..9fd5b628a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree"); /* Data structures. */ -static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. @@ -246,24 +242,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); - } + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -300,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -340,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. + * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -376,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. 
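The recurring change in these hunks: rcu_momentary_dyntick_idle() no longer saves and restores flags itself; it now requires interrupts to be off, and callers such as rcu_all_qs() below take on the flag juggling. The general shape of that refactor, as an illustrative sketch rather than the kernel's exact code:

static void do_momentary_qs(void)
{
        WARN_ON_ONCE(!irqs_disabled()); /* precondition replaces save/restore */
        /* ... poke the per-CPU quiescent-state machinery ... */
}

void do_momentary_qs_irqson(void)
{
        unsigned long flags;

        local_irq_save(flags);
        do_momentary_qs();
        local_irq_restore(flags);
}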
*/ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -605,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) * The caller must have disabled interrupts to prevent races with * normal callback registry. */ -static int +static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { int i; if (rcu_gp_in_progress(rsp)) - return 0; /* No, a grace period is already in progress. */ + return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) - return 1; /* Yes, a no-CBs CPU needs one. */ + return true; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) - return 0; /* No, this is a no-CBs (or offline) CPU. */ + return false; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) - return 1; /* Yes, this CPU has newly registered callbacks. */ + return true; /* Yes, CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) - return 1; /* Yes, CBs for future grace period. */ - return 0; /* No grace period needed. */ + return true; /* Yes, CBs for future grace period. */ + return false; /* No grace period needed. */ } /* @@ -740,7 +732,7 @@ void rcu_user_enter(void) * * Exit from an interrupt handler, which might possibly result in entering * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your @@ -753,11 +745,10 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; @@ -768,6 +759,17 @@ void rcu_irq_exit(void) else rcu_eqs_enter_common(oldval, true); rcu_sysidle_enter(1); +} + +/* + * Wrapper for rcu_irq_exit() where interrupts are enabled. + */ +void rcu_irq_exit_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_exit(); local_irq_restore(flags); } @@ -865,7 +867,7 @@ void rcu_user_exit(void) * * Enter an interrupt handler, which might possibly result in exiting * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * Note that the Linux kernel is fully capable of entering an interrupt * handler that it never exits, for example when doing upcalls to @@ -881,11 +883,10 @@ void rcu_user_exit(void) */ void rcu_irq_enter(void) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; @@ -896,6 +897,17 @@ void rcu_irq_enter(void) else rcu_eqs_exit_common(oldval, true); rcu_sysidle_exit(1); +} + +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. 
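Alongside these wrappers, the series adds small bounds-checked string tables for state reporting (rcu_torture_writer_state_getname() in rcutorture.c above, gp_state_getname() just below). The idiom as a standalone sketch:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const char * const state_names[] = { "IDLE", "WAIT", "DONE" };

static const char *state_getname(int s)
{
        if (s < 0 || (unsigned int)s >= ARRAY_SIZE(state_names))
                return "???";           /* out-of-range values print safely */
        return state_names[s];
}

int main(void)
{
        printf("%s %s\n", state_getname(1), state_getname(7)); /* WAIT ??? */
        return 0;
}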
+ */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_enter(); local_irq_restore(flags); } @@ -1187,6 +1199,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + +/* * Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) @@ -1196,12 +1218,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", + if (j - gpa > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, - rsp->gp_flags, rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : 0); + rsp->gp_flags, + gp_state_getname(rsp->gp_state), rsp->gp_state, + rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + if (rsp->gp_kthread) + sched_show_task(rsp->gp_kthread); + } } /* @@ -1214,7 +1240,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) @@ -1237,7 +1263,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1256,7 +1282,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -1327,7 +1353,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -1534,10 +1560,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) { - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); - } + if (rnp != rnp_root) + raw_spin_lock_rcu_node(rnp_root); /* * Get a new grace-period number. 
If there really is no grace @@ -1590,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; trace_rcu_future_gp(rnp, rdp, c, @@ -1611,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - wake_up(&rsp->gp_wq); + swake_up(&rsp->gp_wq); } /* @@ -1786,11 +1809,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && rdp->completed == READ_ONCE(rnp->completed) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - smp_mb__after_unlock_lock(); needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) @@ -1805,21 +1827,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) } /* - * Initialize a new grace period. Return 0 if no grace period required. + * Initialize a new grace period. Return false if no grace period required. */ -static int rcu_gp_init(struct rcu_state *rsp) +static bool rcu_gp_init(struct rcu_state *rsp) { unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1829,7 +1850,7 @@ static int rcu_gp_init(struct rcu_state *rsp) * Not supposed to be able to happen. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } /* Advance to a new grace period and initialize state. */ @@ -1847,8 +1868,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ @@ -1904,8 +1924,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1923,7 +1942,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rsp->gp_activity, jiffies); } - return 1; + return true; } /* @@ -1973,8 +1992,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) } /* Clear flag to prevent immediate re-entry. 
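The other rewrite running through this file pairs every rcu_node lock acquisition with the smp_mb__after_unlock_lock() barrier by folding both into raw_spin_lock_rcu_node() and its _irq/_irqsave variants. Judging by the before/after hunks, each wrapper amounts to exactly the two lines it replaces, roughly:

#define demo_raw_spin_lock_rcu_node(rnp)                            \
do {                                                                \
        raw_spin_lock(&(rnp)->lock);                                \
        smp_mb__after_unlock_lock();    /* full GP memory ordering */ \
} while (0)

#define demo_raw_spin_lock_irqsave_rcu_node(rnp, flags)             \
do {                                                                \
        raw_spin_lock_irqsave(&(rnp)->lock, flags);                 \
        smp_mb__after_unlock_lock();                                \
} while (0)

Centralising the pair means a forgotten barrier can no longer slip in between the lock and its first use.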
*/ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); @@ -1991,10 +2009,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + struct swait_queue_head *sq; WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -2019,8 +2037,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * grace period is recorded in any of the rcu_node structures. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); @@ -2029,14 +2046,15 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); + sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq(&rnp->lock); + rcu_nocb_gp_cleanup(sq); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ @@ -2076,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; @@ -2106,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2230,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2284,8 +2302,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); oldmask = rnp_c->qsmask; } @@ -2332,8 +2349,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. 
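The wait-queue side changes in the same sweep: rsp->gp_wq and rsp->expedited_wq move to the simple-waitqueue API (struct swait_queue_head, swait_event_interruptible(), swake_up()), which wakes at most one waiter and keeps the wake path short. Minimal usage shape of the API as converted above, with illustrative names:

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_ready;

static void demo_wait(void)
{
        /* sleep until the flag is set */
        swait_event_interruptible(demo_wq, READ_ONCE(demo_ready));
}

static void demo_wake(void)
{
        WRITE_ONCE(demo_ready, true);
        swake_up(&demo_wq);     /* wakes a single waiter */
}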
*/ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2355,8 +2371,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || @@ -2582,8 +2597,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (!rnp) break; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); /* GP memory ordering. */ + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { @@ -2611,8 +2625,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2809,8 +2822,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_for_each_leaf_node(rsp, rnp) { cond_resched_rcu_qs(); mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2881,8 +2893,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. */ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -2891,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp) } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2914,7 +2925,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* Does this CPU require a not-yet-started grace period? */ local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); if (needwake) @@ -3005,8 +3016,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (!rcu_gp_in_progress(rsp)) { struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); if (needwake) @@ -3365,7 +3375,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ s = (READ_ONCE(*sp) + 3) & ~0x1; smp_mb(); /* Above access must not bleed into critical section. 
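 *
 * [Editor's worked example for rcu_seq_snap() above, using the
 * convention that the low bit of the sequence counter means "grace
 * period in progress":
 *
 *	s = (READ_ONCE(*sp) + 3) & ~0x1;
 *
 *	*sp == 4 (idle):        s = (4 + 3) & ~1 = 6
 *	                        wait for the GP that ends at 6
 *	*sp == 5 (in progress): s = (5 + 3) & ~1 = 8
 *	                        the running GP may predate the caller,
 *	                        so wait for the next full GP, ending at 8
 *
 * Note also that the smp_mb() formerly at the top of this helper
 * moves into rcu_exp_gp_seq_snap() just below, keeping the "caller's
 * modifications seen first" barrier with the caller that needs it.]
 *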
*/ return s; @@ -3392,6 +3401,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + smp_mb(); /* Caller's modifications seen first by other CPUs. */ return rcu_seq_snap(&rsp->expedited_sequence); } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) @@ -3426,8 +3436,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * CPUs for the current rcu_node structure up the rcu_node tree. */ rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; /* No new CPUs, nothing to do. */ @@ -3447,8 +3456,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rnp_up = rnp->parent; done = false; while (rnp_up) { - raw_spin_lock_irqsave(&rnp_up->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; @@ -3472,8 +3480,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) sync_exp_reset_tree_hotplug(rsp); rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -3524,15 +3531,14 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore(&rnp->lock, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - wake_up(&rsp->expedited_wq); + swake_up(&rsp->expedited_wq); } break; } mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } @@ -3549,8 +3555,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); __rcu_report_exp_rnp(rsp, rnp, wake, flags); } @@ -3564,8 +3569,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -3609,7 +3613,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { - struct rcu_data *rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3623,7 +3627,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { if (sync_exp_work_done(rsp, rnp0, NULL, - &rsp->expedited_workdone0, s)) + &rdp->expedited_workdone0, s)) return NULL; return rnp0; } @@ -3637,14 +3641,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * 
strictly needed for correctness. */ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone2, s)) + &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) @@ -3654,7 +3657,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone3, s)) + &rdp->expedited_workdone3, s)) return NULL; return rnp1; } @@ -3708,8 +3711,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; @@ -3741,24 +3743,22 @@ retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; - } else { - /* Failed, raced with offline. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, - flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave(&rnp->lock, - flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -3773,6 +3773,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; + int ndetected; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; @@ -3781,28 +3782,30 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = wait_event_interruptible_timeout( + ret = swait_event_timeout( rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); - if (ret > 0) + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. 
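 *
 * [Editor's note on the exp_funnel_lock() hunk above: the
 * expedited_workdone0..3 statistics move from the single rcu_state
 * into per-CPU rcu_data, so each CPU bumps a private counter and
 * readers aggregate on demand. The tree_trace.c hunk later in this
 * patch does exactly that; in sketch form:
 *
 *	unsigned long s1 = 0;
 *	int cpu;
 *
 *	for_each_possible_cpu(cpu)
 *		s1 += atomic_long_read(
 *			&per_cpu_ptr(rsp->rda, cpu)->expedited_workdone1);
 *
 * The contention-avoidance motivation is an editorial inference.]
 *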
*/ - wait_event(rsp->expedited_wq, + swait_event(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root)); return; } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); + ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - (void)rcu_print_task_exp_stall(rnp); + ndetected = rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; if (!(rnp->expmask & mask)) continue; + ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[cpu_online(cpu)], @@ -3811,8 +3814,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } mask <<= 1; } - pr_cont(" } %lu jiffies s: %lu\n", - jiffies - jiffies_start, rsp->expedited_sequence); + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (!ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } rcu_for_each_leaf_node(rsp, rnp) { mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { @@ -3847,6 +3865,16 @@ void synchronize_sched_expedited(void) struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); @@ -4135,7 +4163,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (rnp == NULL) return; - raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ } @@ -4152,7 +4180,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); @@ -4179,7 +4207,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -4198,8 +4226,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; mask = rdp->grpmask; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. 
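 *
 * [Editor's note on the synchronize_sched_expedited() hunk above: the
 * function now runs a short ladder before doing any real work. A
 * sketch of the control flow, using only code the diff itself adds:
 *
 *	if (rcu_blocking_is_gp())        // single CPU: trivially a GP
 *		return;
 *	if (rcu_gp_is_normal()) {        // rcu_normal parameter set
 *		wait_rcu_gp(call_rcu_sched); // fall back to a normal GP
 *		return;
 *	}
 *	s = rcu_exp_gp_seq_snap(rsp);    // otherwise take a ticket
 *
 * rcu_normal is the module parameter added in the kernel/rcu/update.c
 * hunk later in this patch; it trades expedited-GP IPIs for ordinary
 * grace-period latency.]
 *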
*/ rnp->qsmaskinitnext |= mask; rnp->expmaskinitnext |= mask; if (!rdp->beenonline) @@ -4327,14 +4354,14 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rsp->gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); } rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); @@ -4385,12 +4412,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) +static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4455,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } - init_waitqueue_head(&rsp->gp_wq); - init_waitqueue_head(&rsp->expedited_wq); + init_swait_queue_head(&rsp->gp_wq); + init_swait_queue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -4576,8 +4605,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state); + rcu_init_one(&rcu_sched_state); if (dump_tree) rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9fb4e238d..bbd235d0e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> +#include <linux/swait.h> #include <linux/stop_machine.h> /* @@ -178,6 +179,8 @@ struct rcu_node { /* beginning of each expedited GP. */ unsigned long expmaskinitnext; /* Online CPUs for next expedited GP. */ + /* Any CPU that has ever been online will */ + /* have its bit set. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -241,7 +244,7 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU - wait_queue_head_t nocb_gp_wq[2]; + struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; @@ -384,6 +387,10 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + atomic_long_t expedited_workdone0; /* # done by others #0. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ /* 7) Callback offloading. 
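 *
 * [Editor's note: an easy-to-miss reordering in the tree.c portion of
 * this patch is in rcu_spawn_gp_kthread(), where the kthread is now
 * woken only after the lock is released:
 *
 *	rsp->gp_kthread = t;
 *	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 *	wake_up_process(t);              // was: woken under rnp->lock
 *
 * Waking a kthread that will promptly take rnp->lock while we still
 * hold that lock is legal but forces it to block immediately; waking
 * after the unlock avoids that. This rationale is an editorial
 * inference, not stated in the diff.]
 *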
*/ #ifdef CONFIG_RCU_NOCB_CPU @@ -393,7 +400,7 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -472,7 +479,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ @@ -498,13 +505,9 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ - wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -545,6 +548,18 @@ struct rcu_state { #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#ifndef RCU_TREE_NONCORE +static const char * const gp_state_names[] = { + "RCU_GP_IDLE", + "RCU_GP_WAIT_GPS", + "RCU_GP_DONE_GPS", + "RCU_GP_WAIT_FQS", + "RCU_GP_DOING_FQS", + "RCU_GP_CLEANUP", + "RCU_GP_CLEANED", +}; +#endif /* #ifndef RCU_TREE_NONCORE */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ @@ -607,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); @@ -664,3 +680,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #else /* #ifdef CONFIG_PPC */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_PPC */ + +/* + * Wrappers for the rcu_node::lock acquire. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). 
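 *
 * (Editorial addition, not in the committed comment: the practical
 * effect on call sites, visible throughout kernel/rcu/tree.c in this
 * patch, is that the open-coded pair
 *
 *	raw_spin_lock_irqsave(&rnp->lock, flags);
 *	smp_mb__after_unlock_lock();
 *
 * collapses into the single, harder-to-forget
 *
 *	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 *
 * with the trylock variant returning bool so callers such as
 * note_gp_changes() keep their early-exit shape.)
 *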
+ */ +static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&(rnp)->lock, flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) +{ + bool locked = raw_spin_trylock(&rnp->lock); + + if (locked) + smp_mb__after_unlock_lock(); + return locked; +} diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772..080bd202d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ /* * Check the RCU kernel configuration parameters and print informative - * messages about anything out of the ordinary. If you like #ifdef, you - * will love this function. + * messages about anything out of the ordinary. */ static void __init rcu_bootup_announce_oddness(void) { @@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. */ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + @@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -286,12 +284,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. */ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * Remove this task from the list it blocked on. 
The task - * now remains queued on the rcu_node corresponding to - * the CPU it first blocked on, so the first attempt to - * acquire the task's rcu_node's ->lock will succeed. - * Keep the loop and add a WARN_ON() out of sheer paranoia. + * now remains queued on the rcu_node corresponding to the + * CPU it first blocked on, so there is no longer any need + * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. */ - for (;;) { - rnp = t->rcu_blocked_node; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - if (rnp == t->rcu_blocked_node) - break; - WARN_ON_ONCE(1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } + rnp = t->rcu_blocked_node; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + WARN_ON_ONCE(rnp != t->rcu_blocked_node); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -527,7 +516,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void) struct rcu_state *rsp = rcu_state_p; unsigned long s; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + s = rcu_exp_gp_seq_snap(rsp); rnp_unlock = exp_funnel_lock(rsp, s); @@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(rcu_state_p, rcu_data_p); + rcu_init_one(rcu_state_p); } /* @@ -989,8 +984,7 @@ static int rcu_boost(struct rcu_node *rnp) READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* * Recheck under the lock: all tasks in need of boosting @@ -1176,8 +1170,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = kthread_prio; @@ -1524,7 +1517,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1538,10 +1532,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - /* * If a non-lazy callback arrived at a CPU having only lazy * callbacks, invoke RCU core for the side-effect of recalculating @@ -1567,8 +1557,7 @@ static void rcu_prepare_for_idle(void) if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
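 *
 * [Editor's note on the rcu_read_unlock_special() hunk above: the old
 * retry loop existed to cope with t->rcu_blocked_node changing under
 * the reader. The simplification rests on the invariant that a
 * blocked task stays queued on the rcu_node of the CPU it first
 * blocked on, so one locked check suffices:
 *
 *	rnp = t->rcu_blocked_node;
 *	raw_spin_lock_rcu_node(rnp);     // irqs already disabled
 *	WARN_ON_ONCE(rnp != t->rcu_blocked_node);
 *
 * The WARN_ON_ONCE() survives purely as a tripwire should that
 * invariant ever be violated.]
 *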
*/ if (needwake) @@ -1822,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + swake_up_all(sq); } /* @@ -1840,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rnp->completed & 0x1]; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_waitqueue_head(&rnp->nocb_gp_wq[0]); - init_waitqueue_head(&rnp->nocb_gp_wq[1]); + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); } #ifndef CONFIG_RCU_NOCB_CPU_ALL @@ -1868,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); - wake_up(&rdp_leader->nocb_wq); + swake_up(&rdp_leader->nocb_wq); } } @@ -2068,8 +2062,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) @@ -2081,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - wait_event_interruptible( + swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) @@ -2109,7 +2102,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); - wait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { @@ -2184,7 +2177,7 @@ wait_again: * List was empty, wake up the follower. * Memory barriers supplied by atomic_long_add(). */ - wake_up(&rdp->nocb_wq); + swake_up(&rdp->nocb_wq); } } @@ -2205,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); - wait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! 
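 *
 * [Editor's note: the two-element nocb_gp_wq[] array used above is
 * indexed by the low bit of the grace-period number, so waiters for
 * GP "c" and waiters for GP "c + 1" never share a queue:
 *
 *	// waiter, in rcu_nocb_wait_gp():
 *	swait_event_interruptible(rnp->nocb_gp_wq[c & 0x1],
 *		ULONG_CMP_GE(READ_ONCE(rnp->completed), c));
 *
 *	// waker, at grace-period cleanup time:
 *	sq = &rnp->nocb_gp_wq[rnp->completed & 0x1];
 *	swake_up_all(sq);
 *
 * Both fragments are lifted from hunks in this patch; only their
 * juxtaposition is editorial.]
 *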
*/ @@ -2364,7 +2357,7 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_waitqueue_head(&rdp->nocb_wq); + init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; } @@ -2514,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } @@ -2522,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ef7093cc9..1088e64f0 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update tracing for classic implementation + * Read-Copy Update tracing for hierarchical implementation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 + * Author: Paul E. McKenney * * Papers: http://www.rdrop.com/users/paulmck/RCU * @@ -33,9 +34,7 @@ #include <linux/sched.h> #include <linux/atomic.h> #include <linux/bitops.h> -#include <linux/module.h> #include <linux/completion.h> -#include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = { static int show_rcuexp(struct seq_file *m, void *v) { + int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; - + struct rcu_data *rdp; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->expedited_workdone0); + s1 += atomic_long_read(&rdp->expedited_workdone1); + s2 += atomic_long_read(&rdp->expedited_workdone2); + s3 += atomic_long_read(&rdp->expedited_workdone3); + } seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_workdone0), - atomic_long_read(&rsp->expedited_workdone1), - atomic_long_read(&rsp->expedited_workdone2), - atomic_long_read(&rsp->expedited_workdone3), + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); @@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) unsigned long gpmax; struct rcu_node *rnp = &rsp->node[0]; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); completed = READ_ONCE(rsp->completed); gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) @@ -487,16 +492,4 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E. 
McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutree_trace_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a4..76b94e194 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate"); #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); +module_param(rcu_normal, int, 0); +static int rcu_normal_after_boot; +module_param(rcu_normal_after_boot, int, 0); +#endif /* #ifndef CONFIG_TINY_RCU */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -113,6 +118,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); #ifndef CONFIG_TINY_RCU +/* + * Should expedited grace-period primitives always fall back to their + * non-expedited counterparts? Intended for use within RCU. Note + * that if the user specifies both rcu_expedited and rcu_normal, then + * rcu_normal wins. + */ +bool rcu_gp_is_normal(void) +{ + return READ_ONCE(rcu_normal); +} + static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); @@ -157,8 +173,6 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); -#endif /* #ifndef CONFIG_TINY_RCU */ - /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -166,8 +180,12 @@ void rcu_end_inkernel_boot(void) { if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) rcu_unexpedite_gp(); + if (rcu_normal_after_boot) + WRITE_ONCE(rcu_normal, 1); } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PREEMPT_RCU /* diff --git a/kernel/relay.c b/kernel/relay.c index 0b4570cfa..074994bcf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&file_inode(filp)->i_mutex); + inode_lock(file_inode(filp)); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&file_inode(filp)->i_mutex); + inode_unlock(file_inode(filp)); return desc->written; } diff --git a/kernel/resource.c b/kernel/resource.c index 249b1eb1e..3669d1bfc 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1499,8 +1499,15 @@ int iomem_is_exclusive(u64 addr) break; if (p->end < addr) continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { + /* + * A resource is exclusive if IORESOURCE_EXCLUSIVE is set + * or CONFIG_IO_STRICT_DEVMEM is enabled and the + * resource is busy. 
+ */ + if ((p->flags & IORESOURCE_BUSY) == 0) + continue; + if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) + || p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973c..7d4cba227 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 750ed601d..a5d966cb8 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); if (!err) ag->nice = nice; up_write(&ag->lock); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index caf4041f5..bc54e8467 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) return; sched_clock_tick(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eb70592f0..05114b15b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> @@ -66,12 +67,10 @@ #include <linux/pagemap.h> #include <linux/hrtimer.h> #include <linux/tick.h> -#include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> -#include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> @@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features = #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static const char * const sched_feat_names[] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true STATIC_KEY_INIT_TRUE -#define jump_label_key__false STATIC_KEY_INIT_FALSE - -#define SCHED_FEAT(name, enabled) \ - jump_label_key__##enabled , - -struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ - static_key_disable(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ - static_key_enable(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - -static int sched_feat_set(char *cmp) -{ - int i; - int neg = 0; - - if (strncmp(cmp, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } 
else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } - } - - return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int i; - struct inode *inode; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - /* Ensure the static_key remains in a consistent state */ - inode = file_inode(filp); - mutex_lock(&inode->i_mutex); - i = sched_feat_set(cmp); - mutex_unlock(&inode->i_mutex); - if (i == __SCHED_FEAT_NR) - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ - /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. @@ -731,7 +598,7 @@ bool sched_can_stop_tick(void) if (current->policy == SCHED_RR) { struct sched_rt_entity *rt_se = ¤t->rt; - return rt_se->run_list.prev == rt_se->run_list.next; + return list_is_singular(&rt_se->run_list); } /* @@ -823,8 +690,8 @@ static void set_load_weight(struct task_struct *p) return; } - load->weight = scale_load(prio_to_weight[prio]); - load->inv_weight = prio_to_wmult[prio]; + load->weight = scale_load(sched_prio_to_weight[prio]); + load->inv_weight = sched_prio_to_wmult[prio]; } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1071,8 +938,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new { lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(rq, p, 0); set_task_cpu(p, new_cpu); raw_spin_unlock(&rq->lock); @@ -1080,8 +947,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new raw_spin_lock(&rq->lock); BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); return rq; @@ -1274,6 +1141,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !p->on_rq); + /* + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, + * because schedstat_wait_{start,end} rebase migrating task's wait_start + * time relying on p->on_rq. + */ + WARN_ON_ONCE(p->state == TASK_RUNNING && + p->sched_class == &fair_sched_class && + (p->on_rq && !task_on_rq_migrating(p))); + #ifdef CONFIG_LOCKDEP /* * The caller should hold either p->pi_lock or rq->lock, when changing @@ -1310,9 +1186,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); } else { /* @@ -1905,6 +1783,97 @@ static void ttwu_queue(struct task_struct *p, int cpu) raw_spin_unlock(&rq->lock); } +/* + * Notes on Program-Order guarantees on SMP systems. 
+ * + * MIGRATION + * + * The basic program-order guarantee on SMP systems is that when a task [t] + * migrates, all its activity on its old cpu [c0] happens-before any subsequent + * execution on its new cpu [c1]. + * + * For migration (of runnable tasks) this is provided by the following means: + * + * A) UNLOCK of the rq(c0)->lock scheduling out task t + * B) migration for t is required to synchronize *both* rq(c0)->lock and + * rq(c1)->lock (if not at the same time, then in that order). + * C) LOCK of the rq(c1)->lock scheduling in task + * + * Transitivity guarantees that B happens after A and C after B. + * Note: we only require RCpc transitivity. + * Note: the cpu doing B need not be c0 or c1 + * + * Example: + * + * CPU0 CPU1 CPU2 + * + * LOCK rq(0)->lock + * sched-out X + * sched-in Y + * UNLOCK rq(0)->lock + * + * LOCK rq(0)->lock // orders against CPU0 + * dequeue X + * UNLOCK rq(0)->lock + * + * LOCK rq(1)->lock + * enqueue X + * UNLOCK rq(1)->lock + * + * LOCK rq(1)->lock // orders against CPU2 + * sched-out Z + * sched-in X + * UNLOCK rq(1)->lock + * + * + * BLOCKING -- aka. SLEEP + WAKEUP + * + * For blocking we (obviously) need to provide the same guarantee as for + * migration. However the means are completely different as there is no lock + * chain to provide order. Instead we do: + * + * 1) smp_store_release(X->on_cpu, 0) + * 2) smp_cond_acquire(!X->on_cpu) + * + * Example: + * + * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) + * + * LOCK rq(0)->lock LOCK X->pi_lock + * dequeue X + * sched-out X + * smp_store_release(X->on_cpu, 0); + * + * smp_cond_acquire(!X->on_cpu); + * X->state = WAKING + * set_task_cpu(X,2) + * + * LOCK rq(2)->lock + * enqueue X + * X->state = RUNNING + * UNLOCK rq(2)->lock + * + * LOCK rq(2)->lock // orders against CPU1 + * sched-out Z + * sched-in X + * UNLOCK rq(2)->lock + * + * UNLOCK X->pi_lock + * UNLOCK rq(0)->lock + * + * + * However; for wakeups there is a second guarantee we must provide, namely we + * must observe the state that lead to our wakeup. That is, not only must our + * task observe its own prior state, it must also observe the stores prior to + * its wakeup. + * + * This means that any means of doing remote wakeups must order the CPU doing + * the wakeup against the CPU the task is going to end up running on. This, + * however, is already required for the regular Program-Order guarantee above, + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * + */ + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -1968,19 +1937,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. - */ - while (p->on_cpu) - cpu_relax(); - /* - * Combined with the control dependency above, we have an effective - * smp_load_acquire() without the need for full barriers. * * Pairs with the smp_store_release() in finish_lock_switch(). * * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. 
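 *
 * [Editor's note: smp_cond_acquire(), used just below, is new in this
 * kernel generation. To the best of the editor's recollection it is
 * defined in <linux/compiler.h> roughly as
 *
 *	#define smp_cond_acquire(cond) do {	\
 *		while (!(cond))			\
 *			cpu_relax();		\
 *		smp_rmb();			\
 *	} while (0)
 *
 * i.e. exactly the open-coded while/cpu_relax()/smp_rmb() sequence it
 * replaces in try_to_wake_up(), with the control dependency plus read
 * barrier together providing ACQUIRE semantics. Treat the definition
 * as a from-memory sketch rather than a quotation.]
 *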
*/ - smp_rmb(); + smp_cond_acquire(!p->on_cpu); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -1997,7 +1960,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu); stat: - ttwu_stat(p, cpu, wake_flags); + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2045,7 +2009,8 @@ static void try_to_wake_up_local(struct task_struct *p) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2087,7 +2052,6 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_throttled = 0; - dl_se->dl_new = 1; dl_se->dl_yielded = 0; } @@ -2109,7 +2073,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS + /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2118,6 +2087,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2181,6 +2154,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +DEFINE_STATIC_KEY_FALSE(sched_schedstats); + +#ifdef CONFIG_SCHEDSTATS +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) +{ + int ret = 0; + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + set_schedstats(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_schedstats(false); + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif +#endif + /* * fork()/clone()-time setup: */ @@ -2910,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void) } #endif -notrace unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) @@ -2941,7 +2967,7 @@ void preempt_count_add(int val) PREEMPT_MASK - 10); #endif if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); + unsigned long ip = get_lock_parent_ip(); #ifdef 
CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; #endif @@ -2968,7 +2994,7 @@ void preempt_count_sub(int val) #endif if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); @@ -3109,7 +3135,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3128,13 +3153,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3178,7 +3206,6 @@ static void __sched notrace __schedule(bool preempt) trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ - cpu = cpu_of(rq); } else { lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); @@ -3364,7 +3391,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3392,11 +3419,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3414,7 +3445,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3422,7 +3453,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3437,7 +3468,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3793,6 +3824,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -3975,17 +4007,14 @@ change: * itself. 
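 *
 * [Editor's note: DEQUEUE_SAVE/DEQUEUE_MOVE, threaded through both
 * rt_mutex_setprio() and __sched_setscheduler() in this patch, appear
 * to encode two separate questions: "will the task be re-enqueued?"
 * and "may it change position in its runqueue list?". When the
 * effective priority is unchanged, the MOVE bit is dropped:
 *
 *	if (new_effective_prio == oldprio)
 *		queue_flags &= ~DEQUEUE_MOVE;  // keep list position
 *
 * so a no-op priority update no longer requeues the task. This
 * reading of the flag semantics is editorial; the diff defines
 * neither flag here.]
 *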
*/ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -3995,15 +4024,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -4994,6 +5022,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, @@ -5303,183 +5333,6 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. 
- */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called 
multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ - static void set_rq_online(struct rq *rq) { if (!rq->online) { @@ -6071,11 +5924,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + int ret; + alloc_bootmem_cpumask_var(&cpu_isolated_map); - cpulist_parse(str, cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } return 1; } - __setup("isolcpus=", isolated_cpu_setup); struct s_data { @@ -7355,6 +7213,9 @@ int in_sched_functions(unsigned long addr) */ struct task_group root_task_group; LIST_HEAD(task_groups); + +/* Cacheline aligned slab cache for task_group */ +static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); @@ -7412,11 +7273,12 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED + task_group_cache = KMEM_CACHE(task_group, 0); + list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); - #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { @@ -7697,7 +7559,7 @@ static void free_sched_group(struct task_group *tg) free_fair_sched_group(tg); free_rt_sched_group(tg); autogroup_free(tg); - kfree(tg); + kmem_cache_free(task_group_cache, tg); } /* allocate runqueue etc for a new task group */ @@ -7705,7 +7567,7 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); + tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); if (!tg) return ERR_PTR(-ENOMEM); @@ -7754,11 +7616,9 @@ void sched_destroy_group(struct task_group *tg) void sched_offline_group(struct task_group *tg) { unsigned long flags; - int i; /* end participation in shares distribution */ - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); + unregister_fair_sched_group(tg); spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); @@ -7784,7 +7644,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7808,7 +7668,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } @@ -8236,7 +8096,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } -static void cpu_cgroup_fork(struct task_struct *task, void *private) +static void cpu_cgroup_fork(struct task_struct *task) { sched_move_task(task); } @@ -8610,3 +8470,44 @@ void dump_cpu_task(int cpu) pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } + +/* + * Nice levels are multiplicative, with a gentle 10% change for 
every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +const int sched_prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +const u32 sched_prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 05de80b48..75f98c549 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,9 @@ #include <linux/static_key.h> #include <linux/context_tracking.h> #include "sched.h" +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -259,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void) #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { u64 steal; - cputime_t steal_ct; + unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime + * steal is in nsecs but our caller is expecting steal + * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. 
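A userspace sketch of the carry scheme this comment describes: only whole jiffies of steal time are accounted, and the saved prev_steal_time advances by exactly the amount accounted, so the sub-jiffy remainder is picked up on a later tick. NSEC_PER_JIFFY and the helper below are illustrative (HZ=250 assumed), not kernel API.

#include <stdio.h>

#define NSEC_PER_JIFFY 4000000ULL        /* assumes HZ=250; illustrative */

static unsigned long long prev_steal;    /* stands in for rq->prev_steal_time */

/* Account whole jiffies of steal time; the sub-jiffy remainder stays in
 * the delta and is picked up on a later tick. */
static unsigned long long account_steal(unsigned long long steal_now)
{
        unsigned long long delta = steal_now - prev_steal;
        unsigned long long ticks = delta / NSEC_PER_JIFFY;

        prev_steal += ticks * NSEC_PER_JIFFY;       /* carry the remainder */
        return ticks;
}

int main(void)
{
        printf("%llu\n", account_steal(9000000));   /* 9 ms -> 2 jiffies, 1 ms carried */
        printf("%llu\n", account_steal(12000000));  /* +3 ms -> 1 jiffy, 0 carried */
        return 0;
}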
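Looking back at the sched_prio_to_weight[] and sched_prio_to_wmult[] tables above, a quick standalone check of both claims: the ~1.25 multiplier between adjacent nice levels, and the division-free use of the precomputed 2^32/weight inverse. Numbers are taken straight from the tables.

#include <stdio.h>

int main(void)
{
        double w0 = 1024.0, w1 = 820.0;   /* nice 0 and nice 1 weights */

        /* CPU shares of two CPU-bound tasks sharing one CPU: */
        printf("nice 0: %.1f%%\n", 100.0 * w0 / (w0 + w1));  /* ~55.5% */
        printf("nice 1: %.1f%%\n", 100.0 * w1 / (w0 + w1));  /* ~44.5% */
        printf("ratio : %.3f\n", w0 / w1);                   /* ~1.249 */

        /* sched_prio_to_wmult[] turns a division by the weight into a
         * multiply-and-shift: delta/820 becomes (delta * 5237765) >> 32. */
        unsigned long long delta = 1000;
        printf("%llu %llu\n", delta * 5237765ULL >> 32, delta / 820);  /* 1 1 */
        return 0;
}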
*/ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + steal_jiffies = nsecs_to_jiffies(steal); + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); - account_steal_time(steal_ct); - return steal_ct; + account_steal_time(jiffies_to_cputime(steal_jiffies)); + return steal_jiffies; } #endif return false; @@ -466,7 +469,7 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) return; if (sched_clock_irqtime) { @@ -665,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static unsigned long long vtime_delta(struct task_struct *tsk) +static cputime_t vtime_delta(struct task_struct *tsk) { - unsigned long long clock; + unsigned long now = READ_ONCE(jiffies); - clock = local_clock(); - if (clock < tsk->vtime_snap) + if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return clock - tsk->vtime_snap; + return jiffies_to_cputime(now - tsk->vtime_snap); } static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta = vtime_delta(tsk); + unsigned long now = READ_ONCE(jiffies); + unsigned long delta = now - tsk->vtime_snap; - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); - tsk->vtime_snap += delta; + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); + tsk->vtime_snap = now; - /* CHECKME: always safe to convert nsecs to cputime? */ - return nsecs_to_cputime(delta); + return jiffies_to_cputime(delta); } static void __vtime_account_system(struct task_struct *tsk) @@ -696,37 +698,44 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + if (!vtime_delta(tsk)) + return; + + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_gen_account_irq_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); - __vtime_account_system(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; - write_seqlock(&tsk->vtime_seqlock); - delta_cpu = get_vtime_delta(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); tsk->vtime_snap_whence = VTIME_SYS; - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); - write_sequnlock(&tsk->vtime_seqlock); + if (vtime_delta(tsk)) { + delta_cpu = get_vtime_delta(tsk); + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + } + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); - __vtime_account_system(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_guest_enter(struct task_struct *tsk) @@ -738,19 +747,20 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the 
reader (task_gtime()) * that can thus safely catch up with a tickless delta. */ - write_seqlock(&tsk->vtime_seqlock); - __vtime_account_system(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -763,24 +773,26 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { - write_seqlock(&prev->vtime_seqlock); - prev->vtime_snap_whence = VTIME_SLEEPING; - write_sequnlock(&prev->vtime_seqlock); + write_seqcount_begin(&prev->vtime_seqcount); + prev->vtime_snap_whence = VTIME_INACTIVE; + write_seqcount_end(&prev->vtime_seqcount); - write_seqlock(¤t->vtime_seqlock); + write_seqcount_begin(¤t->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock_cpu(smp_processor_id()); - write_sequnlock(¤t->vtime_seqlock); + current->vtime_snap = jiffies; + write_seqcount_end(¤t->vtime_seqcount); } void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; - write_seqlock_irqsave(&t->vtime_seqlock, flags); + local_irq_save(flags); + write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock_cpu(cpu); - write_sequnlock_irqrestore(&t->vtime_seqlock, flags); + t->vtime_snap = jiffies; + write_seqcount_end(&t->vtime_seqcount); + local_irq_restore(flags); } cputime_t task_gtime(struct task_struct *t) @@ -788,17 +800,17 @@ cputime_t task_gtime(struct task_struct *t) unsigned int seq; cputime_t gtime; - if (!context_tracking_is_enabled()) + if (!vtime_accounting_enabled()) return t->gtime; do { - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); gtime = t->gtime; - if (t->flags & PF_VCPU) + if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) gtime += vtime_delta(t); - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); return gtime; } @@ -821,7 +833,7 @@ fetch_task_cputime(struct task_struct *t, *udelta = 0; *sdelta = 0; - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); if (u_dst) *u_dst = *u_src; @@ -829,7 +841,7 @@ fetch_task_cputime(struct task_struct *t, *s_dst = *s_src; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_SLEEPING || + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; @@ -845,7 +857,7 @@ fetch_task_cputime(struct task_struct *t, if (t->vtime_snap_whence == VTIME_SYS) *sdelta = delta; } - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } @@ -853,6 +865,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { cputime_t udelta, sdelta; + if (!vtime_accounting_enabled()) { + if (utime) + *utime = t->utime; + if (stime) + *stime = t->stime; + return; + } + fetch_task_cputime(t, utime, stime, &t->utime, &t->stime, &udelta, &sdelta); if (utime) @@ -866,6 +886,14 @@ void task_cputime_scaled(struct task_struct *t, { cputime_t udelta, sdelta; + if 
(!vtime_accounting_enabled()) { + if (utimescaled) + *utimescaled = t->utimescaled; + if (stimescaled) + *stimescaled = t->stimescaled; + return; + } + fetch_task_cputime(t, utimescaled, stimescaled, &t->utimescaled, &t->stimescaled, &udelta, &sdelta); if (utimescaled) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e28..c7a036fac 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) } } - if (leftmost) + if (leftmost) { dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + dl_rq->earliest_dl.next = p->dl.deadline; + } rb_link_node(&p->pushable_dl_tasks, parent, link); rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) next_node = rb_next(&p->pushable_dl_tasks); dl_rq->pushable_dl_tasks_leftmost = next_node; + if (next_node) { + dl_rq->earliest_dl.next = rb_entry(next_node, + struct task_struct, pushable_dl_tasks)->dl.deadline; + } } rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -346,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); + + /* + * We are racing with the deadline timer. So, do nothing because + * the deadline timer handler will take care of properly recharging + * the runtime and postponing the deadline + */ + if (dl_se->dl_throttled) + return; /* * We use the regular wall clock time to set deadlines in the @@ -355,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, */ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; - dl_se->dl_new = 0; } /* @@ -393,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->runtime = pi_se->dl_runtime; } + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; + /* * We keep moving the deadline away until we get some * available runtime for the entity. This ensures correct @@ -414,7 +430,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged to much\n"); + printk_deferred_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -494,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - /* - * The arrival of a new instance needs special treatment, i.e., - * the actual scheduling parameters have to be "renewed". - */ - if (dl_se->dl_new) { - setup_new_dl_entity(dl_se, pi_se); - return; - } - if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; @@ -599,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) } /* - * This is possible if switched_from_dl() raced against a running - * callback that took the above !dl_task() path and we've since then - * switched back into SCHED_DEADLINE. - * - * There's nothing to do except drop our task reference. 
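Worth spelling out why the seqlock_t to seqcount_t conversion in the vtime hunks above is safe: the conversion relies on vtime updates already being serialized per task (they happen from the task's own context, or at context switch and init while the task is off the CPU), so the spinlock half of the old seqlock only ever excluded a writer from itself. A bare sequence counter is enough to let lockless readers such as task_gtime() retry around a concurrent update. The writer bracket, as a sketch using the kernel's own primitives:

        write_seqcount_begin(&tsk->vtime_seqcount);  /* count odd: readers retry */
        tsk->vtime_snap_whence = VTIME_SYS;
        tsk->vtime_snap = jiffies;
        write_seqcount_end(&tsk->vtime_seqcount);    /* count even: readers settle */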
- */ - if (dl_se->dl_new) - goto unlock; - - /* * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ @@ -729,8 +726,11 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + if (unlikely((s64)delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; return; + } schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -743,8 +743,10 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; - if (dl_runtime_exceeded(dl_se)) { + dl_se->runtime -= delta_exec; + +throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) @@ -782,42 +784,14 @@ static void update_curr_dl(struct rq *rq) #ifdef CONFIG_SMP -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); - -static inline u64 next_deadline(struct rq *rq) -{ - struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); - - if (next && dl_prio(next->prio)) - return next->dl.deadline; - else - return 0; -} - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { - /* - * If the dl_rq had no -deadline tasks, or if the new task - * has shorter deadline than the current one on dl_rq, we - * know that the previous earliest becomes our next earliest, - * as the new task becomes the earliest itself. - */ - dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); - } else if (dl_rq->earliest_dl.next == 0 || - dl_time_before(deadline, dl_rq->earliest_dl.next)) { - /* - * On the other hand, if the new -deadline task has a - * a later deadline than the earliest one on dl_rq, but - * it is earlier than the next (if any), we must - * recompute the next-earliest. - */ - dl_rq->earliest_dl.next = next_deadline(rq); } } @@ -839,7 +813,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - dl_rq->earliest_dl.next = next_deadline(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); } } @@ -940,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); else if (flags & ENQUEUE_REPLENISH) replenish_dl_entity(dl_se, pi_se); @@ -1017,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ static void yield_task_dl(struct rq *rq) { - struct task_struct *p = rq->curr; - /* * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). 
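The bandwidth refresh that dl_yielded relies on is the replenishment loop in replenish_dl_entity(): deadlines are pushed out one period at a time until the runtime is positive again. A standalone worked example of that loop with made-up parameters:

#include <stdio.h>

/* Worked example (invented numbers): runtime budget 10 ms per 100 ms
 * period, and the task overran so that runtime == -25 ms. Mirrors the
 * while loop in replenish_dl_entity(). */
int main(void)
{
        long runtime = -25, dl_runtime = 10;   /* ms */
        long deadline = 200, dl_period = 100;  /* ms */

        while (runtime <= 0) {
                deadline += dl_period;
                runtime += dl_runtime;
        }
        /* three iterations: runtime -25 -> -15 -> -5 -> +5, deadline
         * pushed out three periods, so the bandwidth cap is preserved. */
        printf("runtime=%ld deadline=%ld\n", runtime, deadline);  /* 5 500 */
        return 0;
}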
*/ - if (p->dl.runtime > 0) { - rq->curr->dl.dl_yielded = 1; - p->dl.runtime = 0; - } + rq->curr->dl.dl_yielded = 1; + update_rq_clock(rq); update_curr_dl(rq); /* @@ -1274,28 +1243,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) return 0; } -/* Returns the second earliest -deadline task, NULL otherwise */ -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) -{ - struct rb_node *next_node = rq->dl.rb_leftmost; - struct sched_dl_entity *dl_se; - struct task_struct *p = NULL; - -next_node: - next_node = rb_next(next_node); - if (next_node) { - dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); - p = dl_task_of(dl_se); - - if (pick_dl_task(rq, p, cpu)) - return p; - - goto next_node; - } - - return NULL; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: @@ -1767,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (dl_time_before(p->dl.deadline, rq_clock(rq))) + setup_new_dl_entity(&p->dl, &p->dl); + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) @@ -1813,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ resched_curr(rq); #endif /* CONFIG_SMP */ - } else - switched_to_dl(rq, p); + } } const struct sched_class dl_sched_class = { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771..4fbc3bd5f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -16,6 +16,7 @@ #include <linux/kallsyms.h> #include <linux/utsname.h> #include <linux/mempolicy.h> +#include <linux/debugfs.h> #include "sched.h" @@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#define SCHED_FEAT(name, enabled) \ + #name , + +static const char * const sched_feat_names[] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + static_key_disable(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + static_key_enable(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static int sched_feat_set(char *cmp) +{ + int i; + int neg = 0; + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } + break; + } + } + + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + struct inode *inode; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, 
ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + inode_lock(inode); + i = sched_feat_set(cmp); + inode_unlock(inode); + if (i == __SCHED_FEAT_NR) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#ifdef CONFIG_SMP + +#ifdef CONFIG_SYSCTL + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. + */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 
0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ + + return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + if (schedstat_enabled()) { + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); + } #endif P(se->load.weight); #ifdef CONFIG_SMP @@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + if (schedstat_enabled()) { + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SPLIT_NS(p->se.statistics.wait_sum), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + } #else SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 0LL, 0L, @@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq 
*rt_rq) void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { + struct dl_bw *dl_bw; + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +#ifdef CONFIG_SMP + dl_bw = &cpu_rq(cpu)->rd->dl_bw; +#else + dl_bw = &dl_rq->dl_bw; +#endif + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); } extern __read_mostly int sched_clock_running; @@ -313,17 +630,18 @@ do { \ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - P(yld_count); - - P(sched_count); - P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); P64(max_idle_balance_cost); #endif - P(ttwu_count); - P(ttwu_local); + if (schedstat_enabled()) { + P(yld_count); + P(sched_count); + P(sched_goidle); + P(ttwu_count); + P(ttwu_local); + } #undef P #undef P64 @@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); P(se.nr_migrations); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); - { + if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; + PN(se.statistics.sum_sleep_runtime); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82e905862..ac7fb39c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,8 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/latencytop.h> #include <linux/cpumask.h> #include 
<linux/cpuidle.h> #include <linux/slab.h> @@ -763,16 +763,52 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta; + + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. + */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; } /* * Task is being enqueued - update stats: */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? (for current tasks @@ -782,25 +818,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Mark the end of the wait period if dequeueing a @@ -808,8 +827,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); + + if (flags & DEQUEUE_SLEEP) { + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); + } + } + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ } +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ +} +#endif + /* * We are picking a new current task - update its stats: */ @@ -905,10 +957,11 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ 
int nr_tasks; pid_t gid; + int active_nodes; struct rcu_head rcu; - nodemask_t active_nodes; unsigned long total_faults; + unsigned long max_faults_cpu; /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -967,6 +1020,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} + /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int maxdist, bool task) @@ -1116,27 +1181,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, return true; /* - * Do not migrate if the destination is not a node that - * is actively used by this numa group. + * Destination node is much more heavily used than the source + * node? Allow migration. */ - if (!node_isset(dst_nid, ng->active_nodes)) - return false; - - /* - * Source is a node that is not actively used by this - * numa group, while the destination is. Migrate. - */ - if (!node_isset(src_nid, ng->active_nodes)) + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * + ACTIVE_NODE_FRACTION) return true; /* - * Both source and destination are nodes in active - * use by this numa group. Maximize memory bandwidth - * by migrating from more heavily used groups, to less - * heavily used ones, spreading the load around. - * Use a 1/4 hysteresis to avoid spurious page movement. + * Distribute memory according to CPU & memory use on each node, + * with 3/4 hysteresis to avoid unnecessary memory migrations: + * + * faults_cpu(dst) 3 faults_cpu(src) + * --------------- * - > --------------- + * faults_mem(dst) 4 faults_mem(src) */ - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } static unsigned long weighted_cpuload(const int cpu); @@ -1218,8 +1279,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1287,20 +1346,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. 
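The 3/4-hysteresis comparison in should_numa_migrate_memory() above is worth unpacking: it is faults_cpu(dst)/faults_mem(dst) * 3/4 > faults_cpu(src)/faults_mem(src), cross-multiplied so the kernel never divides (and never divides by zero). A minimal standalone check; the helper name is invented:

#include <stdbool.h>
#include <stdio.h>

/* Division-free form of the 3/4 test: migrate toward dst only when its
 * CPU-faults-per-memory-fault ratio beats src's by more than 4/3. */
static bool prefer_dst(unsigned long fc_dst, unsigned long fm_dst,
                       unsigned long fc_src, unsigned long fm_src)
{
        return fc_dst * fm_src * 3 > fc_src * fm_dst * 4;
}

int main(void)
{
        /* dst does twice the CPU faults per memory fault of src: migrate. */
        printf("%d\n", prefer_dst(200, 100, 100, 100));  /* 1 */
        /* equal usage: the hysteresis says stay put. */
        printf("%d\n", prefer_dst(100, 100, 100, 100));  /* 0 */
        return 0;
}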
*/ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1384,6 +1453,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1409,9 +1479,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * cur was not handed off to task_numa_assign() as the best task, so + * the reference taken above is no longer needed. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -1466,7 +1543,7 @@ static int task_numa_migrate(struct task_struct *p) .best_task = NULL, .best_imp = 0, - .best_cpu = -1 + .best_cpu = -1, }; struct sched_domain *sd; unsigned long taskweight, groupweight; @@ -1518,8 +1595,7 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && - nodes_weight(p->numa_group->active_nodes) > 1)) { + if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1554,12 +1630,14 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { + struct numa_group *ng = p->numa_group; + if (env.best_cpu == -1) nid = env.src_nid; else nid = env.dst_nid; - if (node_isset(nid, p->numa_group->active_nodes)) + if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) sched_setnuma(p, env.dst_nid); } @@ -1609,20 +1687,15 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find the nodes on which the workload is actively running. We do this by + * Find out how many nodes the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. - * - * The bitmask is used to make smarter decisions on when to do NUMA page - * migrations, To prevent flip-flopping, and excessive page migrations, nodes - * are added when they cause over 6/16 of the maximum number of faults, but - * only removed when they drop below 3/16. 
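The replacement 1/3 rule in numbers: with max_faults_cpu = 900, a node with 350 CPU faults counts as active (350 * 3 = 1050 > 900), while one with 250 does not (750 <= 900). A sketch with the same shape as numa_is_active_node(); names are local to this sketch:

static int node_is_active(unsigned long faults, unsigned long max_faults)
{
        return faults * 3 > max_faults;   /* ACTIVE_NODE_FRACTION == 3 */
}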
*/ -static void update_numa_active_node_mask(struct numa_group *numa_group) +static void numa_group_count_active_nodes(struct numa_group *numa_group) { unsigned long faults, max_faults = 0; - int nid; + int nid, active_nodes = 0; for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); @@ -1632,12 +1705,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); - if (!node_isset(nid, numa_group->active_nodes)) { - if (faults > max_faults * 6 / 16) - node_set(nid, numa_group->active_nodes); - } else if (faults < max_faults * 3 / 16) - node_clear(nid, numa_group->active_nodes); + if (faults * ACTIVE_NODE_FRACTION > max_faults) + active_nodes++; } + + numa_group->max_faults_cpu = max_faults; + numa_group->active_nodes = active_nodes; } /* @@ -1928,7 +2001,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { - update_numa_active_node_mask(p->numa_group); + numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_group_nid); } @@ -1972,14 +2045,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, return; atomic_set(&grp->refcount, 1); + grp->active_nodes = 1; + grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * nr_node_ids; - node_set(task_node(current), grp->active_nodes); - for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2093,6 +2166,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); int local = !!(flags & TNF_FAULT_LOCAL); + struct numa_group *ng; int priv; if (!static_branch_likely(&sched_numa_balancing)) @@ -2133,9 +2207,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - if (!priv && !local && p->numa_group && - node_isset(cpu_node, p->numa_group->active_nodes) && - node_isset(mem_node, p->numa_group->active_nodes)) + ng = p->numa_group; + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) local = 1; task_numa_placement(p); @@ -2180,6 +2255,7 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + u64 runtime = p->se.sum_exec_runtime; struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2302,6 +2378,17 @@ out: else reset_ptenuma_scan(p); up_read(&mm->mmap_sem); + + /* + * Make sure tasks use at least 32x as much time to run other code + * than they used here, to limit NUMA PTE scanning overhead to 3% max. + * Usually update_task_scan_period slows down scanning enough; on an + * overloaded system we need to limit overhead on a per task basis. 
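The 3% figure in the comment above follows directly from the 32x deferral: if one task_numa_work() pass consumed d ns of runtime, advancing node_stamp by 32d means the next pass cannot start until the task has run 32d more, so in the worst case

\[ \text{scan overhead} \;\le\; \frac{d}{d + 32d} \;=\; \frac{1}{33} \;\approx\; 3\% . \]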
+ */ + if (unlikely(p->se.sum_exec_runtime != runtime)) { + u64 diff = p->se.sum_exec_runtime - runtime; + p->node_stamp += 32 * diff; + } } /* @@ -2695,12 +2782,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then its up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task is less decayed, but giving + * the wakee more load sounds not bad. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2834,48 +2973,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -/* - * Task first catches up with cfs_rq, and then subtract - * itself from the cfs_rq (task must be off the queue now). - */ -void remove_entity_load_avg(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; - #ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ u64 last_update_time_copy; + u64 last_update_time; do { last_update_time_copy = cfs_rq->load_last_update_time_copy; smp_rmb(); last_update_time = cfs_rq->avg.last_update_time; } while (last_update_time != last_update_time_copy); -#else - last_update_time = cfs_rq->avg.last_update_time; -#endif - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); + return last_update_time; } - -/* - * Update the rq's load with the elapsed running time before entering - * idle. if the last scheduled task is not a CFS task, idle_enter will - * be the only way to update the runnable statistic. 
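The copy/smp_rmb() retry loop in cfs_rq_last_update_time() above only works if the writer publishes in the opposite order. The store side is not part of this hunk; as a hedged sketch of how it pairs up (mainline does this from the load-average update path):

        /* writer side (sketch): cfs_rq->avg.last_update_time was just
         * stored by the load-average update; publish the copy only after
         * a write barrier, so a reader that sees copy == value knows the
         * 64-bit timestamp did not change under its two 32-bit loads. */
#ifndef CONFIG_64BIT
        smp_wmb();
        cfs_rq->load_last_update_time_copy = cfs_rq->avg.last_update_time;
#endif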
- */ -void idle_enter_fair(struct rq *this_rq) +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { + return cfs_rq->avg.last_update_time; } +#endif /* - * Update the rq's load with the elapsed idle time before a task is - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will - * be the only way to update the runnable statistic. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -void idle_exit_fair(struct rq *this_rq) +void remove_entity_load_avg(struct sched_entity *se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) @@ -3020,6 +3159,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -3040,11 +3199,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); + if (schedstat_enabled()) + enqueue_sleeper(cfs_rq, se); } - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); + check_schedstat_required(); + if (schedstat_enabled()) { + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + } if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3111,19 +3274,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } -#endif - } + if (schedstat_enabled()) + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3197,7 +3349,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. 
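The schedstat_enabled() checks sprinkled through these hunks are cheap in the disabled case, assuming the gate is backed by a jump label like other scheduler toggles: the branch compiles to a NOP that is live-patched when the key flips. The pattern, as a sketch with invented names:

#include <linux/jump_label.h>
#include <linux/types.h>

/* Sketch of a schedstat_enabled()-style gate: zero-cost while off. */
static DEFINE_STATIC_KEY_FALSE(stats_key);

static inline bool stats_enabled(void)
{
        return static_branch_unlikely(&stats_key);
}

/* flipped at runtime, e.g. from a sysctl or debugfs handler: */
/*      static_branch_enable(&stats_key);   */
/*      static_branch_disable(&stats_key);  */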
*/ - update_stats_wait_end(cfs_rq, se); + if (schedstat_enabled()) + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } @@ -3210,7 +3363,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -3293,9 +3446,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); + if (schedstat_enabled()) { + check_spread(cfs_rq, prev); + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); + } + if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -4265,42 +4422,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ /* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * The exact cpuload calculated at every tick would be: + * + * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * If a cpu misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when cpu may be busy, then we have: + * + * load_n = (1 - 1/2^i)^n * load_0 + * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load * * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * load' = (1 - 1/2^i)^n * load + * + * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. + * This allows us to precompute the above in said factors, thereby allowing the + * reduction of an arbitrary n in O(log_2 n) steps. (See also + * fixed_power_int()) * * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. 
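The "mult/shift per set bit" idea the removed comment describes, and that decay_load_missed() keeps using, in runnable form. This uses row 1 of the table above, where each missed tick halves the load; the real function also short-circuits via degrade_zero_ticks[] and the idx == 1 shift fast path.

#include <stdio.h>

#define DEGRADE_SHIFT 7
/* degrade_factor row for idx == 1: factors for 1, 2, 4, ... missed ticks. */
static const unsigned char row1[DEGRADE_SHIFT + 1] = {64, 32, 8, 0, 0, 0, 0, 0};

/* Walk the set bits of `missed` and apply the precomputed factor for each
 * power-of-two chunk: O(log2 n) multiplies instead of n. */
static unsigned long decay(unsigned long load, unsigned long missed)
{
        int j = 0;

        while (missed) {
                if (missed & 1)
                        load = (load * row1[j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        /* 3 missed ticks at idx 1: bits 0 and 1, i.e. 1/2 then 1/4. */
        printf("%lu\n", decay(1024, 3));   /* 128 == 1024 / 2^3 */
        return 0;
}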
*/ #define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; + +static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 64, 32, 8, 0, 0, 0, 0, 0 }, + { 96, 72, 40, 12, 1, 0, 0, 0 }, + { 112, 98, 75, 43, 15, 1, 0, 0 }, + { 120, 112, 98, 76, 45, 16, 2, 0 } +}; /* * Update cpu_load for any missed ticks, due to tickless idle. The backlog @@ -4331,14 +4483,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) return load; } -/* +/** + * __update_cpu_load - update the rq->cpu_load[] statistics + * @this_rq: The rq to update statistics for + * @this_load: The current load + * @pending_updates: The number of missed updates + * @active: !0 for NOHZ_FULL + * * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. + * scheduler tick (TICK_NSEC). + * + * This function computes a decaying average: + * + * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load + * + * Because of NOHZ it might not get called on every tick, which creates the need + * for the @pending_updates argument. + * + * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 + * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load + * = A * (A * load[i]_n-2 + B) + B + * = A * (A * (A * load[i]_n-3 + B) + B) + B + * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B + * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B + * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B + * = (1 - 1/2^i)^n * (load[i]_0 - load) + load + * + * In the above we've assumed load_n := load, which is true for NOHZ_FULL as + * any change in load would have resulted in the tick being turned back on. + * + * For regular NOHZ, this reduces to: + * + * load[i]_n = (1 - 1/2^i)^n * load[i]_0 + * + * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra + * term. See the @active parameter. */ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) + unsigned long pending_updates, int active) { + unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; int i, scale; this_rq->nr_load_updates++; @@ -4352,6 +4536,15 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, old_load = this_rq->cpu_load[i]; old_load = decay_load_missed(old_load, pending_updates - 1, i); + if (tickless_load) { + old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); + /* + * old_load can never be a negative value because a + * decayed tickless_load cannot be greater than the + * original tickless_load. + */ + old_load += tickless_load; + } new_load = this_load; /* * Round up the averaging division if load is increasing.
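The closed form derived in the comment above is easy to verify outside the kernel. A standalone sketch in userspace C (double arithmetic standing in for the kernel's 128-point fixed point), comparing n iterative updates against (1 - 1/2^i)^n * (load_0 - load) + load:

#include <math.h>
#include <stdio.h>

int main(void)
{
	double A = 1.0 - 1.0 / 4.0;	/* i = 2, so A = 1 - 1/2^i = 3/4 */
	double load0 = 1000.0, load = 200.0;
	double B = (1.0 - A) * load;	/* B = (1/2^i) * load */
	double iter = load0;
	int n = 10;

	for (int k = 0; k < n; k++)
		iter = A * iter + B;	/* load' = A * load + B, n times */

	double closed = pow(A, n) * (load0 - load) + load;

	printf("iterative %.4f, closed %.4f\n", iter, closed);	/* both ~245.05 */
	return 0;
}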
This @@ -4374,6 +4567,25 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON +static void __update_cpu_load_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load, + int active) +{ + unsigned long pending_updates; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. + */ + __update_cpu_load(this_rq, load, pending_updates, active); + } +} + /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -4391,46 +4603,31 @@ static unsigned long weighted_cpuload(const int cpu) * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_idle_cpu_load(struct rq *this_rq) +static void update_cpu_load_idle(struct rq *this_rq) { - unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = weighted_cpuload(cpu_of(this_rq)); - unsigned long pending_updates; - /* * bail if there's load or we're actually up-to-date. */ - if (load || curr_jiffies == this_rq->last_load_update_tick) + if (weighted_cpuload(cpu_of(this_rq))) return; - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); + __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); } /* * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. */ -void update_cpu_load_nohz(void) +void update_cpu_load_nohz(int active) { struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long pending_updates; + unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; if (curr_jiffies == this_rq->last_load_update_tick) return; raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } + __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); raw_spin_unlock(&this_rq->lock); } #endif /* CONFIG_NO_HZ */ @@ -4442,10 +4639,10 @@ void update_cpu_load_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); + __update_cpu_load(this_rq, load, 1, 1); } /* @@ -5032,8 +5229,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. However, the caller only guarantees p->pi_lock is held; no - * other assumptions, including the state of rq->lock, should be made. + * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 
*/ static void migrate_task_rq_fair(struct task_struct *p) { @@ -5746,8 +5942,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); set_task_cpu(p, env->dst_cpu); } @@ -5880,8 +6076,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6327,7 +6523,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -6344,7 +6540,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -6352,7 +6549,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; } @@ -7273,8 +7473,6 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; - idle_enter_fair(this_rq); - /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -7355,10 +7553,8 @@ out: if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; - if (pulled_task) { - idle_exit_fair(this_rq); + if (pulled_task) this_rq->idle_stamp = 0; - } return pulled_task; } @@ -7737,7 +7933,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_idle_cpu_load(rq); + update_cpu_load_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8123,11 +8319,8 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) { - if (tg->se[i]) - remove_entity_load_avg(tg->se[i]); + if (tg->se) kfree(tg->se[i]); - } } kfree(tg->cfs_rq); @@ -8175,21 +8368,29 @@ err: return 0; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + struct rq *rq; + int cpu; - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; + for_each_possible_cpu(cpu) { + if (tg->se[cpu]) + remove_entity_load_avg(tg->se[cpu]); - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. 
+ */ + if (!tg->cfs_rq[cpu]->on_list) + continue; + + rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -8271,7 +8472,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02..544a7133c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -97,12 +97,6 @@ void default_idle_call(void) static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, int next_state) { - /* Fall back to the default arch idle method on errors. */ - if (next_state < 0) { - default_idle_call(); - return next_state; - } - /* * The idle task must be scheduled, it is pointless to go to idle, just * update no idle residency and return. @@ -168,7 +162,7 @@ static void cpuidle_idle_call(void) */ if (idle_should_freeze()) { entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state >= 0) { + if (entered_state > 0) { local_irq_enable(); goto exit_idle; } @@ -219,6 +213,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fd..47ce94931 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { - idle_exit_fair(rq); rq_last_tick_reset(rq); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0..a774b4dbf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; - hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + /* + * SCHED_DEADLINE updates the bandwidth, as a runaway + * RT task with a DL task could hog a CPU. But DL does + * not reset the period. If a deadline task was running + * without an RT task running, it can cause RT tasks to + * throttle when they start up. Kick the timer right away + * to update the period.
+ */ + hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); } raw_spin_unlock(&rt_b->rt_runtime_lock); @@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -1166,7 +1174,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1179,26 +1210,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. 
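The SAVE/MOVE convention that move_entity() encodes is small enough to check exhaustively. A userspace sketch, assuming the flag values this patch adds to sched.h later in the diff (DEQUEUE_SAVE = 0x02, DEQUEUE_MOVE = 0x04):

#include <stdbool.h>
#include <stdio.h>

#define DEQUEUE_SAVE	0x02
#define DEQUEUE_MOVE	0x04

static bool move_entity(unsigned int flags)
{
	/* only SAVE without MOVE leaves the entity's list position alone */
	return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE;
}

int main(void)
{
	printf("%d\n", move_entity(0));					/* 1: ordinary (de)queue */
	printf("%d\n", move_entity(DEQUEUE_SAVE));			/* 0: save in place */
	printf("%d\n", move_entity(DEQUEUE_SAVE | DEQUEUE_MOVE));	/* 1: save and relocate */
	return 0;
}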
*/ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1207,7 +1249,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1220,31 +1262,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1260,7 +1302,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1271,7 +1313,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b242775bf..ef5875fff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> +#include <linux/binfmts.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include 
<linux/stop_machine.h> @@ -248,7 +249,12 @@ struct task_group { unsigned long shares; #ifdef CONFIG_SMP - atomic_long_t load_avg; + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; #endif #endif @@ -308,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); @@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -896,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + #else static inline void sched_ttwu_pending(void) { } @@ -933,6 +958,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif @@ -1008,6 +1034,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; static inline u64 global_rt_period(void) { @@ -1076,7 +1103,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * In particular, the load of prev->state in finish_task_switch() must * happen before this. * - * Pairs with the control dependency and rmb in try_to_wake_up(). + * Pairs with the smp_cond_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif @@ -1113,59 +1140,43 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WEIGHT_IDLEPRIO 3 #define WMULT_IDLEPRIO 1431655765 -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. 
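The "~10% less CPU time" rule in the comment above can be checked directly against the table entries for nice 0 and nice 1 (1024 and 820). A quick userspace computation:

#include <stdio.h>

int main(void)
{
	int w0 = 1024, w1 = 820;	/* weights for nice 0 and nice 1 */

	printf("nice 0 share: %.1f%%\n", 100.0 * w0 / (w0 + w1));	/* ~55.5% */
	printf("nice 1 share: %.1f%%\n", 100.0 * w1 / (w0 + w1));	/* ~44.5% */
	printf("weight ratio: %.2f\n", (double)w0 / w1);		/* ~1.25 */
	return 0;
}

Moving one of two otherwise equal tasks from nice 0 to nice 1 drops its share from 50% to roughly 44.5%, i.e. about 10% relative, which is the effect the comment continues to describe.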
- * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; /* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) @@ -1252,16 +1263,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_enter_fair(struct rq *this_rq); -extern void idle_exit_fair(struct rq *this_rq); - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -#else - -static inline void idle_enter_fair(struct rq *rq) { } -static inline void idle_exit_fair(struct rq *rq) { } - #endif #ifdef CONFIG_CPU_IDLE diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632..70b3b6a20 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define 
schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -# define schedstat_set(var, val) do { var = (val); } while (0) +# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) +# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) +# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} +# define schedstat_enabled() 0 # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 000000000..82f0dff90 --- /dev/null +++ b/kernel/sched/swait.c @@ -0,0 +1,123 @@ +#include <linux/sched.h> +#include <linux/swait.h> + +void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key) +{ + raw_spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} +EXPORT_SYMBOL(__init_swait_queue_head); + +/* + * The thing about the wake_up_state() return value: I think we can ignore it. + * + * If for some reason it would return 0, that means the previously waiting + * task is already running, so it will observe condition true (or has already). + */ +void swake_up_locked(struct swait_queue_head *q) +{ + struct swait_queue *curr; + + if (list_empty(&q->task_list)) + return; + + curr = list_first_entry(&q->task_list, typeof(*curr), task_list); + wake_up_process(curr->task); + list_del_init(&curr->task_list); +} +EXPORT_SYMBOL(swake_up_locked); + +void swake_up(struct swait_queue_head *q) +{ + unsigned long flags; + + if (!swait_active(q)) + return; + + raw_spin_lock_irqsave(&q->lock, flags); + swake_up_locked(q); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(swake_up); + +/* + * Does not allow usage from IRQ-disabled context, since we must be able to + * release IRQs to guarantee a bounded hold time.
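For orientation, a hedged usage sketch of this new simple wait-queue API, using the declaration macros from <linux/swait.h>; the my_* names and the condition flag are illustrative, not part of this patch:

static DECLARE_SWAIT_QUEUE_HEAD(my_swq);
static bool my_cond;

static void my_waiter(void)
{
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		prepare_to_swait(&my_swq, &wait, TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(my_cond))
			break;
		schedule();
	}
	finish_swait(&my_swq, &wait);
}

static void my_waker(void)
{
	WRITE_ONCE(my_cond, true);
	swake_up(&my_swq);	/* wakes at most one waiter */
}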
+ */ +void swake_up_all(struct swait_queue_head *q) +{ + struct swait_queue *curr; + LIST_HEAD(tmp); + + if (!swait_active(q)) + return; + + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { + curr = list_first_entry(&tmp, typeof(*curr), task_list); + + wake_up_state(curr->task, TASK_NORMAL); + list_del_init(&curr->task_list); + + if (list_empty(&tmp)) + break; + + raw_spin_unlock_irq(&q->lock); + raw_spin_lock_irq(&q->lock); + } + raw_spin_unlock_irq(&q->lock); +} +EXPORT_SYMBOL(swake_up_all); + +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + wait->task = current; + if (list_empty(&wait->task_list)) + list_add(&wait->task_list, &q->task_list); +} + +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&q->lock, flags); + __prepare_to_swait(q, wait); + set_current_state(state); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_swait); + +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + prepare_to_swait(q, wait, state); + + return 0; +} +EXPORT_SYMBOL(prepare_to_swait_event); + +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); +} + +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait->task_list)) { + raw_spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + raw_spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_swait); diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a97..0508544c8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!signal_pending(current)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } set_restore_sigmask(); return -ERESTARTNOHAND; } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 28c8e736d..d264f59bf 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -174,7 +174,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) if (tsk) return 0; - td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu)); + td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); if (!td) return -ENOMEM; td->cpu = cpu; diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f..8aae49dd7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) if (preempt_count() == cnt) { #ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); + current->preempt_disable_ip = get_lock_parent_ip(); #endif - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); } } EXPORT_SYMBOL(__local_bh_disable_ip); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a3bbaee77..a467e6c28 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -28,7 +28,6 @@ */ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ - bool executed; /* 
actually executed? */ int ret; /* collected return value */ struct completion completion; /* fired if nr_todo reaches 0 */ }; @@ -63,14 +62,10 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) } /* signal completion unless @done is NULL */ -static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) +static void cpu_stop_signal_done(struct cpu_stop_done *done) { - if (done) { - if (executed) - done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) - complete(&done->completion); - } + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, @@ -81,17 +76,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper, } /* queue @work to @stopper. if offline, @work is completed immediately */ -static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) +static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); unsigned long flags; + bool enabled; spin_lock_irqsave(&stopper->lock, flags); - if (stopper->enabled) + enabled = stopper->enabled; + if (enabled) __cpu_stop_queue_work(stopper, work); - else - cpu_stop_signal_done(work->done, false); + else if (work->done) + cpu_stop_signal_done(work->done); spin_unlock_irqrestore(&stopper->lock, flags); + + return enabled; } /** @@ -124,9 +123,10 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; cpu_stop_init_done(&done, 1); - cpu_stop_queue_work(cpu, &work); + if (!cpu_stop_queue_work(cpu, &work)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; + return done.ret; } /* This controls the threads on each CPU. */ @@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * struct cpu_stop_work work1, work2; struct multi_stop_data msdata; - preempt_disable(); msdata = (struct multi_stop_data){ .fn = fn, .data = arg, @@ -277,16 +276,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * if (cpu1 > cpu2) swap(cpu1, cpu2); - if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { - preempt_enable(); + if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) return -ENOENT; - } - - preempt_enable(); wait_for_completion(&done.completion); - - return done.executed ? done.ret : -ENOENT; + return done.ret; } /** @@ -302,23 +296,28 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * * CONTEXT: * Don't care. + * + * RETURNS: + * true if cpu_stop_work was queued successfully and @fn will be called, + * false otherwise. 
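Given the RETURNS contract above, fire-and-forget callers can now detect an offline stopper instead of silently losing work. A sketch of a caller honoring the new return value (kick_cpu, kick_fn and the static work buffer are hypothetical names):

static struct cpu_stop_work kick_work;	/* must stay valid until the callback runs */

static int kick_cpu(unsigned int cpu, cpu_stop_fn_t kick_fn, void *arg)
{
	if (!stop_one_cpu_nowait(cpu, kick_fn, arg, &kick_work))
		return -ENOENT;	/* stopper offline: kick_fn will never run */
	return 0;
}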
*/ -void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, +bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; - cpu_stop_queue_work(cpu, work_buf); + return cpu_stop_queue_work(cpu, work_buf); } /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static void queue_stop_cpus_work(const struct cpumask *cpumask, +static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) { struct cpu_stop_work *work; unsigned int cpu; + bool queued = false; /* * Disable preemption while queueing to avoid getting @@ -331,9 +330,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, work->fn = fn; work->arg = arg; work->done = done; - cpu_stop_queue_work(cpu, work); + if (cpu_stop_queue_work(cpu, work)) + queued = true; } lg_global_unlock(&stop_cpus_lock); + + return queued; } static int __stop_cpus(const struct cpumask *cpumask, @@ -342,9 +344,10 @@ static int __stop_cpus(const struct cpumask *cpumask, struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done); + if (!queue_stop_cpus_work(cpumask, fn, arg, &done)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; + return done.ret; } /** @@ -432,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct cpu_stop_work *work; - int ret; repeat: work = NULL; @@ -448,23 +450,19 @@ repeat: cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN] __maybe_unused; - - /* cpu stop callbacks are not allowed to sleep */ - preempt_disable(); + int ret; + /* cpu stop callbacks must not sleep, make in_atomic() == T */ + preempt_count_inc(); ret = fn(arg); - if (ret) - done->ret = ret; - - /* restore preemption and check it's still balanced */ - preempt_enable(); + if (done) { + if (ret) + done->ret = ret; + cpu_stop_signal_done(done); + } + preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %s(%p) leaked preempt count\n", - kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, - ksym_buf), arg); - - cpu_stop_signal_done(done, true); + "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); goto repeat; } } @@ -531,8 +529,6 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) - static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { @@ -630,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, mutex_unlock(&stop_cpus_mutex); return ret ?: done.ret; } - -#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0623787ec..2c5e3a8e0 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid); cond_syscall(sys_setfsgid); cond_syscall(sys_capget); cond_syscall(sys_capset); +cond_syscall(sys_copy_file_range); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d66..f5102fabe 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,7 +173,7 @@ extern int no_unaligned_warning; #define SYSCTL_WRITES_WARN 0 #define SYSCTL_WRITES_STRICT 1 -static int sysctl_writes_strict = 
SYSCTL_WRITES_WARN; +static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_latencytop, }, #endif #ifdef CONFIG_BLK_DEV_INITRD @@ -1568,6 +1579,28 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif { } }; @@ -1735,6 +1768,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; @@ -2047,9 +2094,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, void *data) { int *i, vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2078,15 +2124,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first=0) { @@ -2094,11 +2134,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, bool neg; if (write) { - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); if (!left) break; - err = proc_get_long(&kbuf, &left, &lval, &neg, + err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2125,10 +2165,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err && left) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? 
: -EINVAL; } @@ -2310,9 +2349,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2340,15 +2378,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first = 0) { @@ -2357,9 +2389,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (write) { bool neg; - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); - err = proc_get_long(&kbuf, &left, &val, &neg, + err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2385,10 +2417,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2650,34 +2681,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - unsigned long page = 0; - char *kbuf; + char *kbuf, *p; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - free_page(page); - return -EFAULT; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), GFP_KERNEL); if (!tmp_bitmap) { - free_page(page); + kfree(kbuf); return -ENOMEM; } - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; bool neg; - err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); if (err) break; @@ -2688,12 +2712,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, val_b = val_a; if (left) { - kbuf++; + p++; left--; } if (c == '-') { - err = proc_get_long(&kbuf, &left, &val_b, + err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); if (err) @@ -2704,16 +2728,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, break; } if (left) { - kbuf++; + p++; left--; } } bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); } - free_page(page); + kfree(kbuf); } else { unsigned long bit_a, bit_b = 0; diff --git a/kernel/task_work.c b/kernel/task_work.c index bce3211e7..53fa971d0 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -118,4 +118,3 @@ void task_work_run(void) } while (work); } } -EXPORT_SYMBOL_GPL(task_work_run); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 7fbba635a..e840ed867 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -271,11 +271,27 @@ static int alarmtimer_suspend(struct device *dev) __pm_wakeup_event(ws, 
MSEC_PER_SEC); return ret; } + +static int alarmtimer_resume(struct device *dev) +{ + struct rtc_device *rtc; + + rtc = alarmtimer_get_rtcdev(); + if (rtc) + rtc_timer_cancel(rtc, &rtctimer); + return 0; +} + #else static int alarmtimer_suspend(struct device *dev) { return 0; } + +static int alarmtimer_resume(struct device *dev) +{ + return 0; +} #endif static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) @@ -800,6 +816,7 @@ out: /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { .suspend = alarmtimer_suspend, + .resume = alarmtimer_resume, }; static struct platform_driver alarmtimer_driver = { diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1347882d1..664de5392 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -218,8 +218,8 @@ static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. */ if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { - pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", - cs->name); + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", + smp_processor_id(), cs->name); pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wdnow, wdlast, watchdog->mask); pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 149cc8086..6df8927c5 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,8 +16,11 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/rtc.h> +#include <linux/math64.h> #include "ntp_internal.h" +#include "timekeeping_internal.h" + /* * NTP timekeeping variables: @@ -70,7 +73,7 @@ static long time_esterror = NTP_PHASE_LIMIT; static s64 time_freq; /* time at last adjustment (secs): */ -static long time_reftime; +static time64_t time_reftime; static long time_adjust; @@ -297,25 +300,27 @@ static void ntp_update_offset(long offset) if (!(time_status & STA_PLL)) return; - if (!(time_status & STA_NANO)) + if (!(time_status & STA_NANO)) { + /* Make sure the multiplication below won't overflow */ + offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); offset *= NSEC_PER_USEC; + } /* * Scale the phase adjustment and * clamp to the operating range. */ - offset = min(offset, MAXPHASE); - offset = max(offset, -MAXPHASE); + offset = clamp(offset, -MAXPHASE, MAXPHASE); /* * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - secs = get_seconds() - time_reftime; + secs = (long)(__ktime_get_real_seconds() - time_reftime); if (unlikely(time_status & STA_FREQHOLD)) secs = 0; - time_reftime = get_seconds(); + time_reftime = __ktime_get_real_seconds(); offset64 = offset; freq_adj = ntp_update_offset_fll(offset64, secs); @@ -390,10 +395,11 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(unsigned long secs) +int second_overflow(time64_t secs) { s64 delta; int leap = 0; + s32 rem; /* * Leap second processing. 
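The clamp added to ntp_update_offset() above protects the microsecond-to-nanosecond scaling that follows it. A worked illustration in kernel style, assuming a 32-bit long:

long offset = 3000000;	/* 3 s, expressed in microseconds */

/*
 * Unclamped, offset * NSEC_PER_USEC would be 3,000,000,000, which
 * overflows a 32-bit long (LONG_MAX is 2,147,483,647) before the
 * later MAXPHASE clamp ever sees the value.
 */
offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC);	/* now within +-1000000 */
offset *= NSEC_PER_USEC;	/* at most 1e9 ns, which always fits */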
If in leap-insert state at the end of the @@ -404,19 +410,19 @@ int second_overflow(unsigned long secs) case TIME_OK: if (time_status & STA_INS) { time_state = TIME_INS; - ntp_next_leap_sec = secs + SECS_PER_DAY - - (secs % SECS_PER_DAY); + div_s64_rem(secs, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } else if (time_status & STA_DEL) { time_state = TIME_DEL; - ntp_next_leap_sec = secs + SECS_PER_DAY - - ((secs+1) % SECS_PER_DAY); + div_s64_rem(secs + 1, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } break; case TIME_INS: if (!(time_status & STA_INS)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - } else if (secs % SECS_PER_DAY == 0) { + } else if (secs == ntp_next_leap_sec) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE @@ -427,7 +433,7 @@ int second_overflow(unsigned long secs) if (!(time_status & STA_DEL)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - } else if ((secs + 1) % SECS_PER_DAY == 0) { + } else if (secs == ntp_next_leap_sec) { leap = 1; ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; @@ -590,7 +596,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) * reference time to current time. */ if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = get_seconds(); + time_reftime = __ktime_get_real_seconds(); /* only set allowed bits */ time_status &= STA_RONLY; @@ -674,8 +680,24 @@ int ntp_validate_timex(struct timex *txc) return -EINVAL; } - if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) - return -EPERM; + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } + } /* * Check for potential multiplication overflows that can diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index af924470e..d8a7c11fa 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -6,7 +6,7 @@ extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(unsigned long secs); +extern int second_overflow(time64_t secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec64 *, const struct timespec64 *); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 22c57e191..0b1742434 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -36,16 +36,17 @@ */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); -/* - * The time, when the last jiffy update happened. Protected by jiffies_lock. - */ -static ktime_t last_jiffies_update; - struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + /* * Must be called with interrupts disabled ! 
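The div_s64_rem() form used in second_overflow() above also sidesteps a direct 64-bit modulo, which 32-bit kernels cannot emit without libgcc helpers now that secs is time64_t. A worked example with hypothetical values:

s32 rem;
time64_t secs = 1435708800 + 12345;	/* 12345 s into 2015-07-01 (UTC) */

div_s64_rem(secs, 86400, &rem);		/* SECS_PER_DAY = 86400, so rem = 12345 */
/* ntp_next_leap_sec = secs + 86400 - rem, the midnight ending that UTC day */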
*/ @@ -143,7 +144,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; } @@ -151,6 +152,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } +#endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; @@ -387,7 +389,7 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_enabled __read_mostly = 1; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode @@ -430,7 +432,7 @@ static void tick_nohz_update_jiffies(ktime_t now) tick_do_update_jiffies64(now); local_irq_restore(flags); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } /* @@ -603,15 +605,31 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* * If the tick is due in the next period, keep it ticking or - * restart it proper. + * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { tick.tv64 = 0; + /* + * We've not stopped the tick yet, and there's a timer in the + * next period, so no point in stopping it either, bail. + */ if (!ts->tick_stopped) goto out; + + /* + * If, OTOH, we did stop it, but there's a pending (expired) + * timer reprogram the timer hardware to fire now. + * + * We will not restart the tick proper, just prod the timer + * hardware into firing an interrupt to process the pending + * timers. Just like tick_irq_exit() will not restart the tick + * for 'normal' interrupts. + * + * Only once we exit the idle loop will we re-enable the tick, + * see tick_nohz_idle_exit(). + */ if (delta == 0) { - /* Tick is stopped, but required now. Enforce it */ tick_nohz_restart(ts, now); goto out; } @@ -694,14 +712,14 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(); + update_cpu_load_nohz(active); calc_load_exit_idle(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ @@ -725,7 +743,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (can_stop_full_tick()) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get()); + tick_nohz_restart_sched_tick(ts, ktime_get(), 1); #endif } @@ -875,7 +893,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) return; /* * We stopped the tick in idle. 
Update process times would miss the @@ -916,7 +934,7 @@ void tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); + tick_nohz_restart_sched_tick(ts, now, 0); tick_nohz_account_idle_ticks(ts); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 99188ee5d..34b4cedfa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -845,6 +845,19 @@ time64_t ktime_get_real_seconds(void) } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); +/** + * __ktime_get_real_seconds - The same as ktime_get_real_seconds, + * but without the sequence counter protection. This internal function + * must only be called when the timekeeping lock is already held. + */ +time64_t __ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return tk->xtime_sec; +} + + #ifdef CONFIG_NTP_PPS /** @@ -958,7 +971,7 @@ int timekeeping_inject_offset(struct timespec *ts) struct timespec64 ts64, tmp; int ret = 0; - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + if (!timespec_inject_offset_valid(ts)) return -EINVAL; ts64 = timespec_to_timespec64(*ts); @@ -1591,9 +1604,12 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, { s64 interval = tk->cycle_interval; s64 xinterval = tk->xtime_interval; + u32 base = tk->tkr_mono.clock->mult; + u32 max = tk->tkr_mono.clock->maxadj; + u32 cur_adj = tk->tkr_mono.mult; s64 tick_error; bool negative; - u32 adj; + u32 adj_scale; /* Remove any current error adj from freq calculation */ if (tk->ntp_err_mult) @@ -1612,13 +1628,33 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, /* preserve the direction of correction */ negative = (tick_error < 0); - /* Sort out the magnitude of the correction */ + /* If any adjustment would pass the max, just return */ + if (negative && (cur_adj - 1) <= (base - max)) + return; + if (!negative && (cur_adj + 1) >= (base + max)) + return; + /* + * Sort out the magnitude of the correction, but + * avoid making so large a correction that we go + * over the max adjustment. + */ + adj_scale = 0; tick_error = abs(tick_error); - for (adj = 0; tick_error > interval; adj++) + while (tick_error > interval) { + u32 adj = 1 << (adj_scale + 1); + + /* Check if adjustment gets us within 1 unit from the max */ + if (negative && (cur_adj - adj) <= (base - max)) + break; + if (!negative && (cur_adj + adj) >= (base + max)) + break; + + adj_scale++; tick_error >>= 1; + } /* scale the corrections */ - timekeeping_apply_adjustment(tk, offset, negative, adj); + timekeeping_apply_adjustment(tk, offset, negative, adj_scale); } /* diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 4ea005a7f..5be76270e 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -17,7 +17,11 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) { cycle_t ret = (now - last) & mask; - return (s64) ret > 0 ? ret : 0; + /* + * Prevent time going backwards by checking the MSB of mask in + * the result. If set, return 0. + */ + return ret & ~(mask >> 1) ?
0 : ret; } #else static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) @@ -26,4 +30,6 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) } #endif +extern time64_t __ktime_get_real_seconds(void); + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a990824c8..2aeb6ffc0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -349,16 +349,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, if (count >= BLK_TN_MAX_MSG) return -EINVAL; - msg = kmalloc(count + 1, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } + msg = memdup_user_nul(buffer, count); + if (IS_ERR(msg)) + return PTR_ERR(msg); - msg[count] = '\0'; bt = filp->private_data; __trace_note_message(bt, "%s", msg); kfree(msg); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4228fd368..326a75e88 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct perf_event *event; + struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (!event) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct perf_event *event; + struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (unlikely(!event)) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; @@ -316,7 +322,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -static struct bpf_verifier_ops kprobe_prog_ops = { +static const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f743b147..57a6eea84 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,8 +62,6 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) - #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ @@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; -static struct 
ftrace_ops control_ops; - -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -203,7 +196,7 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static void control_ops_disable_all(struct ftrace_ops *ops) +static void per_cpu_ops_disable_all(struct ftrace_ops *ops) { int cpu; @@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops) *per_cpu_ptr(ops->disabled, cpu) = 1; } -static int control_ops_alloc(struct ftrace_ops *ops) +static int per_cpu_ops_alloc(struct ftrace_ops *ops) { int __percpu *disabled; + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) + return -EINVAL; + disabled = alloc_percpu(int); if (!disabled) return -ENOMEM; ops->disabled = disabled; - control_ops_disable_all(ops); + per_cpu_ops_disable_all(ops); return 0; } @@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { } static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* - * If this is a dynamic ops or we force list func, + * If this is a dynamic, RCU, or per CPU ops, or we force list func, * then it needs to call the list anyway. */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU | + FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC) return ftrace_ops_list_func; return ftrace_ops_get_func(ops); @@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } -static void add_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int first = *list == &ftrace_list_end; - add_ftrace_ops(list, ops); - if (first) - add_ftrace_ops(&ftrace_ops_list, main_ops); -} - -static int remove_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int ret = remove_ftrace_ops(list, ops); - if (!ret && *list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); - return ret; -} - static void ftrace_update_trampoline(struct ftrace_ops *ops); static int __register_ftrace_function(struct ftrace_ops *ops) @@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - if (control_ops_alloc(ops)) + if (ops->flags & FTRACE_OPS_FL_PER_CPU) { + if (per_cpu_ops_alloc(ops)) return -ENOMEM; - add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); - /* The control_ops needs the trampoline update */ - ops = &control_ops; - } else - add_ftrace_ops(&ftrace_ops_list, ops); + } + + add_ftrace_ops(&ftrace_ops_list, ops); /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; @@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - ret = remove_ftrace_list_ops(&ftrace_control_list, - &control_ops, ops); - } else - ret = remove_ftrace_ops(&ftrace_ops_list, ops); + ret = remove_ftrace_ops(&ftrace_ops_list, ops); if (ret < 0) return ret; @@ -1687,6 +1658,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int in_hash = 0; int match = 0; + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (all) { /* * 
Only the filter_hash affects all records. @@ -1940,7 +1914,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); } -static void print_ip_ins(const char *fmt, unsigned char *p) +static void print_ip_ins(const char *fmt, const unsigned char *p) { int i; @@ -1952,6 +1926,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); + +enum ftrace_bug_type ftrace_bug_type; +const void *ftrace_expected; + +static void print_bug_type(void) +{ + switch (ftrace_bug_type) { + case FTRACE_BUG_UNKNOWN: + break; + case FTRACE_BUG_INIT: + pr_info("Initializing ftrace call sites\n"); + break; + case FTRACE_BUG_NOP: + pr_info("Setting ftrace call site to NOP\n"); + break; + case FTRACE_BUG_CALL: + pr_info("Setting ftrace call site to call ftrace function\n"); + break; + case FTRACE_BUG_UPDATE: + pr_info("Updating ftrace call site to call a different ftrace function\n"); + break; + } +} /** * ftrace_bug - report and shutdown function tracer @@ -1979,8 +1978,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); - print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); + if (ftrace_expected) { + print_ip_ins(" expected: ", ftrace_expected); + pr_cont("\n"); + } break; case -EPERM: FTRACE_WARN_ON_ONCE(1); @@ -1992,6 +1995,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } + print_bug_type(); if (rec) { struct ftrace_ops *ops = NULL; @@ -2000,15 +2004,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) rec->flags & FTRACE_FL_REGS ? " R" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - pr_cont("\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + pr_cont("\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else pr_cont("\ttramp: ERROR!"); } ip = ftrace_get_addr_curr(rec); - pr_cont(" expected tramp: %lx\n", ip); + pr_cont("\n expected tramp: %lx\n", ip); } } @@ -2016,6 +2024,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { unsigned long flag = 0UL; + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + + if (rec->flags & FTRACE_FL_DISABLED) + return FTRACE_UPDATE_IGNORE; + /* * If we are updating calls: * @@ -2077,9 +2090,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * from the save regs, to a non-save regs function or * vice versa, or from a trampoline call. 
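
How ftrace_check_record() resolves to an action after this change, distilled as a reading aid (a summary comment, not kernel code):

/*
 *   rec marked FTRACE_FL_DISABLED        -> FTRACE_UPDATE_IGNORE
 *   should trace, currently a nop        -> FTRACE_UPDATE_MAKE_CALL
 *                                           (ftrace_bug_type = FTRACE_BUG_CALL)
 *   already calls, but the wrong variant
 *   (save-regs / trampoline mismatch)    -> FTRACE_UPDATE_MODIFY_CALL
 *                                           (ftrace_bug_type = FTRACE_BUG_UPDATE)
 *   should stop tracing, currently calls -> FTRACE_UPDATE_MAKE_NOP
 *                                           (ftrace_bug_type = FTRACE_BUG_NOP)
 *   nothing to change                    -> FTRACE_UPDATE_IGNORE
 */
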
*/ - if (flag & FTRACE_FL_ENABLED) + if (flag & FTRACE_FL_ENABLED) { + ftrace_bug_type = FTRACE_BUG_CALL; return FTRACE_UPDATE_MAKE_CALL; + } + ftrace_bug_type = FTRACE_BUG_UPDATE; return FTRACE_UPDATE_MODIFY_CALL; } @@ -2096,6 +2112,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) FTRACE_FL_REGS_EN); } + ftrace_bug_type = FTRACE_BUG_NOP; return FTRACE_UPDATE_MAKE_NOP; } @@ -2145,6 +2162,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) } static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, + struct ftrace_ops *op) +{ + unsigned long ip = rec->ip; + + while_for_each_ftrace_op(op) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } + + return NULL; +} + +static struct ftrace_ops * ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) { struct ftrace_ops *op; @@ -2307,17 +2342,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; case FTRACE_UPDATE_MAKE_CALL: + ftrace_bug_type = FTRACE_BUG_CALL; return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: + ftrace_bug_type = FTRACE_BUG_NOP; return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: + ftrace_bug_type = FTRACE_BUG_UPDATE; return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } @@ -2425,6 +2465,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { + ftrace_bug_type = FTRACE_BUG_INIT; ftrace_bug(ret, rec); return 0; } @@ -2566,7 +2607,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } -static void control_ops_free(struct ftrace_ops *ops) +static void per_cpu_ops_free(struct ftrace_ops *ops) { free_percpu(ops->disabled); } @@ -2667,13 +2708,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are control ops, they still need their + * If these are per_cpu ops, they still need their * per_cpu field freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); return 0; } @@ -2714,7 +2755,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. - * The same goes for freeing the per_cpu data of the control + * The same goes for freeing the per_cpu data of the per_cpu * ops. * * Again, normal synchronize_sched() is not good enough. @@ -2725,13 +2766,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. 
*/ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { schedule_on_each_cpu(ftrace_sync); arch_ftrace_trampoline_free(ops); - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); } return 0; @@ -2798,9 +2839,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) return 0; - /* If ops traces all mods, we already accounted for it */ + /* If ops traces all then it includes this function */ if (ops_traces_mod(ops)) - return 0; + return 1; /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && @@ -2814,64 +2855,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 1; } -static int referenced_filters(struct dyn_ftrace *rec) -{ - struct ftrace_ops *ops; - int cnt = 0; - - for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) - cnt++; - } - - return cnt; -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long update_cnt = 0; - unsigned long ref = 0; - bool test = false; + unsigned long rec_flags = 0; int i; + start = ftrace_now(raw_smp_processor_id()); + /* - * When adding a module, we need to check if tracers are - * currently enabled and if they are set to trace all functions. - * If they are, we need to enable the module functions as well - * as update the reference counts for those function records. + * When a module is loaded, this function is called to convert + * the calls to mcount in its text to nops, and also to create + * an entry in the ftrace data. Now, if ftrace is activated + * after this call, but before the module sets its text to + * read-only, the modification of enabling ftrace can fail if + * the read-only is done while ftrace is converting the calls. + * To prevent this, the module's records are set as disabled + * and will be enabled after the call to set the module's text + * to read-only. */ - if (mod) { - struct ftrace_ops *ops; - - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED) { - if (ops_traces_mod(ops)) - ref++; - else - test = true; - } - } - } - - start = ftrace_now(raw_smp_processor_id()); + if (mod) + rec_flags |= FTRACE_FL_DISABLED; for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { - int cnt = ref; /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; p = &pg->records[i]; - if (test) - cnt += referenced_filters(p); - p->flags = cnt; + p->flags = rec_flags; /* * Do the initial record conversion from mcount jump @@ -2881,21 +2899,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) break; update_cnt++; - - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. 
- */ - if (ftrace_start_up && cnt) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p); - } } } @@ -3258,7 +3261,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) { - struct ftrace_ops *ops = NULL; + struct ftrace_ops *ops; seq_printf(m, " (%ld)%s%s", ftrace_rec_count(rec), @@ -3266,14 +3269,19 @@ static int t_show(struct seq_file *m, void *v) rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - seq_printf(m, "\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + seq_printf(m, "\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + add_trampoline_func(m, ops, rec); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else seq_puts(m, "\ttramp: ERROR!"); - + } else { + add_trampoline_func(m, NULL, rec); } - add_trampoline_func(m, ops, rec); } seq_putc(m, '\n'); @@ -4898,6 +4906,19 @@ static int ftrace_process_locs(struct module *mod, #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; +} + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; @@ -4940,44 +4961,85 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +void ftrace_module_enable(struct module *mod) { - if (ftrace_disabled || start == end) - return; - ftrace_process_locs(mod, start, end); -} + struct dyn_ftrace *rec; + struct ftrace_page *pg; -void ftrace_module_init(struct module *mod) -{ - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); -} + mutex_lock(&ftrace_lock); -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; + if (ftrace_disabled) + goto out_unlock; - if (val == MODULE_STATE_GOING) - ftrace_release_mod(mod); + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + * + * We also delay this to after the module code already set the + * text to read-only, as we now need to set it back to read-write + * so that we can modify the text. + */ + if (ftrace_start_up) + ftrace_arch_code_modify_prepare(); - return 0; + do_for_each_ftrace_rec(pg, rec) { + int cnt; + /* + * do_for_each_ftrace_rec() is a double loop. + * module text shares the pg. If a record is + * not part of this module, then skip this pg, + * which the "break" will do. + */ + if (!within_module_core(rec->ip, mod)) + break; + + cnt = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are, and can trace this record, + * we need to enable the module functions as well as update the + * reference counts for those function records. 
+ */ + if (ftrace_start_up) + cnt += referenced_filters(rec); + + /* This clears FTRACE_FL_DISABLED */ + rec->flags = cnt; + + if (ftrace_start_up && cnt) { + int failed = __ftrace_replace_code(rec, 1); + if (failed) { + ftrace_bug(failed, rec); + goto out_loop; + } + } + + } while_for_each_ftrace_rec(); + + out_loop: + if (ftrace_start_up) + ftrace_arch_code_modify_post_process(); + + out_unlock: + mutex_unlock(&ftrace_lock); } -#else -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) + +void ftrace_module_init(struct module *mod) { - return 0; + if (ftrace_disabled || !mod->num_ftrace_callsites) + return; + + ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_exit_nb = { - .notifier_call = ftrace_module_notify_exit, - .priority = INT_MIN, /* Run after anything that can remove kprobes */ -}; - void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5006,10 +5068,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_exit_nb); - if (ret) - pr_warning("Failed to register trace ftrace module exit notifier\n"); - set_ftrace_early_filters(); return; @@ -5116,44 +5174,6 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static void -ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) - return; - - /* - * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). - */ - preempt_disable_notrace(); - trace_recursion_set(TRACE_CONTROL_BIT); - - /* - * Control funcs (perf) uses RCU. Only trace if - * RCU is currently active. - */ - if (!rcu_is_watching()) - goto out; - - do_for_each_ftrace_op(op, ftrace_control_list) { - if (!(op->flags & FTRACE_OPS_FL_STUB) && - !ftrace_function_local_disabled(op) && - ftrace_ops_test(op, ip, regs)) - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - out: - trace_recursion_clear(TRACE_CONTROL_BIT); - preempt_enable_notrace(); -} - -static struct ftrace_ops control_ops = { - .func = ftrace_ops_control_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(control_ops) -}; - static inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) @@ -5170,8 +5190,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); + do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) { + /* + * Check the following for each ops before calling their func: + * if RCU flag is set, then rcu_is_watching() must be true + * if PER_CPU is set, then ftrace_function_local_disable() + * must be false + * Otherwise test if the ip matches the ops filter + * + * If any of the above fails then the op->func() is not executed. + */ + if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) && + (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) && + ftrace_ops_test(op, ip, regs)) { + if (FTRACE_WARN_ON(!op->func)) { pr_warn("op=%p %pS\n", op, op); goto out; @@ -5195,7 +5229,7 @@ out: * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. 
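
Taken together, the module hunks change when a module's records go live. A condensed, hypothetical timeline (the module-loader call sites are not part of this diff; ordering inferred from the comments above):

/*
 * load_module()
 *   ftrace_module_init(mod);    // mcount sites -> NOP, FTRACE_FL_DISABLED
 *   ...module loader maps the module text read-only...
 *   ftrace_module_enable(mod);  // recount references, clear DISABLED,
 *                               // patch enabled records live
 *
 * module unload or failed load
 *   ftrace_release_mod(mod);    // now called directly; the exit
 *                               // notifier above is removed
 */
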
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still - * set the ARCH_SUPPORT_FTARCE_OPS. + * set the ARCH_SUPPORTS_FTRACE_OPS. */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -5212,20 +5246,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) /* * If there's only one function registered but it does not support - * recursion, this function will be called by the mcount trampoline. - * This function will handle recursion protection. + * recursion, needs RCU protection and/or requires per cpu handling, then + * this function will be called by the mcount trampoline. */ -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, +static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { int bit; + if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) + return; + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; - op->func(ip, parent_ip, op, regs); + preempt_disable_notrace(); + if (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) { + op->func(ip, parent_ip, op, regs); + } + + preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -5243,12 +5286,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the func handles its own recursion, call it directly. - * Otherwise call the recursion protected function that - * will call the ftrace ops function. + * If the function does not handle recursion, needs to be RCU safe, + * or does per cpu logic, then we need to call the assist handler. */ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) - return ftrace_ops_recurs_func; + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || + ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU)) + return ftrace_ops_assist_func; return ops->func; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c6045a27..95181e368 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old, /* * rb_tail_page_update - move the tail page forward - * - * Returns 1 if moved tail page, 0 if someone else did. */ -static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, +static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { - struct buffer_page *old_tail; unsigned long old_entries; unsigned long old_write; - int ret = 0; /* * The tail page now needs to be moved forward. @@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, * it is, then it is up to us to update the tail * pointer. 
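
rb_tail_page_update() can now discard the cmpxchg() result because every racer wants the same outcome. A minimal generic sketch of the idiom (page_cursor and advance_tail() are hypothetical stand-ins, not ring-buffer code):

#include <linux/atomic.h>

struct page_cursor {
	void *tail;
};

static void advance_tail(struct page_cursor *c, void *old_page, void *new_page)
{
	/* The writer and any interrupt that preempted it race to move
	 * tail from old_page to new_page. Exactly one cmpxchg() wins,
	 * but since both supply the same new value, the loser needs no
	 * retry and the return value can be ignored. */
	(void)cmpxchg(&c->tail, old_page, new_page);
}
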
*/ - if (tail_page == cpu_buffer->tail_page) { + if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { /* Zero the write counter */ unsigned long val = old_write & ~RB_WRITE_MASK; unsigned long eval = old_entries & ~RB_WRITE_MASK; @@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - old_tail = cmpxchg(&cpu_buffer->tail_page, - tail_page, next_page); - - if (old_tail == tail_page) - ret = 1; + /* Again, either we update tail_page or an interrupt does */ + (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); } - - return ret; } static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, @@ -2036,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the tail page would have moved. */ if (ret == RB_PAGE_NORMAL) { + struct buffer_page *buffer_tail_page; + + buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); /* * If the tail had moved passed next, then we need * to reset the pointer. */ - if (cpu_buffer->tail_page != tail_page && - cpu_buffer->tail_page != next_page) + if (buffer_tail_page != tail_page && + buffer_tail_page != next_page) rb_head_page_set_normal(cpu_buffer, new_head, next_page, RB_PAGE_HEAD); @@ -2135,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + /* * This is the slow path, force gcc not to inline it. */ @@ -2147,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; - u64 ts; next_page = tail_page; @@ -2221,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, } } - ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); - if (ret) { - /* - * Nested commits always have zero deltas, so - * just reread the time stamp - */ - ts = rb_time_stamp(buffer); - next_page->page->time_stamp = ts; - } + rb_tail_page_update(cpu_buffer, tail_page, next_page); out_again: rb_reset_tail(cpu_buffer, tail, info); + /* Commit what we have for now. 
*/ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2362,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, addr = (unsigned long)event; addr &= PAGE_MASK; - bpage = cpu_buffer->tail_page; + bpage = READ_ONCE(cpu_buffer->tail_page); if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = @@ -2410,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) again: max_count = cpu_buffer->nr_pages * 100; - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) return; if (RB_WARN_ON(cpu_buffer, @@ -2419,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; + /* Only update the write stamp if the page has an event */ + if (rb_page_write(cpu_buffer->commit_page)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -2443,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * and pushed the tail page forward, we will be left with * a dangling commit that will never go forward. */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) goto again; } @@ -2699,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - tail_page = info->tail_page = cpu_buffer->tail_page; + /* Don't let the compiler play games with cpu_buffer->tail_page */ + tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 919d9d076..8414fa40b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -363,8 +363,8 @@ struct trace_option_dentry { * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer - * @start: called when tracing is unpaused (echo 1 > tracing_enabled) - * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @start: called when tracing is unpaused (echo 1 > tracing_on) + * @stop: called when tracing is paused (echo 0 > tracing_on) * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened @@ -467,8 +467,6 @@ enum { TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, - TRACE_CONTROL_BIT, - TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cc9f7a931..00df25fd8 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; return register_ftrace_function(ops); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d202d991e..05ddc0820 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1343,15 +1343,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); mutex_lock(&event_mutex); file = event_file_data(filp); @@ -1359,7 +1353,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; @@ -1510,18 +1504,12 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); err = apply_subsystem_event_filter(dir, buf); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 42a4009fd..b38f617b6 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -237,28 +237,23 @@ static ssize_t event_trigger_regex_write(struct file *file, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long)buf); - return -EFAULT; - } - buf[cnt] = '\0'; strim(buf); mutex_lock(&event_mutex); event_file = event_file_data(file); if (unlikely(!event_file)) { mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); return -ENODEV; } ret = trigger_process_regex(event_file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); if (ret < 0) goto out; @@ -543,11 +538,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -575,8 +571,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } @@ -1319,11 +1315,12 @@ static int 
event_enable_register_trigger(char *glob, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -1344,8 +1341,8 @@ static void event_enable_unregister_trigger(char *glob, (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 202df6cff..2a1abbaca 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -156,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack) for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (*p == stack_dump_trace[i]) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49e3..f8e26ab96 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; + /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; + do_div(stats->coremem, 1000 * KB); + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; + do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { - if (likely(tsk->mm)) { - cputime_t time, dtime; - struct timeval value; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = stime + utime; - dtime = time - tsk->acct_timexpd; - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - delta = value.tv_sec; - delta = delta * USEC_PER_SEC + value.tv_usec; - - if (delta == 0) - goto out; - tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - out: - local_irq_restore(flags); - } + cputime_t time, dtime; + u64 delta; + + if (!likely(tsk->mm)) + return; + + time = stime + utime; + dtime = time - tsk->acct_timexpd; + /* Avoid division: cputime_t is often in nanoseconds already. */ + delta = cputime_to_nsecs(dtime); + + if (delta < TICK_NSEC) + return; + + tsk->acct_timexpd = time; + /* + * Divide by 1024 to avoid overflow, and to avoid division. + * The final unit reported to userspace is Mbyte-usecs, + * the rest of the math is done in xacct_add_tsk. 
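
A unit check of the new accounting math, derived from the __acct_update_integrals() and xacct_add_tsk() hunks above:

/*
 * Accumulation (per sample, in __acct_update_integrals):
 *   acct_rss_mem1 += delta_ns * rss_pages >> 10     [page * ns / 1024]
 *
 * Reporting (in xacct_add_tsk):
 *   * PAGE_SIZE    -> byte * ns / 1024
 *   / (1000 * KB)  -> byte * ns / (1024 * 1000 * 1024)
 *                  =  (byte / 2^20) * (ns / 1000)
 *                  =  Mbyte * usec
 *
 * The >>10 at accumulation time keeps the 64-bit counters from
 * overflowing on long-lived tasks and moves the costly division to the
 * rarely-executed read side.
 */
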
+ */ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; } /** @@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk, void acct_update_integrals(struct task_struct *tsk) { cputime_t utime, stime; + unsigned long flags; + local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); + local_irq_restore(flags); } /** diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 88fefa68c..9bafc2119 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -602,8 +602,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent *extent = NULL; - unsigned long page = 0; - char *kbuf, *pos, *next_line; + char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; /* @@ -638,23 +637,18 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; - /* Get a buffer */ - ret = -ENOMEM; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!page) - goto out; - /* Only allow < page size writes at the beginning of the file */ ret = -EINVAL; if ((*ppos != 0) || (count >= PAGE_SIZE)) goto out; /* Slurp in the user data */ - ret = -EFAULT; - if (copy_from_user(kbuf, buf, count)) + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) { + ret = PTR_ERR(kbuf); + kbuf = NULL; goto out; - kbuf[count] = '\0'; + } /* Parse the user data */ ret = -EINVAL; @@ -756,8 +750,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = count; out: mutex_unlock(&userns_state_mutex); - if (page) - free_page(page); + kfree(kbuf); return ret; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18f34cf75..b3ace6ebb 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -20,6 +20,7 @@ #include <linux/smpboot.h> #include <linux/sched/rt.h> #include <linux/tick.h> +#include <linux/workqueue.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -225,7 +226,15 @@ static void __touch_watchdog(void) __this_cpu_write(watchdog_touch_ts, get_timestamp()); } -void touch_softlockup_watchdog(void) +/** + * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls + * + * Call when the scheduler may have stalled for legitimate reasons + * preventing the watchdog task from executing - e.g. the scheduler + * entering idle state. This should only be used for scheduler events. + * Use touch_softlockup_watchdog() for everything else. + */ +void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. 
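
Several hunks in this patch (blk_msg_write(), event_filter_write(), subsystem_filter_write(), event_trigger_regex_write(), map_write()) replace the open-coded allocate/copy/NUL-terminate sequence with memdup_user_nul(); since the buffer now comes from kmalloc(), the matching release switches from free_page() to kfree(). A minimal sketch of the idiom, with my_write() as a hypothetical handler:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/string.h>

static ssize_t my_write(struct file *filp, const char __user *ubuf,
			size_t cnt, loff_t *ppos)
{
	char *buf;

	if (cnt >= PAGE_SIZE)
		return -EINVAL;

	buf = memdup_user_nul(ubuf, cnt); /* kmalloc + copy + '\0' in one call */
	if (IS_ERR(buf))
		return PTR_ERR(buf);      /* -ENOMEM or -EFAULT */

	/* ... parse buf as a NUL-terminated string ... */

	kfree(buf);                       /* plain kfree(), not free_page() */
	return cnt;
}
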
It doesn't matter which CPU's timestamp @@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void) */ raw_cpu_write(watchdog_touch_ts, 0); } + +void touch_softlockup_watchdog(void) +{ + touch_softlockup_watchdog_sched(); + wq_watchdog_touch(raw_smp_processor_id()); +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void) */ for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; + wq_watchdog_touch(-1); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -351,7 +367,7 @@ static void watchdog_overflow_callback(struct perf_event *event, trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) - panic("Hard LOCKUP"); + nmi_panic(regs, "Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 450c21fd0..7ff5dc7d2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* X: flags */ + unsigned long watchdog_ts; /* L: watchdog timestamp */ + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -299,7 +301,23 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ +/* PL: allowable cpus for unbound wqs and work items */ +static cpumask_var_t wq_unbound_cpumask; + +/* CPU where unbound work was last round robin scheduled from this CPU */ +static DEFINE_PER_CPU(int, wq_rr_cpu_last); + +/* + * Local execution of unbound work items is no longer guaranteed. The + * following always forces round-robin CPU selection on unbound work items + * to uncover usages which depend on it. + */ +#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU +static bool wq_debug_force_rr_cpu = true; +#else +static bool wq_debug_force_rr_cpu = false; +#endif +module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], @@ -1093,6 +1111,8 @@ static void pwq_activate_delayed_work(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); + if (list_empty(&pwq->pool->worklist)) + pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; @@ -1304,6 +1324,39 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } +/* + * When queueing an unbound work item to a wq, prefer local CPU if allowed + * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to + * avoid perturbing sensitive tasks. 
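
A caller-side view of the new unbound CPU selection (my_unbound_wq and my_work are hypothetical; queue_work_on() and WORK_CPU_UNBOUND are the real APIs). CONFIG_DEBUG_WQ_FORCE_RR_CPU, or the workqueue.debug_force_rr_cpu module parameter, forces the round-robin path to flush out code that silently depends on local execution:

#include <linux/workqueue.h>

static struct workqueue_struct *my_unbound_wq;	/* hypothetical */
static struct work_struct my_work;		/* hypothetical */

static void kick_unbound_work(void)
{
	/* With WORK_CPU_UNBOUND the executing CPU is chosen by
	 * wq_select_unbound_cpu(): the local CPU when wq_unbound_cpumask
	 * allows it, otherwise round-robin over the allowed online CPUs. */
	queue_work_on(WORK_CPU_UNBOUND, my_unbound_wq, &my_work);
}
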
+ */ +static int wq_select_unbound_cpu(int cpu) +{ + static bool printed_dbg_warning; + int new_cpu; + + if (likely(!wq_debug_force_rr_cpu)) { + if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) + return cpu; + } else if (!printed_dbg_warning) { + pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); + printed_dbg_warning = true; + } + + if (cpumask_empty(wq_unbound_cpumask)) + return cpu; + + new_cpu = __this_cpu_read(wq_rr_cpu_last); + new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) { + new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; + } + __this_cpu_write(wq_rr_cpu_last, new_cpu); + + return new_cpu; +} + static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -1329,7 +1382,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; retry: if (req_cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) @@ -1395,6 +1448,8 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; worklist = &pwq->pool->worklist; + if (list_empty(worklist)) + pwq->pool->watchdog_ts = jiffies; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; @@ -2167,6 +2222,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); + pool->watchdog_ts = jiffies; + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -2250,6 +2307,7 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; + bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2266,9 +2324,14 @@ repeat: * process'em. */ WARN_ON_ONCE(!list_empty(scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_pwq(work) == pwq) + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + if (first) + pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + } + first = false; + } if (!list_empty(scheduled)) { process_scheduled_works(rescuer); @@ -2326,6 +2389,38 @@ repeat: goto repeat; } +/** + * check_flush_dependency - check for flush dependency sanity + * @target_wq: workqueue being flushed + * @target_work: work item being flushed (NULL for workqueue flushes) + * + * %current is trying to flush the whole @target_wq or @target_work on it. + * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not + * reclaiming memory or running on a workqueue which doesn't have + * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to + * a deadlock. + */ +static void check_flush_dependency(struct workqueue_struct *target_wq, + struct work_struct *target_work) +{ + work_func_t target_func = target_work ? 
target_work->func : NULL; + struct worker *worker; + + if (target_wq->flags & WQ_MEM_RECLAIM) + return; + + worker = current_wq_worker(); + + WARN_ONCE(current->flags & PF_MEMALLOC, + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + current->pid, current->comm, target_wq->name, target_func); + WARN_ONCE(worker && ((worker->current_pwq->wq->flags & + (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), + "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + worker->current_pwq->wq->name, worker->current_func, + target_wq->name, target_func); +} + struct wq_barrier { struct work_struct work; struct completion done; @@ -2535,6 +2630,8 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } + check_flush_dependency(wq, NULL); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2707,6 +2804,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) pwq = worker->current_pwq; } + check_flush_dependency(pwq->wq, work); + insert_wq_barrier(pwq, barr, work, worker); spin_unlock_irq(&pool->lock); @@ -3079,6 +3178,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; + pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -3611,7 +3711,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; - int ret = -ENOMEM; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3622,16 +3721,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs); + if (!ctx) + return -ENOMEM; /* the ctx has been prepared successfully, let's commit it */ - if (ctx) { - apply_wqattrs_commit(ctx); - ret = 0; - } - + apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); - return ret; + return 0; } /** @@ -4318,7 +4415,9 @@ void show_workqueue_state(void) pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" workers=%d", pool->nr_workers); + pr_cont(" hung=%us workers=%d", + jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, + pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); @@ -5177,6 +5276,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ +/* + * Workqueue watchdog. + * + * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal + * flush dependency, a concurrency managed work item which stays RUNNING + * indefinitely. Workqueue stalls can be very difficult to debug as the + * usual warning mechanisms don't trigger and internal workqueue state is + * largely opaque. + * + * Workqueue watchdog monitors all worker pools periodically and dumps + * state if some pools failed to make forward progress for a while where + * forward progress is defined as the first item on ->worklist changing. + * + * This mechanism is controlled through the kernel parameter + * "workqueue.watchdog_thresh" which can be updated at runtime through the + * corresponding sysfs parameter file. 
+ */ +#ifdef CONFIG_WQ_WATCHDOG + +static void wq_watchdog_timer_fn(unsigned long data); + +static unsigned long wq_watchdog_thresh = 30; +static struct timer_list wq_watchdog_timer = + TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); + +static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; + +static void wq_watchdog_reset_touched(void) +{ + int cpu; + + wq_watchdog_touched = jiffies; + for_each_possible_cpu(cpu) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; +} + +static void wq_watchdog_timer_fn(unsigned long data) +{ + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + bool lockup_detected = false; + struct worker_pool *pool; + int pi; + + if (!thresh) + return; + + rcu_read_lock(); + + for_each_pool(pool, pi) { + unsigned long pool_ts, touched, ts; + + if (list_empty(&pool->worklist)) + continue; + + /* get the latest of pool and touched timestamps */ + pool_ts = READ_ONCE(pool->watchdog_ts); + touched = READ_ONCE(wq_watchdog_touched); + + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + + if (pool->cpu >= 0) { + unsigned long cpu_touched = + READ_ONCE(per_cpu(wq_watchdog_touched_cpu, + pool->cpu)); + if (time_after(cpu_touched, ts)) + ts = cpu_touched; + } + + /* did we stall? */ + if (time_after(jiffies, ts + thresh)) { + lockup_detected = true; + pr_emerg("BUG: workqueue lockup - pool"); + pr_cont_pool_info(pool); + pr_cont(" stuck for %us!\n", + jiffies_to_msecs(jiffies - pool_ts) / 1000); + } + } + + rcu_read_unlock(); + + if (lockup_detected) + show_workqueue_state(); + + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh); +} + +void wq_watchdog_touch(int cpu) +{ + if (cpu >= 0) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + else + wq_watchdog_touched = jiffies; +} + +static void wq_watchdog_set_thresh(unsigned long thresh) +{ + wq_watchdog_thresh = 0; + del_timer_sync(&wq_watchdog_timer); + + if (thresh) { + wq_watchdog_thresh = thresh; + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); + } +} + +static int wq_watchdog_param_set_thresh(const char *val, + const struct kernel_param *kp) +{ + unsigned long thresh; + int ret; + + ret = kstrtoul(val, 0, &thresh); + if (ret) + return ret; + + if (system_wq) + wq_watchdog_set_thresh(thresh); + else + wq_watchdog_thresh = thresh; + + return 0; +} + +static const struct kernel_param_ops wq_watchdog_thresh_ops = { + .set = wq_watchdog_param_set_thresh, + .get = param_get_ulong, +}; + +module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, + 0644); + +static void wq_watchdog_init(void) +{ + wq_watchdog_set_thresh(wq_watchdog_thresh); +} + +#else /* CONFIG_WQ_WATCHDOG */ + +static inline void wq_watchdog_init(void) { } + +#endif /* CONFIG_WQ_WATCHDOG */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; @@ -5300,6 +5547,9 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + wq_watchdog_init(); + return 0; } early_initcall(init_workqueues);
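
Condensing the watchdog's stall test from wq_watchdog_timer_fn() above (a reading aid, not kernel code); per the doc comment, the threshold is settable at boot via workqueue.watchdog_thresh=<seconds> and at runtime through the module parameter file registered by module_param_cb():

/*
 * For each pool with pending work, take the most recent of:
 *   pool->watchdog_ts         - first worklist entry last changed
 *   wq_watchdog_touched       - global touch (wq_watchdog_touch(-1))
 *   wq_watchdog_touched_cpu   - per-CPU touch, for bound pools only
 * and report a lockup when:
 *   time_after(jiffies, ts + wq_watchdog_thresh * HZ)
 */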